From c095329f5ab531869ccda152f7493d65c3c48c7b Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 7 Jun 2021 13:36:07 -0700 Subject: [PATCH 001/392] Add Algorithm SCAN --- src/CMakeLists.txt | 2 + src/algorithm/CMakeLists.txt | 7 ++- src/algorithm/SCAN-Cuda.cpp | 70 +++++++++++++++++++++++++++ src/algorithm/SCAN-Hip.cpp | 70 +++++++++++++++++++++++++++ src/algorithm/SCAN-OMP.cpp | 56 ++++++++++++++++++++++ src/algorithm/SCAN-Seq.cpp | 91 ++++++++++++++++++++++++++++++++++++ src/algorithm/SCAN.cpp | 61 ++++++++++++++++++++++++ src/algorithm/SCAN.hpp | 75 +++++++++++++++++++++++++++++ src/common/RAJAPerfSuite.cpp | 6 +++ src/common/RAJAPerfSuite.hpp | 1 + 10 files changed, 438 insertions(+), 1 deletion(-) create mode 100644 src/algorithm/SCAN-Cuda.cpp create mode 100644 src/algorithm/SCAN-Hip.cpp create mode 100644 src/algorithm/SCAN-OMP.cpp create mode 100644 src/algorithm/SCAN-Seq.cpp create mode 100644 src/algorithm/SCAN.cpp create mode 100644 src/algorithm/SCAN.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 47073d63e..a767617b0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -188,6 +188,8 @@ blt_add_executable( common/RAJAPerfSuite.cpp common/RPTypes.hpp common/RunParams.cpp + algorithm/SCAN.cpp + algorithm/SCAN-Seq.cpp algorithm/SORT.cpp algorithm/SORT-Seq.cpp algorithm/SORTPAIRS.cpp diff --git a/src/algorithm/CMakeLists.txt b/src/algorithm/CMakeLists.txt index 067e40e51..312662a14 100644 --- a/src/algorithm/CMakeLists.txt +++ b/src/algorithm/CMakeLists.txt @@ -8,7 +8,12 @@ blt_add_library( NAME algorithm - SOURCES SORT.cpp + SOURCES SCAN.cpp + SCAN-Seq.cpp + SCAN-Hip.cpp + SCAN-Cuda.cpp + SCAN-OMP.cpp + SORT.cpp SORT-Seq.cpp SORT-Hip.cpp SORT-Cuda.cpp diff --git a/src/algorithm/SCAN-Cuda.cpp b/src/algorithm/SCAN-Cuda.cpp new file mode 100644 index 000000000..01f888b51 --- /dev/null +++ b/src/algorithm/SCAN-Cuda.cpp @@ -0,0 +1,70 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "SCAN.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace algorithm +{ + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#define SCAN_DATA_SETUP_CUDA \ + allocAndInitCudaDeviceData(x, m_x, iend); \ + allocAndInitCudaDeviceData(y, m_y, iend); + +#define SCAN_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_y, y, iend); \ + deallocCudaDeviceData(x); \ + deallocCudaDeviceData(y); + + +void SCAN::runCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + SCAN_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + + SCAN_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::exclusive_scan< RAJA::cuda_exec >(RAJA_SCAN_ARGS); + + } + stopTimer(); + + SCAN_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n SCAN : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace algorithm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/algorithm/SCAN-Hip.cpp b/src/algorithm/SCAN-Hip.cpp new file mode 100644 index 000000000..47a9789af --- /dev/null +++ b/src/algorithm/SCAN-Hip.cpp @@ -0,0 +1,70 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "SCAN.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace algorithm +{ + + // + // Define thread block size for HIP execution + // + const size_t block_size = 256; + + +#define SCAN_DATA_SETUP_HIP \ + allocAndInitHipDeviceData(x, m_x, iend*run_reps); \ + allocAndInitHipDeviceData(y, m_y, iend*run_reps); + +#define SCAN_DATA_TEARDOWN_HIP \ + getHipDeviceData(m_y, y, iend*run_reps); \ + deallocHipDeviceData(x); \ + deallocHipDeviceData(y); + + +void SCAN::runHipVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + SCAN_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + SCAN_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::exclusive_scan< RAJA::hip_exec >(RAJA_SCAN_ARGS); + + } + stopTimer(); + + SCAN_DATA_TEARDOWN_HIP; + + } else { + std::cout << "\n SCAN : Unknown Hip variant id = " << vid << std::endl; + } +} + +} // end namespace algorithm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/algorithm/SCAN-OMP.cpp b/src/algorithm/SCAN-OMP.cpp new file mode 100644 index 000000000..b93e40096 --- /dev/null +++ b/src/algorithm/SCAN-OMP.cpp @@ -0,0 +1,56 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "SCAN.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace algorithm +{ + + +void SCAN::runOpenMPVariant(VariantID vid) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + SCAN_DATA_SETUP; + + switch ( vid ) { + + case RAJA_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::exclusive_scan(RAJA_SCAN_ARGS); + + } + stopTimer(); + + break; + } + + default : { + std::cout << "\n SCAN : Unknown variant id = " << vid << std::endl; + } + + } + +#endif +} + +} // end namespace algorithm +} // end namespace rajaperf diff --git a/src/algorithm/SCAN-Seq.cpp b/src/algorithm/SCAN-Seq.cpp new file mode 100644 index 000000000..cbb6e89cd --- /dev/null +++ b/src/algorithm/SCAN-Seq.cpp @@ -0,0 +1,91 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "SCAN.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace algorithm +{ + + +void SCAN::runSeqVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + SCAN_DATA_SETUP; + + auto scan_lam = [=](Index_type i) { + SCAN_BODY; + }; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + SCAN_PROLOGUE; + for (Index_type i = ibegin+1; i < iend; ++i ) { + SCAN_BODY; + } + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + SCAN_PROLOGUE; + for (Index_type i = ibegin+1; i < iend; ++i ) { + scan_lam(i); + } + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::exclusive_scan(RAJA_SCAN_ARGS); + + } + stopTimer(); + + break; + } +#endif + + default : { + std::cout << "\n SCAN : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace algorithm +} // end namespace rajaperf diff --git a/src/algorithm/SCAN.cpp b/src/algorithm/SCAN.cpp new file mode 100644 index 000000000..037cca475 --- /dev/null +++ b/src/algorithm/SCAN.cpp @@ -0,0 +1,61 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "SCAN.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +namespace rajaperf +{ +namespace algorithm +{ + + +SCAN::SCAN(const RunParams& params) + : KernelBase(rajaperf::Algorithm_SCAN, params) +{ + setDefaultSize(100000); + setDefaultReps(50); + + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); + + setVariantDefined( RAJA_OpenMP ); + + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( RAJA_HIP ); +} + +SCAN::~SCAN() +{ +} + +void SCAN::setUp(VariantID vid) +{ + allocAndInitDataRandValue(m_x, getRunSize(), vid); + allocAndInitDataConst(m_y, getRunSize(), 0.0, vid); +} + +void SCAN::updateChecksum(VariantID vid) +{ + checksum[vid] += calcChecksum(m_y, getRunSize()); +} + +void SCAN::tearDown(VariantID vid) +{ + (void) vid; + deallocData(m_x); + deallocData(m_y); +} + +} // end namespace algorithm +} // end namespace rajaperf diff --git a/src/algorithm/SCAN.hpp b/src/algorithm/SCAN.hpp new file mode 100644 index 000000000..b1d1c53fa --- /dev/null +++ b/src/algorithm/SCAN.hpp @@ -0,0 +1,75 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// SCAN kernel reference implementation: +/// +/// // exclusive scan +/// y[ibegin] = 0; +/// for (Index_type i = ibegin+1; i < iend; ++i) { +/// y[i] = y[i-1] + x[i-1]; +/// } +/// + +#ifndef RAJAPerf_Algorithm_SCAN_HPP +#define RAJAPerf_Algorithm_SCAN_HPP + +#define SCAN_DATA_SETUP \ + Real_ptr x = m_x; \ + Real_ptr y = m_y; + +#define SCAN_PROLOGUE \ + y[ibegin] = 0.0; + +#define SCAN_BODY \ + y[i] = y[i-1] + x[i-1]; + +#define RAJA_SCAN_ARGS \ + RAJA::make_span(x + ibegin, iend - ibegin), \ + RAJA::make_span(y + ibegin, iend - ibegin) + + +#include "common/KernelBase.hpp" + +namespace rajaperf +{ +class RunParams; + +namespace algorithm +{ + +class SCAN : public KernelBase +{ +public: + + SCAN(const RunParams& params); + + ~SCAN(); + + void setUp(VariantID vid); + void updateChecksum(VariantID vid); + void tearDown(VariantID vid); + + void runSeqVariant(VariantID vid); + void runOpenMPVariant(VariantID vid); + void runCudaVariant(VariantID vid); + void runHipVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid) + { + std::cout << "\n SCAN : Unknown OMP Target variant id = " << vid << std::endl; + } + +private: + Real_ptr m_x; + Real_ptr m_y; +}; + +} // end namespace algorithm +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 7b0b02a04..10eb7c29b 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -83,6 +83,7 @@ // // Algorithm kernels... // +#include "algorithm/SCAN.hpp" #include "algorithm/SORT.hpp" #include "algorithm/SORTPAIRS.hpp" @@ -206,6 +207,7 @@ static const std::string KernelNames [] = // // Algorithm kernels... // + std::string("Algorithm_SCAN"), std::string("Algorithm_SORT"), std::string("Algorithm_SORTPAIRS"), @@ -599,6 +601,10 @@ KernelBase* getKernelObject(KernelID kid, // // Algorithm kernels... 
// + case Algorithm_SCAN: { + kernel = new algorithm::SCAN(run_params); + break; + } case Algorithm_SORT: { kernel = new algorithm::SORT(run_params); break; diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 15551bb9d..d6609e460 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -169,6 +169,7 @@ enum KernelID { // // Algorithm kernels... // + Algorithm_SCAN, Algorithm_SORT, Algorithm_SORTPAIRS, From 87fd2cb37aba0b65903998430d1cdd5c475272fb Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 7 Jun 2021 15:59:49 -0700 Subject: [PATCH 002/392] Add more options to allocate data and checksum --- src/common/DataUtils.cpp | 92 ++++++++++++++++++++++++++-------------- src/common/DataUtils.hpp | 35 +++++++++------ 2 files changed, 82 insertions(+), 45 deletions(-) diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index a5be0b5ca..3bb17a8f4 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -40,8 +40,7 @@ void incDataInitCount() */ void allocAndInitData(Int_ptr& ptr, int len, VariantID vid) { - // Should we do this differently for alignment?? If so, change dealloc() - ptr = new Int_type[len]; + allocData(ptr, len); initData(ptr, len, vid); } @@ -50,44 +49,56 @@ void allocAndInitData(Int_ptr& ptr, int len, VariantID vid) */ void allocAndInitData(Real_ptr& ptr, int len, VariantID vid ) { - ptr = - RAJA::allocate_aligned_type(RAJA::DATA_ALIGN, - len*sizeof(Real_type)); + allocData(ptr, len); initData(ptr, len, vid); } void allocAndInitDataConst(Real_ptr& ptr, int len, Real_type val, VariantID vid) { - (void) vid; - - ptr = - RAJA::allocate_aligned_type(RAJA::DATA_ALIGN, - len*sizeof(Real_type)); + allocData(ptr, len); initDataConst(ptr, len, val, vid); } void allocAndInitDataRandSign(Real_ptr& ptr, int len, VariantID vid) { - ptr = - RAJA::allocate_aligned_type(RAJA::DATA_ALIGN, - len*sizeof(Real_type)); + allocData(ptr, len); initDataRandSign(ptr, len, vid); } void allocAndInitDataRandValue(Real_ptr& ptr, int len, VariantID vid) +{ + allocData(ptr, len); + initDataRandValue(ptr, len, vid); +} + +void allocAndInitData(Complex_ptr& ptr, int len, VariantID vid) +{ + allocData(ptr, len); + initData(ptr, len, vid); +} + + +/* + * Allocate data arrays of given type. + */ +void allocData(Int_ptr& ptr, int len) +{ + // Should we do this differently for alignment?? If so, change dealloc() + ptr = new Int_type[len]; +} + +void allocData(Real_ptr& ptr, int len) { ptr = RAJA::allocate_aligned_type(RAJA::DATA_ALIGN, len*sizeof(Real_type)); - initDataRandValue(ptr, len, vid); } -void allocAndInitData(Complex_ptr& ptr, int len, VariantID vid) +void allocData(Complex_ptr& ptr, int len) { // Should we do this differently for alignment?? If so, change dealloc() ptr = new Complex_type[len]; - initData(ptr, len, vid); } @@ -95,7 +106,7 @@ void allocAndInitData(Complex_ptr& ptr, int len, VariantID vid) * Free data arrays of given type. */ void deallocData(Int_ptr& ptr) -{ +{ if (ptr) { delete [] ptr; ptr = 0; @@ -103,7 +114,7 @@ void deallocData(Int_ptr& ptr) } void deallocData(Real_ptr& ptr) -{ +{ if (ptr) { RAJA::free_aligned(ptr); ptr = 0; @@ -112,7 +123,7 @@ void deallocData(Real_ptr& ptr) void deallocData(Complex_ptr& ptr) { - if (ptr) { + if (ptr) { delete [] ptr; ptr = 0; } @@ -120,7 +131,7 @@ void deallocData(Complex_ptr& ptr) /* - * \brief Initialize Int_type data array to + * \brief Initialize Int_type data array to * randomly signed positive and negative values. 
*/ void initData(Int_ptr& ptr, int len, VariantID vid) @@ -148,11 +159,11 @@ void initData(Int_ptr& ptr, int len, VariantID vid) ptr[i] = ( signfact < 0.5 ? -1 : 1 ); }; - signfact = Real_type(rand())/RAND_MAX; + signfact = Real_type(rand())/RAND_MAX; Int_type ilo = len * signfact; ptr[ilo] = -58; - signfact = Real_type(rand())/RAND_MAX; + signfact = Real_type(rand())/RAND_MAX; Int_type ihi = len * signfact; ptr[ihi] = 19; @@ -160,11 +171,11 @@ void initData(Int_ptr& ptr, int len, VariantID vid) } /* - * Initialize Real_type data array to non-random - * positive values (0.0, 1.0) based on their array position + * Initialize Real_type data array to non-random + * positive values (0.0, 1.0) based on their array position * (index) and the order in which this method is called. */ -void initData(Real_ptr& ptr, int len, VariantID vid) +void initData(Real_ptr& ptr, int len, VariantID vid) { (void) vid; @@ -172,19 +183,19 @@ void initData(Real_ptr& ptr, int len, VariantID vid) // first touch... #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - if ( vid == Base_OpenMP || + if ( vid == Base_OpenMP || vid == Lambda_OpenMP || vid == RAJA_OpenMP ) { #pragma omp parallel for - for (int i = 0; i < len; ++i) { + for (int i = 0; i < len; ++i) { ptr[i] = factor*(i + 1.1)/(i + 1.12345); }; - } + } #endif for (int i = 0; i < len; ++i) { ptr[i] = factor*(i + 1.1)/(i + 1.12345); - } + } incDataInitCount(); } @@ -193,7 +204,7 @@ void initData(Real_ptr& ptr, int len, VariantID vid) * Initialize Real_type data array to constant values. */ void initDataConst(Real_ptr& ptr, int len, Real_type val, - VariantID vid) + VariantID vid) { // first touch... @@ -289,10 +300,10 @@ void initData(Complex_ptr& ptr, int len, VariantID vid) #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) if ( vid == Base_OpenMP || - vid == Lambda_OpenMP || + vid == Lambda_OpenMP || vid == RAJA_OpenMP ) { #pragma omp parallel for - for (int i = 0; i < len; ++i) { + for (int i = 0; i < len; ++i) { ptr[i] = factor*(i + 1.1)/(i + 1.12345); }; } @@ -322,7 +333,22 @@ void initData(Real_type& d, VariantID vid) /* * Calculate and return checksum for data arrays. */ -long double calcChecksum(const Real_ptr ptr, int len, +long double calcChecksum(const Int_ptr ptr, int len, + Real_type scale_factor) +{ + long double tchk = 0.0; + for (Index_type j = 0; j < len; ++j) { + tchk += (j+1)*ptr[j]*scale_factor; +#if 0 // RDH DEBUG + if ( (j % 100) == 0 ) { + std::cout << "j : tchk = " << j << " : " << tchk << std::endl; + } +#endif + } + return tchk; +} + +long double calcChecksum(const Real_ptr ptr, int len, Real_type scale_factor) { long double tchk = 0.0; diff --git a/src/common/DataUtils.hpp b/src/common/DataUtils.hpp index e52e99148..2e6b0d050 100644 --- a/src/common/DataUtils.hpp +++ b/src/common/DataUtils.hpp @@ -31,7 +31,7 @@ namespace rajaperf { - + /*! * Reset counter for data initialization. */ @@ -45,7 +45,7 @@ void incDataInitCount(); /*! * \brief Allocate and initialize Int_type data array. - * + * * Array is initialized using method initData(Int_ptr& ptr...) below. */ void allocAndInitData(Int_ptr& ptr, int len, @@ -61,8 +61,8 @@ void allocAndInitData(Real_ptr& ptr, int len, /*! * \brief Allocate and initialize aligned Real_type data array. - * - * Array entries are initialized using the method + * + * Array entries are initialized using the method * initDataConst(Real_ptr& ptr...) below. 
*/ void allocAndInitDataConst(Real_ptr& ptr, int len, Real_type val, @@ -91,6 +91,14 @@ void allocAndInitDataRandValue(Real_ptr& ptr, int len, void allocAndInitData(Complex_ptr& ptr, int len, VariantID vid = NumVariants); +/*! + * \brief Allocate data arrays. + */ +void allocData(Int_ptr& ptr, int len); +/// +void allocData(Real_ptr& ptr, int len); +/// +void allocData(Complex_ptr& ptr, int len); /*! * \brief Free data arrays. @@ -104,9 +112,9 @@ void deallocData(Complex_ptr& ptr); /*! * \brief Initialize Int_type data array. - * + * * Array entries are randomly initialized to +/-1. - * Then, two randomly-chosen entries are reset, one to + * Then, two randomly-chosen entries are reset, one to * a value > 1, one to a value < -1. */ void initData(Int_ptr& ptr, int len, @@ -132,8 +140,8 @@ void initDataConst(Real_ptr& ptr, int len, Real_type val, /*! * \brief Initialize Real_type data array with random sign. - * - * Array entries are initialized in the same way as the method + * + * Array entries are initialized in the same way as the method * initData(Real_ptr& ptr...) above, but with random sign. */ void initDataRandSign(Real_ptr& ptr, int len, @@ -150,7 +158,7 @@ void initDataRandValue(Real_ptr& ptr, int len, /*! * \brief Initialize Complex_type data array. * - * Real and imaginary array entries are initialized in the same way as the + * Real and imaginary array entries are initialized in the same way as the * method allocAndInitData(Real_ptr& ptr...) above. */ void initData(Complex_ptr& ptr, int len, @@ -159,7 +167,7 @@ void initData(Complex_ptr& ptr, int len, /*! * \brief Initialize Real_type scalar data. * - * Data is set similarly to an array enttry in the method + * Data is set similarly to an array enttry in the method * initData(Real_ptr& ptr...) above. */ void initData(Real_type& d, @@ -167,13 +175,16 @@ void initData(Real_type& d, /*! * \brief Calculate and return checksum for data arrays. - * + * * Checksums are computed as a weighted sum of array entries, * where weight is a simple function of elemtn index. * * Checksumn is multiplied by given scale factor. 
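 *
 * For example (an illustration of the weighting used in the DataUtils.cpp
 * loop above, not a new interface): with len = 3 the checksum of array d is
 *   (1*d[0] + 2*d[1] + 3*d[2]) * scale_factor.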
*/ -long double calcChecksum(Real_ptr d, int len, +long double calcChecksum(Int_ptr d, int len, + Real_type scale_factor = 1.0); +/// +long double calcChecksum(Real_ptr d, int len, Real_type scale_factor = 1.0); /// long double calcChecksum(Complex_ptr d, int len, From d672bda782e2a9a4fc298774172228eaae05e49c Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 7 Jun 2021 16:00:05 -0700 Subject: [PATCH 003/392] Fixup naming in INIT3 --- src/basic/REDUCE3_INT-Seq.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/basic/REDUCE3_INT-Seq.cpp b/src/basic/REDUCE3_INT-Seq.cpp index 475299506..91f76cee0 100644 --- a/src/basic/REDUCE3_INT-Seq.cpp +++ b/src/basic/REDUCE3_INT-Seq.cpp @@ -13,7 +13,7 @@ #include #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -55,9 +55,9 @@ void REDUCE3_INT::runSeqVariant(VariantID vid) #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { - auto init3_base_lam = [=](Index_type i) -> Int_type { - return vec[i]; - }; + auto reduce3_base_lam = [=](Index_type i) -> Int_type { + return vec[i]; + }; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -67,9 +67,9 @@ void REDUCE3_INT::runSeqVariant(VariantID vid) Int_type vmax = m_vmax_init; for (Index_type i = ibegin; i < iend; ++i ) { - vsum += init3_base_lam(i); - vmin = RAJA_MIN(vmin, init3_base_lam(i)); - vmax = RAJA_MAX(vmax, init3_base_lam(i)); + vsum += reduce3_base_lam(i); + vmin = RAJA_MIN(vmin, reduce3_base_lam(i)); + vmax = RAJA_MAX(vmax, reduce3_base_lam(i)); } m_vsum += vsum; From dd3b541fc5a424444287ca335af4c5da7ba7148e Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 7 Jun 2021 16:01:52 -0700 Subject: [PATCH 004/392] Add Basic INDEXLIST kernel INDEXLIST creates an index list by making a list of indices of items with negative values. 
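
A minimal sketch of the serial reference loop this kernel times, mirroring the
INDEXLIST_CONDITIONAL / INDEXLIST_BODY macros added in INDEXLIST.hpp below
(variable names follow that header):

    Index_type count = 0;
    for (Index_type i = ibegin; i < iend; ++i) {
      if (x[i] < 0.0) {        // INDEXLIST_CONDITIONAL
        list[count++] = i;     // record the index of each negative entry
      }
    }
    Index_type len = count;    // length of the generated index list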
--- src/CMakeLists.txt | 2 + src/basic/CMakeLists.txt | 5 ++ src/basic/INDEXLIST-Cuda.cpp | 93 +++++++++++++++++++++++++++++++ src/basic/INDEXLIST-Hip.cpp | 93 +++++++++++++++++++++++++++++++ src/basic/INDEXLIST-OMP.cpp | 86 +++++++++++++++++++++++++++++ src/basic/INDEXLIST-Seq.cpp | 104 +++++++++++++++++++++++++++++++++++ src/basic/INDEXLIST.cpp | 63 +++++++++++++++++++++ src/basic/INDEXLIST.hpp | 73 ++++++++++++++++++++++++ src/common/RAJAPerfSuite.cpp | 6 ++ src/common/RAJAPerfSuite.hpp | 1 + 10 files changed, 526 insertions(+) create mode 100644 src/basic/INDEXLIST-Cuda.cpp create mode 100644 src/basic/INDEXLIST-Hip.cpp create mode 100644 src/basic/INDEXLIST-OMP.cpp create mode 100644 src/basic/INDEXLIST-Seq.cpp create mode 100644 src/basic/INDEXLIST.cpp create mode 100644 src/basic/INDEXLIST.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a767617b0..8ac8dd8a0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -67,6 +67,8 @@ blt_add_executable( basic/IF_QUAD.cpp basic/IF_QUAD-Seq.cpp basic/IF_QUAD-OMPTarget.cpp + basic/INDEXLIST.cpp + basic/INDEXLIST-Seq.cpp basic/INIT3.cpp basic/INIT3-Seq.cpp basic/INIT3-OMPTarget.cpp diff --git a/src/basic/CMakeLists.txt b/src/basic/CMakeLists.txt index 15cbd38c7..b633d714a 100644 --- a/src/basic/CMakeLists.txt +++ b/src/basic/CMakeLists.txt @@ -20,6 +20,11 @@ blt_add_library( IF_QUAD-Cuda.cpp IF_QUAD-OMP.cpp IF_QUAD-OMPTarget.cpp + INDEXLIST.cpp + INDEXLIST-Seq.cpp + INDEXLIST-Hip.cpp + INDEXLIST-Cuda.cpp + INDEXLIST-OMP.cpp INIT3.cpp INIT3-Seq.cpp INIT3-Hip.cpp diff --git a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp new file mode 100644 index 000000000..17b5c8653 --- /dev/null +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -0,0 +1,93 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INDEXLIST.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#define INDEXLIST_DATA_SETUP_CUDA \ + Index_type* counts; \ + allocCudaDeviceData(counts, getRunSize()+1); \ + allocAndInitCudaDeviceData(x, m_x, iend); \ + allocAndInitCudaDeviceData(list, m_list, iend); + +#define INDEXLIST_DATA_TEARDOWN_CUDA \ + deallocCudaDeviceData(counts); \ + getCudaDeviceData(m_list, list, iend); \ + deallocCudaDeviceData(x); \ + deallocCudaDeviceData(list); + + +void INDEXLIST::runCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + INDEXLIST_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + + INDEXLIST_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum len(0); + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), + [=] __device__ (Index_type i) { + counts[i] = (INDEXLIST_CONDITIONAL) ? 
1 : 0; + }); + + RAJA::exclusive_scan_inplace< RAJA::cuda_exec >( + RAJA::make_span(counts+ibegin, iend+1-ibegin)); + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), + [=] __device__ (Index_type i) { + if (counts[i] != counts[i+1]) { + list[counts[i]] = i; + len += 1; + } + }); + + m_len = len.get(); + + } + stopTimer(); + + INDEXLIST_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n INDEXLIST : Unknown variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/basic/INDEXLIST-Hip.cpp b/src/basic/INDEXLIST-Hip.cpp new file mode 100644 index 000000000..c0c71fb8f --- /dev/null +++ b/src/basic/INDEXLIST-Hip.cpp @@ -0,0 +1,93 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INDEXLIST.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define thread block size for HIP execution + // + const size_t block_size = 256; + + +#define INDEXLIST_DATA_SETUP_HIP \ + Index_type* counts; \ + allocHipDeviceData(counts, getRunSize()+1); \ + allocAndInitHipDeviceData(x, m_x, iend); \ + allocAndInitHipDeviceData(list, m_list, iend); + +#define INDEXLIST_DATA_TEARDOWN_HIP \ + deallocHipDeviceData(counts); \ + getHipDeviceData(m_list, list, iend); \ + deallocHipDeviceData(x); \ + deallocHipDeviceData(list); + + +void INDEXLIST::runHipVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + INDEXLIST_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + INDEXLIST_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum len(0); + + RAJA::forall< RAJA::hip_exec >( + RAJA::RangeSegment(ibegin, iend), + [=] __device__ (Index_type i) { + counts[i] = (INDEXLIST_CONDITIONAL) ? 1 : 0; + }); + + RAJA::exclusive_scan_inplace< RAJA::hip_exec >( + RAJA::make_span(counts+ibegin, iend+1-ibegin)); + + RAJA::forall< RAJA::hip_exec >( + RAJA::RangeSegment(ibegin, iend), + [=] __device__ (Index_type i) { + if (counts[i] != counts[i+1]) { + list[counts[i]] = i; + len += 1; + } + }); + + m_len = len.get(); + + } + stopTimer(); + + INDEXLIST_DATA_TEARDOWN_HIP; + + } else { + std::cout << "\n INDEXLIST : Unknown variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/basic/INDEXLIST-OMP.cpp b/src/basic/INDEXLIST-OMP.cpp new file mode 100644 index 000000000..85f919ade --- /dev/null +++ b/src/basic/INDEXLIST-OMP.cpp @@ -0,0 +1,86 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INDEXLIST.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +#define INDEXLIST_DATA_SETUP_OMP \ + Index_type* counts = new Index_type[getRunSize()+1]; + +#define INDEXLIST_DATA_TEARDOWN_OMP \ + delete[] counts; counts = nullptr; + + +void INDEXLIST::runOpenMPVariant(VariantID vid) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + INDEXLIST_DATA_SETUP; + + switch ( vid ) { + + case RAJA_OpenMP : { + + INDEXLIST_DATA_SETUP_OMP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum len(0); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + counts[i] = (INDEXLIST_CONDITIONAL) ? 1 : 0; + }); + + RAJA::exclusive_scan_inplace( + RAJA::make_span(counts+ibegin, iend+1-ibegin)); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + if (counts[i] != counts[i+1]) { + list[counts[i]] = i; + len += 1; + } + }); + + m_len = len.get(); + + } + stopTimer(); + + INDEXLIST_DATA_TEARDOWN_OMP; + + break; + } + + default : { + std::cout << "\n INDEXLIST : Unknown variant id = " << vid << std::endl; + } + + } + +#endif +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/INDEXLIST-Seq.cpp b/src/basic/INDEXLIST-Seq.cpp new file mode 100644 index 000000000..90033d285 --- /dev/null +++ b/src/basic/INDEXLIST-Seq.cpp @@ -0,0 +1,104 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INDEXLIST.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + +void INDEXLIST::runSeqVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + INDEXLIST_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Index_type count = 0; + + for (Index_type i = ibegin; i < iend; ++i ) { + INDEXLIST_BODY; + } + + m_len = count; + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + auto indexlist_base_lam = [=](Index_type i, Index_type& count) { + INDEXLIST_BODY + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Index_type count = 0; + + for (Index_type i = ibegin; i < iend; ++i ) { + indexlist_base_lam(i, count); + } + + m_len = count; + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Index_type count = 0; + + RAJA::forall( RAJA::RangeSegment(ibegin, iend), + [=, &count](Index_type i) { + INDEXLIST_BODY; + }); + + m_len = count; + + } + stopTimer(); + + break; + } +#endif + + default : { + std::cout << "\n INDEXLIST : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/INDEXLIST.cpp b/src/basic/INDEXLIST.cpp new file mode 100644 index 000000000..a2918a82d --- /dev/null +++ b/src/basic/INDEXLIST.cpp @@ -0,0 +1,63 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INDEXLIST.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +namespace rajaperf +{ +namespace basic +{ + + +INDEXLIST::INDEXLIST(const RunParams& params) + : KernelBase(rajaperf::Basic_INDEXLIST, params) +{ + setDefaultSize(100000); + setDefaultReps(1000); + + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); + + setVariantDefined( RAJA_OpenMP ); + + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( RAJA_HIP ); +} + +INDEXLIST::~INDEXLIST() +{ +} + +void INDEXLIST::setUp(VariantID vid) +{ + allocAndInitDataRandSign(m_x, getRunSize(), vid); + allocData(m_list, getRunSize()); + m_len = -1; +} + +void INDEXLIST::updateChecksum(VariantID vid) +{ + checksum[vid] += calcChecksum(m_list, getRunSize()); + checksum[vid] += Checksum_type(m_len); +} + +void INDEXLIST::tearDown(VariantID vid) +{ + (void) vid; + deallocData(m_x); + deallocData(m_list); +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/INDEXLIST.hpp b/src/basic/INDEXLIST.hpp new file mode 100644 index 000000000..db0dbba2b --- /dev/null +++ b/src/basic/INDEXLIST.hpp @@ -0,0 +1,73 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// INDEXLIST kernel reference implementation: +/// +/// Index_type count = 0; +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// if (x[i] < 0.0) { \ +/// list[count++] = i ; \ +/// } +/// } +/// Index_type len = count; +/// + +#ifndef RAJAPerf_Basic_INDEXLIST_HPP +#define RAJAPerf_Basic_INDEXLIST_HPP + +#define INDEXLIST_DATA_SETUP \ + Real_ptr x = m_x; \ + Int_ptr list = m_list; + +#define INDEXLIST_CONDITIONAL \ + x[i] < 0.0 + +#define INDEXLIST_BODY \ + if (INDEXLIST_CONDITIONAL) { \ + list[count++] = i ; \ + } + + +#include "common/KernelBase.hpp" + +namespace rajaperf +{ +class RunParams; + +namespace basic +{ + +class INDEXLIST : public KernelBase +{ +public: + + INDEXLIST(const RunParams& params); + + ~INDEXLIST(); + + void setUp(VariantID vid); + void updateChecksum(VariantID vid); + void tearDown(VariantID vid); + + void runSeqVariant(VariantID vid); + void runOpenMPVariant(VariantID vid); + void runCudaVariant(VariantID vid); + void runHipVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); + +private: + Real_ptr m_x; + Int_ptr m_list; + Index_type m_len; +}; + +} // end namespace basic +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 10eb7c29b..e8c64276e 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -15,6 +15,7 @@ // #include "basic/DAXPY.hpp" #include "basic/IF_QUAD.hpp" +#include "basic/INDEXLIST.hpp" #include "basic/INIT3.hpp" #include "basic/INIT_VIEW1D.hpp" #include "basic/INIT_VIEW1D_OFFSET.hpp" @@ -139,6 +140,7 @@ static const std::string KernelNames [] = // std::string("Basic_DAXPY"), std::string("Basic_IF_QUAD"), + std::string("Basic_INDEXLIST"), std::string("Basic_INIT3"), std::string("Basic_INIT_VIEW1D"), std::string("Basic_INIT_VIEW1D_OFFSET"), @@ -389,6 +391,10 @@ KernelBase* getKernelObject(KernelID kid, kernel = new basic::IF_QUAD(run_params); break; } + case Basic_INDEXLIST : { + kernel = new basic::INDEXLIST(run_params); + break; + } case Basic_INIT3 : { kernel = new basic::INIT3(run_params); break; diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index d6609e460..cc2749747 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -101,6 +101,7 @@ enum KernelID { // Basic_DAXPY = 0, Basic_IF_QUAD, + Basic_INDEXLIST, Basic_INIT3, Basic_INIT_VIEW1D, Basic_INIT_VIEW1D_OFFSET, From 8913ad1b35bfd9a319dd5fa1a15feed52e5b34ba Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 15 Jun 2021 15:57:37 -0700 Subject: [PATCH 005/392] Add openmp base and lambda scans --- src/algorithm/SCAN-OMP.cpp | 43 ++++++++++++++++++++++++++++++++++++++ src/algorithm/SCAN-Seq.cpp | 11 +++++----- src/algorithm/SCAN.cpp | 4 ++++ src/algorithm/SCAN.hpp | 5 +++-- 4 files changed, 55 insertions(+), 8 deletions(-) diff --git a/src/algorithm/SCAN-OMP.cpp b/src/algorithm/SCAN-OMP.cpp index b93e40096..a8a934ae7 100644 --- a/src/algorithm/SCAN-OMP.cpp +++ b/src/algorithm/SCAN-OMP.cpp @@ -30,6 +30,49 @@ void SCAN::runOpenMPVariant(VariantID vid) switch ( vid ) { +#if _OPENMP >= 201811 + case Base_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + SCAN_PROLOGUE; + #pragma omp parallel for reduction(inscan, +:scan_var) + for (Index_type i = ibegin; i < iend; ++i ) { + y[i] = scan_var; + #pragma omp scan 
exclusive(scan_var) + scan_var += x[i]; + } + + } + stopTimer(); + + break; + } + + case Lambda_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + SCAN_PROLOGUE; + auto scan_lam = [=](Index_type i, Real_type& scan_var) { + y[i] = scan_var; + return x[i]; + }; + #pragma omp parallel for reduction(inscan, +:scan_var) + for (Index_type i = ibegin; i < iend; ++i ) { + #pragma omp scan exclusive(scan_var) + scan_var += scan_lam(i, scan_var); + } + + } + stopTimer(); + + break; + } +#endif + case RAJA_OpenMP : { startTimer(); diff --git a/src/algorithm/SCAN-Seq.cpp b/src/algorithm/SCAN-Seq.cpp index cbb6e89cd..13657970f 100644 --- a/src/algorithm/SCAN-Seq.cpp +++ b/src/algorithm/SCAN-Seq.cpp @@ -26,10 +26,6 @@ void SCAN::runSeqVariant(VariantID vid) SCAN_DATA_SETUP; - auto scan_lam = [=](Index_type i) { - SCAN_BODY; - }; - switch ( vid ) { case Base_Seq : { @@ -38,7 +34,7 @@ void SCAN::runSeqVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { SCAN_PROLOGUE; - for (Index_type i = ibegin+1; i < iend; ++i ) { + for (Index_type i = ibegin; i < iend; ++i ) { SCAN_BODY; } @@ -55,7 +51,10 @@ void SCAN::runSeqVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { SCAN_PROLOGUE; - for (Index_type i = ibegin+1; i < iend; ++i ) { + auto scan_lam = [=, &scan_var](Index_type i) { + SCAN_BODY; + }; + for (Index_type i = ibegin; i < iend; ++i ) { scan_lam(i); } diff --git a/src/algorithm/SCAN.cpp b/src/algorithm/SCAN.cpp index 037cca475..c9a8b9847 100644 --- a/src/algorithm/SCAN.cpp +++ b/src/algorithm/SCAN.cpp @@ -28,6 +28,10 @@ SCAN::SCAN(const RunParams& params) setVariantDefined( Lambda_Seq ); setVariantDefined( RAJA_Seq ); +#if defined(_OPENMP) && _OPENMP >= 201811 + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); +#endif setVariantDefined( RAJA_OpenMP ); setVariantDefined( RAJA_CUDA ); diff --git a/src/algorithm/SCAN.hpp b/src/algorithm/SCAN.hpp index b1d1c53fa..25c5556af 100644 --- a/src/algorithm/SCAN.hpp +++ b/src/algorithm/SCAN.hpp @@ -24,10 +24,11 @@ Real_ptr y = m_y; #define SCAN_PROLOGUE \ - y[ibegin] = 0.0; + Real_type scan_var = 0.0; #define SCAN_BODY \ - y[i] = y[i-1] + x[i-1]; + y[i] = scan_var; \ + scan_var += x[i]; #define RAJA_SCAN_ARGS \ RAJA::make_span(x + ibegin, iend - ibegin), \ From 0c7aa1726668a06876d407240e448554cb1ab248 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 15 Jun 2021 21:39:51 -0700 Subject: [PATCH 006/392] Add SCAN base hip and cuda variants --- src/algorithm/SCAN-Cuda.cpp | 54 ++++++++++++++++++++++- src/algorithm/SCAN-Hip.cpp | 87 +++++++++++++++++++++++++++++++++++-- src/algorithm/SCAN.cpp | 2 + 3 files changed, 138 insertions(+), 5 deletions(-) diff --git a/src/algorithm/SCAN-Cuda.cpp b/src/algorithm/SCAN-Cuda.cpp index 01f888b51..594a3bb40 100644 --- a/src/algorithm/SCAN-Cuda.cpp +++ b/src/algorithm/SCAN-Cuda.cpp @@ -12,6 +12,9 @@ #if defined(RAJA_ENABLE_CUDA) +#include "cub/device/device_scan.cuh" +#include "cub/util_allocator.cuh" + #include "common/CudaDataUtils.hpp" #include @@ -45,7 +48,56 @@ void SCAN::runCudaVariant(VariantID vid) SCAN_DATA_SETUP; - if ( vid == RAJA_CUDA ) { + if ( vid == Base_CUDA ) { + + SCAN_DATA_SETUP_CUDA; + + cudaStream_t stream = 0; + + RAJA::operators::plus binary_op; + Real_type init_val = 0.0; + + int len = iend - ibegin; + + // Determine temporary device storage requirements + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cudaErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, + 
temp_storage_bytes, + x+ibegin, + y+ibegin, + binary_op, + init_val, + len, + stream)); + + // Allocate temporary storage + unsigned char* temp_storage; + allocCudaDeviceData(temp_storage, temp_storage_bytes); + d_temp_storage = temp_storage; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + // Run + cudaErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, + temp_storage_bytes, + x+ibegin, + y+ibegin, + binary_op, + init_val, + len, + stream)); + + } + stopTimer(); + + // Free temporary storage + deallocCudaDeviceData(temp_storage); + + SCAN_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { SCAN_DATA_SETUP_CUDA; diff --git a/src/algorithm/SCAN-Hip.cpp b/src/algorithm/SCAN-Hip.cpp index 47a9789af..042ae723a 100644 --- a/src/algorithm/SCAN-Hip.cpp +++ b/src/algorithm/SCAN-Hip.cpp @@ -12,6 +12,14 @@ #if defined(RAJA_ENABLE_HIP) +#if defined(__HIPCC__) +#define ROCPRIM_HIP_API 1 +#include "rocprim/device/device_scan.hpp" +#elif defined(__CUDACC__) +#include "cub/device/device_scan.cuh" +#include "cub/util_allocator.cuh" +#endif + #include "common/HipDataUtils.hpp" #include @@ -28,11 +36,11 @@ namespace algorithm #define SCAN_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(x, m_x, iend*run_reps); \ - allocAndInitHipDeviceData(y, m_y, iend*run_reps); + allocAndInitHipDeviceData(x, m_x, iend); \ + allocAndInitHipDeviceData(y, m_y, iend); #define SCAN_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_y, y, iend*run_reps); \ + getHipDeviceData(m_y, y, iend); \ deallocHipDeviceData(x); \ deallocHipDeviceData(y); @@ -45,7 +53,78 @@ void SCAN::runHipVariant(VariantID vid) SCAN_DATA_SETUP; - if ( vid == RAJA_HIP ) { + if ( vid == Base_HIP ) { + + SCAN_DATA_SETUP_HIP; + + hipStream_t stream = 0; + + RAJA::operators::plus binary_op; + Real_type init_val = 0.0; + + int len = iend - ibegin; + + // Determine temporary device storage requirements + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; +#if defined(__HIPCC__) + hipErrchk(::rocprim::exclusive_scan(d_temp_storage, + temp_storage_bytes, + x+ibegin, + y+ibegin, + init_val, + len, + binary_op, + stream)); +#elif defined(__CUDACC__) + hipErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, + temp_storage_bytes, + x+ibegin, + y+ibegin, + binary_op, + init_val, + len, + stream)); +#endif + + // Allocate temporary storage + unsigned char* temp_storage; + allocHipDeviceData(temp_storage, temp_storage_bytes); + d_temp_storage = temp_storage; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + // Run +#if defined(__HIPCC__) + hipErrchk(::rocprim::exclusive_scan(d_temp_storage, + temp_storage_bytes, + x+ibegin, + y+ibegin, + init_val, + len, + binary_op, + stream)); +#elif defined(__CUDACC__) + hipErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, + temp_storage_bytes, + x+ibegin, + y+ibegin, + binary_op, + init_val, + len, + stream)); +#endif + + } + stopTimer(); + + // Free temporary storage + deallocHipDeviceData(temp_storage); + + SCAN_DATA_TEARDOWN_HIP; + + } else if ( vid == RAJA_HIP ) { SCAN_DATA_SETUP_HIP; diff --git a/src/algorithm/SCAN.cpp b/src/algorithm/SCAN.cpp index c9a8b9847..efc95512a 100644 --- a/src/algorithm/SCAN.cpp +++ b/src/algorithm/SCAN.cpp @@ -34,8 +34,10 @@ SCAN::SCAN(const RunParams& params) #endif setVariantDefined( RAJA_OpenMP ); + setVariantDefined( Base_CUDA ); setVariantDefined( RAJA_CUDA ); + setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); } From f5cf3a960d833c51262972becbed97b2d5d7a16c Mon Sep 17 00:00:00 2001 From: Jason Burmark 
Date: Tue, 15 Jun 2021 22:15:00 -0700 Subject: [PATCH 007/392] Add base cuda and hip INDEXLIST variants --- src/basic/INDEXLIST-Cuda.cpp | 89 +++++++++++++++++++++++++++- src/basic/INDEXLIST-Hip.cpp | 111 ++++++++++++++++++++++++++++++++++- src/basic/INDEXLIST.cpp | 2 + 3 files changed, 200 insertions(+), 2 deletions(-) diff --git a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp index 17b5c8653..f47fc8ff6 100644 --- a/src/basic/INDEXLIST-Cuda.cpp +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -40,6 +40,34 @@ namespace basic deallocCudaDeviceData(list); +__global__ void indexlist_conditional(Real_ptr x, + Int_ptr list, + Index_type* counts, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + counts[i] = (INDEXLIST_CONDITIONAL) ? 1 : 0; + } +} + +__global__ void indexlist_make_list(Int_ptr list, + Index_type* counts, + Index_type* len, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + if (counts[i] != counts[i+1]) { + list[counts[i]] = i; + } + if (i == iend-1) { + *len = counts[i+1]; + } + } +} + + void INDEXLIST::runCudaVariant(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -48,7 +76,66 @@ void INDEXLIST::runCudaVariant(VariantID vid) INDEXLIST_DATA_SETUP; - if ( vid == RAJA_CUDA ) { + if ( vid == Base_CUDA ) { + + INDEXLIST_DATA_SETUP_CUDA; + + Index_type* len; + allocCudaPinnedData(len, 1); + + cudaStream_t stream = RAJA::resources::Cuda::get_default().get_stream(); + + RAJA::operators::plus binary_op; + Index_type init_val = 0; + int scan_size = iend+1 - ibegin; + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cudaErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, + temp_storage_bytes, + counts+ibegin, + counts+ibegin, + binary_op, + init_val, + scan_size, + stream)); + + unsigned char* temp_storage; + allocCudaDeviceData(temp_storage, temp_storage_bytes); + d_temp_storage = temp_storage; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + indexlist_conditional<<>>( + x, list, counts, iend ); + cudaErrchk( cudaGetLastError() ); + + cudaErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, + temp_storage_bytes, + counts+ibegin, + counts+ibegin, + binary_op, + init_val, + scan_size, + stream)); + + indexlist_make_list<<>>( + list, counts, len, iend ); + cudaErrchk( cudaGetLastError() ); + + cudaErrchk( cudaStreamSynchronize(stream) ); + m_len = *len; + + } + stopTimer(); + + deallocCudaDeviceData(temp_storage); + deallocCudaPinnedData(len); + + INDEXLIST_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { INDEXLIST_DATA_SETUP_CUDA; diff --git a/src/basic/INDEXLIST-Hip.cpp b/src/basic/INDEXLIST-Hip.cpp index c0c71fb8f..87f3bc889 100644 --- a/src/basic/INDEXLIST-Hip.cpp +++ b/src/basic/INDEXLIST-Hip.cpp @@ -40,6 +40,34 @@ namespace basic deallocHipDeviceData(list); +__global__ void indexlist_conditional(Real_ptr x, + Int_ptr list, + Index_type* counts, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + counts[i] = (INDEXLIST_CONDITIONAL) ? 
1 : 0; + } +} + +__global__ void indexlist_make_list(Int_ptr list, + Index_type* counts, + Index_type* len, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + if (counts[i] != counts[i+1]) { + list[counts[i]] = i; + } + if (i == iend-1) { + *len = counts[i+1]; + } + } +} + + void INDEXLIST::runHipVariant(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -48,7 +76,88 @@ void INDEXLIST::runHipVariant(VariantID vid) INDEXLIST_DATA_SETUP; - if ( vid == RAJA_HIP ) { + if ( vid == Base_HIP ) { + + INDEXLIST_DATA_SETUP_HIP; + + Index_type* len; + allocHipPinnedData(len, 1); + + hipStream_t stream = RAJA::resources::Hip::get_default().get_stream(); + + RAJA::operators::plus binary_op; + Index_type init_val = 0; + int scan_size = iend+1 - ibegin; + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; +#if defined(__HIPCC__) + hipErrchk(::rocprim::exclusive_scan(d_temp_storage, + temp_storage_bytes, + counts+ibegin, + counts+ibegin, + init_val, + scan_size, + binary_op, + stream)); +#elif defined(__CUDACC__) + hipErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, + temp_storage_bytes, + counts+ibegin, + counts+ibegin, + binary_op, + init_val, + scan_size, + stream)); +#endif + + unsigned char* temp_storage; + allocHipDeviceData(temp_storage, temp_storage_bytes); + d_temp_storage = temp_storage; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + hipLaunchKernelGGL((indexlist_conditional), grid_size, block_size, 0, stream, + x, list, counts, iend ); + hipErrchk( hipGetLastError() ); + +#if defined(__HIPCC__) + hipErrchk(::rocprim::exclusive_scan(d_temp_storage, + temp_storage_bytes, + counts+ibegin, + counts+ibegin, + init_val, + scan_size, + binary_op, + stream)); +#elif defined(__CUDACC__) + hipErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, + temp_storage_bytes, + counts+ibegin, + counts+ibegin, + binary_op, + init_val, + scan_size, + stream)); +#endif + + hipLaunchKernelGGL((indexlist_make_list), grid_size, block_size, 0, stream, + list, counts, len, iend ); + hipErrchk( hipGetLastError() ); + + hipErrchk( hipStreamSynchronize(stream) ); + m_len = *len; + + } + stopTimer(); + + deallocHipDeviceData(temp_storage); + deallocHipPinnedData(len); + + INDEXLIST_DATA_TEARDOWN_HIP; + + } else if ( vid == RAJA_HIP ) { INDEXLIST_DATA_SETUP_HIP; diff --git a/src/basic/INDEXLIST.cpp b/src/basic/INDEXLIST.cpp index a2918a82d..0a05ac48b 100644 --- a/src/basic/INDEXLIST.cpp +++ b/src/basic/INDEXLIST.cpp @@ -30,8 +30,10 @@ INDEXLIST::INDEXLIST(const RunParams& params) setVariantDefined( RAJA_OpenMP ); + setVariantDefined( Base_CUDA ); setVariantDefined( RAJA_CUDA ); + setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); } From 80752bcf9585ddddc5a77b4c9b50035848d28482 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 15 Jun 2021 22:15:11 -0700 Subject: [PATCH 008/392] Consistently init INDEXLIST list --- src/basic/INDEXLIST.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic/INDEXLIST.cpp b/src/basic/INDEXLIST.cpp index 0a05ac48b..c59b3bc41 100644 --- a/src/basic/INDEXLIST.cpp +++ b/src/basic/INDEXLIST.cpp @@ -44,7 +44,7 @@ INDEXLIST::~INDEXLIST() void INDEXLIST::setUp(VariantID vid) { allocAndInitDataRandSign(m_x, getRunSize(), vid); - allocData(m_list, getRunSize()); + allocAndInitData(m_list, getRunSize(), vid); m_len = -1; } From e332f5ea345d9ce978683167d72c4208bc3a8669 Mon Sep 17 
00:00:00 2001 From: Jason Burmark Date: Tue, 15 Jun 2021 22:28:17 -0700 Subject: [PATCH 009/392] Add openmp INDEXLIST variants One of these segfaults with nvcc10+clang11 on rzansel --- src/basic/INDEXLIST-OMP.cpp | 58 +++++++++++++++++++++++++++++++++++++ src/basic/INDEXLIST.cpp | 4 +++ 2 files changed, 62 insertions(+) diff --git a/src/basic/INDEXLIST-OMP.cpp b/src/basic/INDEXLIST-OMP.cpp index 85f919ade..214890020 100644 --- a/src/basic/INDEXLIST-OMP.cpp +++ b/src/basic/INDEXLIST-OMP.cpp @@ -36,6 +36,64 @@ void INDEXLIST::runOpenMPVariant(VariantID vid) switch ( vid ) { +#if _OPENMP >= 201811 + case Base_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Index_type count = 0; + + #pragma omp parallel for reduction(inscan, +:count) + for (Index_type i = ibegin; i < iend; ++i ) { + Index_type inc = 0; + if (INDEXLIST_CONDITIONAL) { + list[count] = i ; + inc = 1; + } + #pragma omp scan exclusive(count) + count += inc; + } + + m_len = count; + + } + stopTimer(); + + break; + } + + case Lambda_OpenMP : { + + auto indexlist_base_lam = [=](Index_type i, Index_type& count) { + Index_type inc = 0; + if (INDEXLIST_CONDITIONAL) { + list[count] = i ; + inc = 1; + } + return inc; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Index_type count = 0; + + #pragma omp parallel for reduction(inscan, +:count) + for (Index_type i = ibegin; i < iend; ++i ) { + #pragma omp scan exclusive(count) + count += indexlist_base_lam(i, count); + } + + m_len = count; + + } + stopTimer(); + + break; + } +#endif + case RAJA_OpenMP : { INDEXLIST_DATA_SETUP_OMP; diff --git a/src/basic/INDEXLIST.cpp b/src/basic/INDEXLIST.cpp index c59b3bc41..ded88859d 100644 --- a/src/basic/INDEXLIST.cpp +++ b/src/basic/INDEXLIST.cpp @@ -28,6 +28,10 @@ INDEXLIST::INDEXLIST(const RunParams& params) setVariantDefined( Lambda_Seq ); setVariantDefined( RAJA_Seq ); +#if defined(_OPENMP) && _OPENMP >= 201811 + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); +#endif setVariantDefined( RAJA_OpenMP ); setVariantDefined( Base_CUDA ); From 64995290760543bde9bf1cb930faf4a992f6f45e Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 22 Jun 2021 13:04:35 -0700 Subject: [PATCH 010/392] Remove 3 loop variants of INDEXLIST --- src/basic/INDEXLIST-Cuda.cpp | 148 +----------------------------- src/basic/INDEXLIST-Hip.cpp | 170 +---------------------------------- src/basic/INDEXLIST-OMP.cpp | 44 --------- src/basic/INDEXLIST.cpp | 7 -- 4 files changed, 2 insertions(+), 367 deletions(-) diff --git a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp index f47fc8ff6..36f70b6f4 100644 --- a/src/basic/INDEXLIST-Cuda.cpp +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -21,155 +21,9 @@ namespace rajaperf namespace basic { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - -#define INDEXLIST_DATA_SETUP_CUDA \ - Index_type* counts; \ - allocCudaDeviceData(counts, getRunSize()+1); \ - allocAndInitCudaDeviceData(x, m_x, iend); \ - allocAndInitCudaDeviceData(list, m_list, iend); - -#define INDEXLIST_DATA_TEARDOWN_CUDA \ - deallocCudaDeviceData(counts); \ - getCudaDeviceData(m_list, list, iend); \ - deallocCudaDeviceData(x); \ - deallocCudaDeviceData(list); - - -__global__ void indexlist_conditional(Real_ptr x, - Int_ptr list, - Index_type* counts, - Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - counts[i] = (INDEXLIST_CONDITIONAL) ? 
1 : 0; - } -} - -__global__ void indexlist_make_list(Int_ptr list, - Index_type* counts, - Index_type* len, - Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - if (counts[i] != counts[i+1]) { - list[counts[i]] = i; - } - if (i == iend-1) { - *len = counts[i+1]; - } - } -} - - void INDEXLIST::runCudaVariant(VariantID vid) { - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - INDEXLIST_DATA_SETUP; - - if ( vid == Base_CUDA ) { - - INDEXLIST_DATA_SETUP_CUDA; - - Index_type* len; - allocCudaPinnedData(len, 1); - - cudaStream_t stream = RAJA::resources::Cuda::get_default().get_stream(); - - RAJA::operators::plus binary_op; - Index_type init_val = 0; - int scan_size = iend+1 - ibegin; - void* d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - cudaErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, - temp_storage_bytes, - counts+ibegin, - counts+ibegin, - binary_op, - init_val, - scan_size, - stream)); - - unsigned char* temp_storage; - allocCudaDeviceData(temp_storage, temp_storage_bytes); - d_temp_storage = temp_storage; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - indexlist_conditional<<>>( - x, list, counts, iend ); - cudaErrchk( cudaGetLastError() ); - - cudaErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, - temp_storage_bytes, - counts+ibegin, - counts+ibegin, - binary_op, - init_val, - scan_size, - stream)); - - indexlist_make_list<<>>( - list, counts, len, iend ); - cudaErrchk( cudaGetLastError() ); - - cudaErrchk( cudaStreamSynchronize(stream) ); - m_len = *len; - - } - stopTimer(); - - deallocCudaDeviceData(temp_storage); - deallocCudaPinnedData(len); - - INDEXLIST_DATA_TEARDOWN_CUDA; - - } else if ( vid == RAJA_CUDA ) { - - INDEXLIST_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum len(0); - - RAJA::forall< RAJA::cuda_exec >( - RAJA::RangeSegment(ibegin, iend), - [=] __device__ (Index_type i) { - counts[i] = (INDEXLIST_CONDITIONAL) ? 
1 : 0; - }); - - RAJA::exclusive_scan_inplace< RAJA::cuda_exec >( - RAJA::make_span(counts+ibegin, iend+1-ibegin)); - - RAJA::forall< RAJA::cuda_exec >( - RAJA::RangeSegment(ibegin, iend), - [=] __device__ (Index_type i) { - if (counts[i] != counts[i+1]) { - list[counts[i]] = i; - len += 1; - } - }); - - m_len = len.get(); - - } - stopTimer(); - - INDEXLIST_DATA_TEARDOWN_CUDA; - - } else { + { std::cout << "\n INDEXLIST : Unknown variant id = " << vid << std::endl; } } diff --git a/src/basic/INDEXLIST-Hip.cpp b/src/basic/INDEXLIST-Hip.cpp index 87f3bc889..c63b115fd 100644 --- a/src/basic/INDEXLIST-Hip.cpp +++ b/src/basic/INDEXLIST-Hip.cpp @@ -21,177 +21,9 @@ namespace rajaperf namespace basic { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - -#define INDEXLIST_DATA_SETUP_HIP \ - Index_type* counts; \ - allocHipDeviceData(counts, getRunSize()+1); \ - allocAndInitHipDeviceData(x, m_x, iend); \ - allocAndInitHipDeviceData(list, m_list, iend); - -#define INDEXLIST_DATA_TEARDOWN_HIP \ - deallocHipDeviceData(counts); \ - getHipDeviceData(m_list, list, iend); \ - deallocHipDeviceData(x); \ - deallocHipDeviceData(list); - - -__global__ void indexlist_conditional(Real_ptr x, - Int_ptr list, - Index_type* counts, - Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - counts[i] = (INDEXLIST_CONDITIONAL) ? 1 : 0; - } -} - -__global__ void indexlist_make_list(Int_ptr list, - Index_type* counts, - Index_type* len, - Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - if (counts[i] != counts[i+1]) { - list[counts[i]] = i; - } - if (i == iend-1) { - *len = counts[i+1]; - } - } -} - - void INDEXLIST::runHipVariant(VariantID vid) { - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - INDEXLIST_DATA_SETUP; - - if ( vid == Base_HIP ) { - - INDEXLIST_DATA_SETUP_HIP; - - Index_type* len; - allocHipPinnedData(len, 1); - - hipStream_t stream = RAJA::resources::Hip::get_default().get_stream(); - - RAJA::operators::plus binary_op; - Index_type init_val = 0; - int scan_size = iend+1 - ibegin; - void* d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; -#if defined(__HIPCC__) - hipErrchk(::rocprim::exclusive_scan(d_temp_storage, - temp_storage_bytes, - counts+ibegin, - counts+ibegin, - init_val, - scan_size, - binary_op, - stream)); -#elif defined(__CUDACC__) - hipErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, - temp_storage_bytes, - counts+ibegin, - counts+ibegin, - binary_op, - init_val, - scan_size, - stream)); -#endif - - unsigned char* temp_storage; - allocHipDeviceData(temp_storage, temp_storage_bytes); - d_temp_storage = temp_storage; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((indexlist_conditional), grid_size, block_size, 0, stream, - x, list, counts, iend ); - hipErrchk( hipGetLastError() ); - -#if defined(__HIPCC__) - hipErrchk(::rocprim::exclusive_scan(d_temp_storage, - temp_storage_bytes, - counts+ibegin, - counts+ibegin, - init_val, - scan_size, - binary_op, - stream)); -#elif defined(__CUDACC__) - hipErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, - temp_storage_bytes, - counts+ibegin, - counts+ibegin, - binary_op, - init_val, - scan_size, - stream)); -#endif - - hipLaunchKernelGGL((indexlist_make_list), grid_size, block_size, 0, stream, - list, counts, len, 
iend ); - hipErrchk( hipGetLastError() ); - - hipErrchk( hipStreamSynchronize(stream) ); - m_len = *len; - - } - stopTimer(); - - deallocHipDeviceData(temp_storage); - deallocHipPinnedData(len); - - INDEXLIST_DATA_TEARDOWN_HIP; - - } else if ( vid == RAJA_HIP ) { - - INDEXLIST_DATA_SETUP_HIP; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum len(0); - - RAJA::forall< RAJA::hip_exec >( - RAJA::RangeSegment(ibegin, iend), - [=] __device__ (Index_type i) { - counts[i] = (INDEXLIST_CONDITIONAL) ? 1 : 0; - }); - - RAJA::exclusive_scan_inplace< RAJA::hip_exec >( - RAJA::make_span(counts+ibegin, iend+1-ibegin)); - - RAJA::forall< RAJA::hip_exec >( - RAJA::RangeSegment(ibegin, iend), - [=] __device__ (Index_type i) { - if (counts[i] != counts[i+1]) { - list[counts[i]] = i; - len += 1; - } - }); - - m_len = len.get(); - - } - stopTimer(); - - INDEXLIST_DATA_TEARDOWN_HIP; - - } else { + { std::cout << "\n INDEXLIST : Unknown variant id = " << vid << std::endl; } } diff --git a/src/basic/INDEXLIST-OMP.cpp b/src/basic/INDEXLIST-OMP.cpp index 214890020..0dfb56e57 100644 --- a/src/basic/INDEXLIST-OMP.cpp +++ b/src/basic/INDEXLIST-OMP.cpp @@ -17,13 +17,6 @@ namespace rajaperf namespace basic { -#define INDEXLIST_DATA_SETUP_OMP \ - Index_type* counts = new Index_type[getRunSize()+1]; - -#define INDEXLIST_DATA_TEARDOWN_OMP \ - delete[] counts; counts = nullptr; - - void INDEXLIST::runOpenMPVariant(VariantID vid) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -94,43 +87,6 @@ void INDEXLIST::runOpenMPVariant(VariantID vid) } #endif - case RAJA_OpenMP : { - - INDEXLIST_DATA_SETUP_OMP; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum len(0); - - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), - [=](Index_type i) { - counts[i] = (INDEXLIST_CONDITIONAL) ? 
1 : 0; - }); - - RAJA::exclusive_scan_inplace( - RAJA::make_span(counts+ibegin, iend+1-ibegin)); - - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), - [=](Index_type i) { - if (counts[i] != counts[i+1]) { - list[counts[i]] = i; - len += 1; - } - }); - - m_len = len.get(); - - } - stopTimer(); - - INDEXLIST_DATA_TEARDOWN_OMP; - - break; - } - default : { std::cout << "\n INDEXLIST : Unknown variant id = " << vid << std::endl; } diff --git a/src/basic/INDEXLIST.cpp b/src/basic/INDEXLIST.cpp index ded88859d..68608b6f3 100644 --- a/src/basic/INDEXLIST.cpp +++ b/src/basic/INDEXLIST.cpp @@ -32,13 +32,6 @@ INDEXLIST::INDEXLIST(const RunParams& params) setVariantDefined( Base_OpenMP ); setVariantDefined( Lambda_OpenMP ); #endif - setVariantDefined( RAJA_OpenMP ); - - setVariantDefined( Base_CUDA ); - setVariantDefined( RAJA_CUDA ); - - setVariantDefined( Base_HIP ); - setVariantDefined( RAJA_HIP ); } INDEXLIST::~INDEXLIST() From 1e93b9787bc51f1a749dc09d3aa46ccbb13a518f Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 22 Jun 2021 13:04:57 -0700 Subject: [PATCH 011/392] Set default sizes to 1M --- src/algorithm/SCAN.cpp | 4 ++-- src/basic/INDEXLIST.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/algorithm/SCAN.cpp b/src/algorithm/SCAN.cpp index efc95512a..c01d5eda9 100644 --- a/src/algorithm/SCAN.cpp +++ b/src/algorithm/SCAN.cpp @@ -21,8 +21,8 @@ namespace algorithm SCAN::SCAN(const RunParams& params) : KernelBase(rajaperf::Algorithm_SCAN, params) { - setDefaultSize(100000); - setDefaultReps(50); + setDefaultSize(1000000); + setDefaultReps(20); setVariantDefined( Base_Seq ); setVariantDefined( Lambda_Seq ); diff --git a/src/basic/INDEXLIST.cpp b/src/basic/INDEXLIST.cpp index 68608b6f3..62b20f47d 100644 --- a/src/basic/INDEXLIST.cpp +++ b/src/basic/INDEXLIST.cpp @@ -21,8 +21,8 @@ namespace basic INDEXLIST::INDEXLIST(const RunParams& params) : KernelBase(rajaperf::Basic_INDEXLIST, params) { - setDefaultSize(100000); - setDefaultReps(1000); + setDefaultSize(1000000); + setDefaultReps(100); setVariantDefined( Base_Seq ); setVariantDefined( Lambda_Seq ); From 85a260ac39f57b90f60a4cb7e0ac310491533c0d Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 22 Jun 2021 13:05:20 -0700 Subject: [PATCH 012/392] Add INDEXLIST_3LOOP kernel --- src/CMakeLists.txt | 2 + src/basic/CMakeLists.txt | 5 + src/basic/INDEXLIST_3LOOP-Cuda.cpp | 180 +++++++++++++++++++++++++ src/basic/INDEXLIST_3LOOP-Hip.cpp | 202 +++++++++++++++++++++++++++++ src/basic/INDEXLIST_3LOOP-OMP.cpp | 174 +++++++++++++++++++++++++ src/basic/INDEXLIST_3LOOP-Seq.cpp | 164 +++++++++++++++++++++++ src/basic/INDEXLIST_3LOOP.cpp | 69 ++++++++++ src/basic/INDEXLIST_3LOOP.hpp | 73 +++++++++++ src/common/RAJAPerfSuite.cpp | 6 + src/common/RAJAPerfSuite.hpp | 1 + 10 files changed, 876 insertions(+) create mode 100644 src/basic/INDEXLIST_3LOOP-Cuda.cpp create mode 100644 src/basic/INDEXLIST_3LOOP-Hip.cpp create mode 100644 src/basic/INDEXLIST_3LOOP-OMP.cpp create mode 100644 src/basic/INDEXLIST_3LOOP-Seq.cpp create mode 100644 src/basic/INDEXLIST_3LOOP.cpp create mode 100644 src/basic/INDEXLIST_3LOOP.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8ac8dd8a0..d38e2f851 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -69,6 +69,8 @@ blt_add_executable( basic/IF_QUAD-OMPTarget.cpp basic/INDEXLIST.cpp basic/INDEXLIST-Seq.cpp + basic/INDEXLIST_3LOOP.cpp + basic/INDEXLIST_3LOOP-Seq.cpp basic/INIT3.cpp basic/INIT3-Seq.cpp basic/INIT3-OMPTarget.cpp diff --git a/src/basic/CMakeLists.txt 
b/src/basic/CMakeLists.txt index b633d714a..d8244c0b2 100644 --- a/src/basic/CMakeLists.txt +++ b/src/basic/CMakeLists.txt @@ -25,6 +25,11 @@ blt_add_library( INDEXLIST-Hip.cpp INDEXLIST-Cuda.cpp INDEXLIST-OMP.cpp + INDEXLIST_3LOOP.cpp + INDEXLIST_3LOOP-Seq.cpp + INDEXLIST_3LOOP-Hip.cpp + INDEXLIST_3LOOP-Cuda.cpp + INDEXLIST_3LOOP-OMP.cpp INIT3.cpp INIT3-Seq.cpp INIT3-Hip.cpp diff --git a/src/basic/INDEXLIST_3LOOP-Cuda.cpp b/src/basic/INDEXLIST_3LOOP-Cuda.cpp new file mode 100644 index 000000000..9ddbd31dd --- /dev/null +++ b/src/basic/INDEXLIST_3LOOP-Cuda.cpp @@ -0,0 +1,180 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INDEXLIST_3LOOP.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#define INDEXLIST_3LOOP_DATA_SETUP_CUDA \ + Index_type* counts; \ + allocCudaDeviceData(counts, getRunSize()+1); \ + allocAndInitCudaDeviceData(x, m_x, iend); \ + allocAndInitCudaDeviceData(list, m_list, iend); + +#define INDEXLIST_3LOOP_DATA_TEARDOWN_CUDA \ + deallocCudaDeviceData(counts); \ + getCudaDeviceData(m_list, list, iend); \ + deallocCudaDeviceData(x); \ + deallocCudaDeviceData(list); + + +__global__ void indexlist_conditional(Real_ptr x, + Int_ptr list, + Index_type* counts, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + counts[i] = (INDEXLIST_3LOOP_CONDITIONAL) ? 
1 : 0; + } +} + +__global__ void indexlist_make_list(Int_ptr list, + Index_type* counts, + Index_type* len, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + if (counts[i] != counts[i+1]) { + list[counts[i]] = i; + } + if (i == iend-1) { + *len = counts[i+1]; + } + } +} + + +void INDEXLIST_3LOOP::runCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + INDEXLIST_3LOOP_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + INDEXLIST_3LOOP_DATA_SETUP_CUDA; + + Index_type* len; + allocCudaPinnedData(len, 1); + + cudaStream_t stream = RAJA::resources::Cuda::get_default().get_stream(); + + RAJA::operators::plus binary_op; + Index_type init_val = 0; + int scan_size = iend+1 - ibegin; + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cudaErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, + temp_storage_bytes, + counts+ibegin, + counts+ibegin, + binary_op, + init_val, + scan_size, + stream)); + + unsigned char* temp_storage; + allocCudaDeviceData(temp_storage, temp_storage_bytes); + d_temp_storage = temp_storage; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + indexlist_conditional<<>>( + x, list, counts, iend ); + cudaErrchk( cudaGetLastError() ); + + cudaErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, + temp_storage_bytes, + counts+ibegin, + counts+ibegin, + binary_op, + init_val, + scan_size, + stream)); + + indexlist_make_list<<>>( + list, counts, len, iend ); + cudaErrchk( cudaGetLastError() ); + + cudaErrchk( cudaStreamSynchronize(stream) ); + m_len = *len; + + } + stopTimer(); + + deallocCudaDeviceData(temp_storage); + deallocCudaPinnedData(len); + + INDEXLIST_3LOOP_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + INDEXLIST_3LOOP_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum len(0); + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), + [=] __device__ (Index_type i) { + counts[i] = (INDEXLIST_3LOOP_CONDITIONAL) ? 1 : 0; + }); + + RAJA::exclusive_scan_inplace< RAJA::cuda_exec >( + RAJA::make_span(counts+ibegin, iend+1-ibegin)); + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), + [=] __device__ (Index_type i) { + if (counts[i] != counts[i+1]) { + list[counts[i]] = i; + len += 1; + } + }); + + m_len = len.get(); + + } + stopTimer(); + + INDEXLIST_3LOOP_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n INDEXLIST_3LOOP : Unknown variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/basic/INDEXLIST_3LOOP-Hip.cpp b/src/basic/INDEXLIST_3LOOP-Hip.cpp new file mode 100644 index 000000000..cf8a29782 --- /dev/null +++ b/src/basic/INDEXLIST_3LOOP-Hip.cpp @@ -0,0 +1,202 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INDEXLIST_3LOOP.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define thread block size for HIP execution + // + const size_t block_size = 256; + + +#define INDEXLIST_3LOOP_DATA_SETUP_HIP \ + Index_type* counts; \ + allocHipDeviceData(counts, getRunSize()+1); \ + allocAndInitHipDeviceData(x, m_x, iend); \ + allocAndInitHipDeviceData(list, m_list, iend); + +#define INDEXLIST_3LOOP_DATA_TEARDOWN_HIP \ + deallocHipDeviceData(counts); \ + getHipDeviceData(m_list, list, iend); \ + deallocHipDeviceData(x); \ + deallocHipDeviceData(list); + + +__global__ void indexlist_conditional(Real_ptr x, + Int_ptr list, + Index_type* counts, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + counts[i] = (INDEXLIST_3LOOP_CONDITIONAL) ? 1 : 0; + } +} + +__global__ void indexlist_make_list(Int_ptr list, + Index_type* counts, + Index_type* len, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + if (counts[i] != counts[i+1]) { + list[counts[i]] = i; + } + if (i == iend-1) { + *len = counts[i+1]; + } + } +} + + +void INDEXLIST_3LOOP::runHipVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + INDEXLIST_3LOOP_DATA_SETUP; + + if ( vid == Base_HIP ) { + + INDEXLIST_3LOOP_DATA_SETUP_HIP; + + Index_type* len; + allocHipPinnedData(len, 1); + + hipStream_t stream = RAJA::resources::Hip::get_default().get_stream(); + + RAJA::operators::plus binary_op; + Index_type init_val = 0; + int scan_size = iend+1 - ibegin; + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; +#if defined(__HIPCC__) + hipErrchk(::rocprim::exclusive_scan(d_temp_storage, + temp_storage_bytes, + counts+ibegin, + counts+ibegin, + init_val, + scan_size, + binary_op, + stream)); +#elif defined(__CUDACC__) + hipErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, + temp_storage_bytes, + counts+ibegin, + counts+ibegin, + binary_op, + init_val, + scan_size, + stream)); +#endif + + unsigned char* temp_storage; + allocHipDeviceData(temp_storage, temp_storage_bytes); + d_temp_storage = temp_storage; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + hipLaunchKernelGGL((indexlist_conditional), grid_size, block_size, 0, stream, + x, list, counts, iend ); + hipErrchk( hipGetLastError() ); + +#if defined(__HIPCC__) + hipErrchk(::rocprim::exclusive_scan(d_temp_storage, + temp_storage_bytes, + counts+ibegin, + counts+ibegin, + init_val, + scan_size, + binary_op, + stream)); +#elif defined(__CUDACC__) + hipErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, + temp_storage_bytes, + counts+ibegin, + counts+ibegin, + binary_op, + init_val, + scan_size, + stream)); +#endif + + hipLaunchKernelGGL((indexlist_make_list), grid_size, block_size, 0, stream, + list, counts, len, iend ); + hipErrchk( hipGetLastError() ); + + hipErrchk( hipStreamSynchronize(stream) ); + m_len = *len; + + } + stopTimer(); + + deallocHipDeviceData(temp_storage); + deallocHipPinnedData(len); + + INDEXLIST_3LOOP_DATA_TEARDOWN_HIP; + + } else if ( vid == RAJA_HIP ) { + + INDEXLIST_3LOOP_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type 
irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum len(0); + + RAJA::forall< RAJA::hip_exec >( + RAJA::RangeSegment(ibegin, iend), + [=] __device__ (Index_type i) { + counts[i] = (INDEXLIST_3LOOP_CONDITIONAL) ? 1 : 0; + }); + + RAJA::exclusive_scan_inplace< RAJA::hip_exec >( + RAJA::make_span(counts+ibegin, iend+1-ibegin)); + + RAJA::forall< RAJA::hip_exec >( + RAJA::RangeSegment(ibegin, iend), + [=] __device__ (Index_type i) { + if (counts[i] != counts[i+1]) { + list[counts[i]] = i; + len += 1; + } + }); + + m_len = len.get(); + + } + stopTimer(); + + INDEXLIST_3LOOP_DATA_TEARDOWN_HIP; + + } else { + std::cout << "\n INDEXLIST_3LOOP : Unknown variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/basic/INDEXLIST_3LOOP-OMP.cpp b/src/basic/INDEXLIST_3LOOP-OMP.cpp new file mode 100644 index 000000000..083dc48bf --- /dev/null +++ b/src/basic/INDEXLIST_3LOOP-OMP.cpp @@ -0,0 +1,174 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INDEXLIST_3LOOP.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +#define INDEXLIST_3LOOP_DATA_SETUP_OMP \ + Index_type* counts = new Index_type[getRunSize()+1]; + +#define INDEXLIST_3LOOP_DATA_TEARDOWN_OMP \ + delete[] counts; counts = nullptr; + + +void INDEXLIST_3LOOP::runOpenMPVariant(VariantID vid) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + INDEXLIST_3LOOP_DATA_SETUP; + + switch ( vid ) { + +#if _OPENMP >= 201811 + case Base_OpenMP : { + + INDEXLIST_3LOOP_DATA_SETUP_OMP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for + for (Index_type i = ibegin; i < iend; ++i ) { + counts[i] = (INDEXLIST_3LOOP_CONDITIONAL) ? 1 : 0; + } + + Index_type count = 0; + + #pragma omp parallel for reduction(inscan, +:count) + for (Index_type i = ibegin; i < iend+1; ++i ) { + Index_type inc = counts[i]; + counts[i] = count; + #pragma omp scan exclusive(count) + count += inc; + } + + #pragma omp parallel for + for (Index_type i = ibegin; i < iend; ++i ) { + if (counts[i] != counts[i+1]) { + list[counts[i]] = i; + } + } + + m_len = counts[iend]; + + } + stopTimer(); + + INDEXLIST_3LOOP_DATA_TEARDOWN_OMP; + + break; + } + + case Lambda_OpenMP : { + + INDEXLIST_3LOOP_DATA_SETUP_OMP; + + auto indexlist_conditional_lam = [=](Index_type i) { + counts[i] = (INDEXLIST_3LOOP_CONDITIONAL) ? 
1 : 0; + }; + + auto indexlist_make_list_lam = [=](Index_type i) { + if (counts[i] != counts[i+1]) { + list[counts[i]] = i; + } + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for + for (Index_type i = ibegin; i < iend; ++i ) { + indexlist_conditional_lam(i); + } + + Index_type count = 0; + + #pragma omp parallel for reduction(inscan, +:count) + for (Index_type i = ibegin; i < iend+1; ++i ) { + Index_type inc = counts[i]; + counts[i] = count; + #pragma omp scan exclusive(count) + count += inc; + } + + #pragma omp parallel for + for (Index_type i = ibegin; i < iend; ++i ) { + indexlist_make_list_lam(i); + } + + m_len = counts[iend]; + + } + stopTimer(); + + INDEXLIST_3LOOP_DATA_TEARDOWN_OMP; + + break; + } +#endif + + case RAJA_OpenMP : { + + INDEXLIST_3LOOP_DATA_SETUP_OMP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum len(0); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + counts[i] = (INDEXLIST_3LOOP_CONDITIONAL) ? 1 : 0; + }); + + RAJA::exclusive_scan_inplace( + RAJA::make_span(counts+ibegin, iend+1-ibegin)); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + if (counts[i] != counts[i+1]) { + list[counts[i]] = i; + len += 1; + } + }); + + m_len = len.get(); + + } + stopTimer(); + + INDEXLIST_3LOOP_DATA_TEARDOWN_OMP; + + break; + } + + default : { + std::cout << "\n INDEXLIST_3LOOP : Unknown variant id = " << vid << std::endl; + } + + } + +#endif +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/INDEXLIST_3LOOP-Seq.cpp b/src/basic/INDEXLIST_3LOOP-Seq.cpp new file mode 100644 index 000000000..0c9d02426 --- /dev/null +++ b/src/basic/INDEXLIST_3LOOP-Seq.cpp @@ -0,0 +1,164 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INDEXLIST_3LOOP.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +#define INDEXLIST_3LOOP_DATA_SETUP_Seq \ + Index_type* counts = new Index_type[getRunSize()+1]; + +#define INDEXLIST_3LOOP_DATA_TEARDOWN_Seq \ + delete[] counts; counts = nullptr; + + + +void INDEXLIST_3LOOP::runSeqVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + INDEXLIST_3LOOP_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + INDEXLIST_3LOOP_DATA_SETUP_Seq; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + counts[i] = (INDEXLIST_3LOOP_CONDITIONAL) ? 1 : 0; + } + + Index_type count = 0; + + for (Index_type i = ibegin; i < iend+1; ++i ) { + Index_type inc = counts[i]; + counts[i] = count; + count += inc; + } + + for (Index_type i = ibegin; i < iend; ++i ) { + if (counts[i] != counts[i+1]) { + list[counts[i]] = i; + } + } + + m_len = counts[iend]; + + } + stopTimer(); + + INDEXLIST_3LOOP_DATA_TEARDOWN_Seq; + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + INDEXLIST_3LOOP_DATA_SETUP_Seq; + + auto indexlist_conditional_lam = [=](Index_type i) { + counts[i] = (INDEXLIST_3LOOP_CONDITIONAL) ? 
1 : 0; + }; + + auto indexlist_make_list_lam = [=](Index_type i) { + if (counts[i] != counts[i+1]) { + list[counts[i]] = i; + } + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + indexlist_conditional_lam(i); + } + + Index_type count = 0; + + for (Index_type i = ibegin; i < iend+1; ++i ) { + Index_type inc = counts[i]; + counts[i] = count; + count += inc; + } + + for (Index_type i = ibegin; i < iend; ++i ) { + indexlist_make_list_lam(i); + } + + m_len = counts[iend]; + + } + stopTimer(); + + INDEXLIST_3LOOP_DATA_TEARDOWN_Seq; + + break; + } + + case RAJA_Seq : { + + INDEXLIST_3LOOP_DATA_SETUP_Seq; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum len(0); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + counts[i] = (INDEXLIST_3LOOP_CONDITIONAL) ? 1 : 0; + }); + + RAJA::exclusive_scan_inplace( + RAJA::make_span(counts+ibegin, iend+1-ibegin)); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + if (counts[i] != counts[i+1]) { + list[counts[i]] = i; + len += 1; + } + }); + + m_len = len.get(); + + } + stopTimer(); + + INDEXLIST_3LOOP_DATA_TEARDOWN_Seq; + + break; + } +#endif + + default : { + std::cout << "\n INDEXLIST_3LOOP : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/INDEXLIST_3LOOP.cpp b/src/basic/INDEXLIST_3LOOP.cpp new file mode 100644 index 000000000..5857bce5f --- /dev/null +++ b/src/basic/INDEXLIST_3LOOP.cpp @@ -0,0 +1,69 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INDEXLIST_3LOOP.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +namespace rajaperf +{ +namespace basic +{ + + +INDEXLIST_3LOOP::INDEXLIST_3LOOP(const RunParams& params) + : KernelBase(rajaperf::Basic_INDEXLIST_3LOOP, params) +{ + setDefaultSize(1000000); + setDefaultReps(100); + + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); + +#if defined(_OPENMP) && _OPENMP >= 201811 + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); +#endif + setVariantDefined( RAJA_OpenMP ); + + setVariantDefined( Base_CUDA ); + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( Base_HIP ); + setVariantDefined( RAJA_HIP ); +} + +INDEXLIST_3LOOP::~INDEXLIST_3LOOP() +{ +} + +void INDEXLIST_3LOOP::setUp(VariantID vid) +{ + allocAndInitDataRandSign(m_x, getRunSize(), vid); + allocAndInitData(m_list, getRunSize(), vid); + m_len = -1; +} + +void INDEXLIST_3LOOP::updateChecksum(VariantID vid) +{ + checksum[vid] += calcChecksum(m_list, getRunSize()); + checksum[vid] += Checksum_type(m_len); +} + +void INDEXLIST_3LOOP::tearDown(VariantID vid) +{ + (void) vid; + deallocData(m_x); + deallocData(m_list); +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/INDEXLIST_3LOOP.hpp b/src/basic/INDEXLIST_3LOOP.hpp new file mode 100644 index 000000000..4c1315fac --- /dev/null +++ b/src/basic/INDEXLIST_3LOOP.hpp @@ -0,0 +1,73 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// INDEXLIST_3LOOP kernel reference implementation: +/// +/// Index_type count = 0; +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// if (x[i] < 0.0) { \ +/// list[count++] = i ; \ +/// } +/// } +/// Index_type len = count; +/// + +#ifndef RAJAPerf_Basic_INDEXLIST_3LOOP_HPP +#define RAJAPerf_Basic_INDEXLIST_3LOOP_HPP + +#define INDEXLIST_3LOOP_DATA_SETUP \ + Real_ptr x = m_x; \ + Int_ptr list = m_list; + +#define INDEXLIST_3LOOP_CONDITIONAL \ + x[i] < 0.0 + +#define INDEXLIST_3LOOP_BODY \ + if (INDEXLIST_3LOOP_CONDITIONAL) { \ + list[count++] = i ; \ + } + + +#include "common/KernelBase.hpp" + +namespace rajaperf +{ +class RunParams; + +namespace basic +{ + +class INDEXLIST_3LOOP : public KernelBase +{ +public: + + INDEXLIST_3LOOP(const RunParams& params); + + ~INDEXLIST_3LOOP(); + + void setUp(VariantID vid); + void updateChecksum(VariantID vid); + void tearDown(VariantID vid); + + void runSeqVariant(VariantID vid); + void runOpenMPVariant(VariantID vid); + void runCudaVariant(VariantID vid); + void runHipVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); + +private: + Real_ptr m_x; + Int_ptr m_list; + Index_type m_len; +}; + +} // end namespace basic +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index e8c64276e..79acd7a01 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -16,6 +16,7 @@ #include "basic/DAXPY.hpp" #include "basic/IF_QUAD.hpp" #include "basic/INDEXLIST.hpp" +#include "basic/INDEXLIST_3LOOP.hpp" #include "basic/INIT3.hpp" #include "basic/INIT_VIEW1D.hpp" #include "basic/INIT_VIEW1D_OFFSET.hpp" @@ -141,6 +142,7 @@ static const std::string KernelNames [] = std::string("Basic_DAXPY"), std::string("Basic_IF_QUAD"), std::string("Basic_INDEXLIST"), + std::string("Basic_INDEXLIST_3LOOP"), std::string("Basic_INIT3"), std::string("Basic_INIT_VIEW1D"), std::string("Basic_INIT_VIEW1D_OFFSET"), @@ -395,6 +397,10 @@ KernelBase* getKernelObject(KernelID kid, kernel = new basic::INDEXLIST(run_params); break; } + case Basic_INDEXLIST_3LOOP : { + kernel = new basic::INDEXLIST_3LOOP(run_params); + break; + } case Basic_INIT3 : { kernel = new basic::INIT3(run_params); break; diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index cc2749747..2dc402a9a 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -102,6 +102,7 @@ enum KernelID { Basic_DAXPY = 0, Basic_IF_QUAD, Basic_INDEXLIST, + Basic_INDEXLIST_3LOOP, Basic_INIT3, Basic_INIT_VIEW1D, Basic_INIT_VIEW1D_OFFSET, From e141520331412f4adb776fc450a1542dfbb9a4ba Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 22 Jun 2021 13:27:27 -0700 Subject: [PATCH 013/392] Increase default SCAN reps --- src/algorithm/SCAN.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/algorithm/SCAN.cpp b/src/algorithm/SCAN.cpp index c01d5eda9..f4fd90f5b 100644 --- a/src/algorithm/SCAN.cpp +++ b/src/algorithm/SCAN.cpp @@ -22,7 +22,7 @@ SCAN::SCAN(const RunParams& params) : KernelBase(rajaperf::Algorithm_SCAN, params) { setDefaultSize(1000000); - setDefaultReps(20); + setDefaultReps(100); setVariantDefined( Base_Seq ); setVariantDefined( Lambda_Seq ); From 5a8aa8f2af1485bb0e57063c6964d6fd8d385df3 Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Wed, 7 Jul 2021 11:44:13 -0700 Subject: [PATCH 
014/392] Adding --checkrun -sp to debug builds; Adding --dryrun for device builds. --- Dockerfile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Dockerfile b/Dockerfile index 220b3cb4b..02434f94e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -20,6 +20,7 @@ COPY --chown=axom:axom . /home/axom/workspace WORKDIR /home/axom/workspace RUN mkdir build && cd build && cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_BUILD_TYPE=Debug -DENABLE_WARNINGS=On -DENABLE_COVERAGE=On -DENABLE_OPENMP=On .. RUN cd build && make -j 16 +RUN cd build && ./bin/raja-perf.exe --checkrun -sp FROM axom/compilers:gcc-6 AS gcc6 ENV GTEST_COLOR=1 @@ -59,6 +60,7 @@ COPY --chown=axom:axom . /home/axom/workspace WORKDIR /home/axom/workspace RUN mkdir build && cd build && cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Debug -DENABLE_OPENMP=On -DCMAKE_CXX_FLAGS=-fsanitize=address .. RUN cd build && make -j 16 +RUN cd build && ./bin/raja-perf.exe --checkrun -sp FROM axom/compilers:nvcc-10.2 AS nvcc10 ENV GTEST_COLOR=1 @@ -66,6 +68,7 @@ COPY --chown=axom:axom . /home/axom/workspace WORKDIR /home/axom/workspace RUN mkdir build && cd build && cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 .. RUN cd build && make -j 2 +RUN cd build && ./bin/raja-perf.exe --dryrun FROM axom/compilers:nvcc-10.2 AS nvcc10-debug ENV GTEST_COLOR=1 @@ -73,6 +76,7 @@ COPY --chown=axom:axom . /home/axom/workspace WORKDIR /home/axom/workspace RUN mkdir build && cd build && cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_BUILD_TYPE=Debug -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 .. RUN cd build && make -j 2 +RUN cd build && ./bin/raja-perf.exe --dryrun FROM axom/compilers:rocm AS hip ENV GTEST_COLOR=1 @@ -81,6 +85,7 @@ WORKDIR /home/axom/workspace ENV HCC_AMDGPU_TARGET=gfx900 RUN mkdir build && cd build && cmake -DROCM_ROOT_DIR=/opt/rocm/include -DHIP_RUNTIME_INCLUDE_DIRS="/opt/rocm/include;/opt/rocm/hip/include" -DENABLE_HIP=On -DENABLE_OPENMP=Off -DENABLE_CUDA=Off -DENABLE_WARNINGS_AS_ERRORS=Off -DHIP_HIPCC_FLAGS=-fPIC .. 
RUN cd build && make -j 16 +RUN cd build && ./bin/raja-perf.exe --dryrun FROM axom/compilers:oneapi AS sycl ENV GTEST_COLOR=1 From 61ea6c866e6de93be2670bc5d8a811e771d2f59f Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 7 Jul 2021 14:15:29 -0700 Subject: [PATCH 015/392] Add kernel info to scan kernels --- src/algorithm/SCAN.cpp | 9 +++++++++ src/basic/INDEXLIST.cpp | 12 ++++++++++++ src/basic/INDEXLIST_3LOOP.cpp | 17 +++++++++++++++++ 3 files changed, 38 insertions(+) diff --git a/src/algorithm/SCAN.cpp b/src/algorithm/SCAN.cpp index f4fd90f5b..b36495117 100644 --- a/src/algorithm/SCAN.cpp +++ b/src/algorithm/SCAN.cpp @@ -24,6 +24,15 @@ SCAN::SCAN(const RunParams& params) setDefaultSize(1000000); setDefaultReps(100); + setProblemSize( getRunSize() ); + + setItsPerRep( getProblemSize() ); + setKernelsPerRep(1); + setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getRunSize() ); + setFLOPsPerRep(1); + + setUsesFeature(Scan); + setVariantDefined( Base_Seq ); setVariantDefined( Lambda_Seq ); setVariantDefined( RAJA_Seq ); diff --git a/src/basic/INDEXLIST.cpp b/src/basic/INDEXLIST.cpp index 62b20f47d..e5e6ebc60 100644 --- a/src/basic/INDEXLIST.cpp +++ b/src/basic/INDEXLIST.cpp @@ -24,6 +24,18 @@ INDEXLIST::INDEXLIST(const RunParams& params) setDefaultSize(1000000); setDefaultReps(100); + setProblemSize( getRunSize() ); + + setItsPerRep( getProblemSize() ); + setKernelsPerRep(1); + setBytesPerRep( (1*sizeof(Index_type) + 1*sizeof(Index_type)) + + (1*sizeof(Int_type) + 0*sizeof(Int_type)) * getRunSize() / 2 + // about 50% output + (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getRunSize() ); + setFLOPsPerRep(0); + + setUsesFeature(Forall); + setUsesFeature(Scan); + setVariantDefined( Base_Seq ); setVariantDefined( Lambda_Seq ); setVariantDefined( RAJA_Seq ); diff --git a/src/basic/INDEXLIST_3LOOP.cpp b/src/basic/INDEXLIST_3LOOP.cpp index 5857bce5f..c75d0aae9 100644 --- a/src/basic/INDEXLIST_3LOOP.cpp +++ b/src/basic/INDEXLIST_3LOOP.cpp @@ -24,6 +24,23 @@ INDEXLIST_3LOOP::INDEXLIST_3LOOP(const RunParams& params) setDefaultSize(1000000); setDefaultReps(100); + setProblemSize( getRunSize() ); + + setItsPerRep( getProblemSize() ); + setKernelsPerRep(3); + setBytesPerRep( (1*sizeof(Int_type) + 0*sizeof(Int_type)) * getRunSize() + + (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getRunSize() + + + (1*sizeof(Index_type) + 1*sizeof(Index_type)) + + (1*sizeof(Int_type) + 1*sizeof(Int_type)) * (getRunSize()+1) + + + (0*sizeof(Int_type) + 1*sizeof(Int_type)) * (getRunSize()+1) + + (1*sizeof(Int_type) + 0*sizeof(Int_type)) * getRunSize() / 2 ); // about 50% output + setFLOPsPerRep(0); + + setUsesFeature(Forall); + setUsesFeature(Scan); + setVariantDefined( Base_Seq ); setVariantDefined( Lambda_Seq ); setVariantDefined( RAJA_Seq ); From 319e78413a8450b38291b04688aa86729624071a Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 7 Jul 2021 14:16:11 -0700 Subject: [PATCH 016/392] Update use of defines in INDEXLIST_3LOOP --- src/basic/INDEXLIST_3LOOP-Cuda.cpp | 4 +--- src/basic/INDEXLIST_3LOOP-Hip.cpp | 4 +--- src/basic/INDEXLIST_3LOOP-OMP.cpp | 8 ++------ src/basic/INDEXLIST_3LOOP-Seq.cpp | 8 ++------ src/basic/INDEXLIST_3LOOP.hpp | 6 +++--- 5 files changed, 9 insertions(+), 21 deletions(-) diff --git a/src/basic/INDEXLIST_3LOOP-Cuda.cpp b/src/basic/INDEXLIST_3LOOP-Cuda.cpp index 9ddbd31dd..c769eec4c 100644 --- a/src/basic/INDEXLIST_3LOOP-Cuda.cpp +++ b/src/basic/INDEXLIST_3LOOP-Cuda.cpp @@ -58,9 +58,7 @@ __global__ void indexlist_make_list(Int_ptr list, { Index_type i = blockIdx.x * 
blockDim.x + threadIdx.x; if (i < iend) { - if (counts[i] != counts[i+1]) { - list[counts[i]] = i; - } + INDEXLIST_3LOOP_MAKE_LIST; if (i == iend-1) { *len = counts[i+1]; } diff --git a/src/basic/INDEXLIST_3LOOP-Hip.cpp b/src/basic/INDEXLIST_3LOOP-Hip.cpp index cf8a29782..101089844 100644 --- a/src/basic/INDEXLIST_3LOOP-Hip.cpp +++ b/src/basic/INDEXLIST_3LOOP-Hip.cpp @@ -58,9 +58,7 @@ __global__ void indexlist_make_list(Int_ptr list, { Index_type i = blockIdx.x * blockDim.x + threadIdx.x; if (i < iend) { - if (counts[i] != counts[i+1]) { - list[counts[i]] = i; - } + INDEXLIST_3LOOP_MAKE_LIST; if (i == iend-1) { *len = counts[i+1]; } diff --git a/src/basic/INDEXLIST_3LOOP-OMP.cpp b/src/basic/INDEXLIST_3LOOP-OMP.cpp index 083dc48bf..32a581b14 100644 --- a/src/basic/INDEXLIST_3LOOP-OMP.cpp +++ b/src/basic/INDEXLIST_3LOOP-OMP.cpp @@ -61,9 +61,7 @@ void INDEXLIST_3LOOP::runOpenMPVariant(VariantID vid) #pragma omp parallel for for (Index_type i = ibegin; i < iend; ++i ) { - if (counts[i] != counts[i+1]) { - list[counts[i]] = i; - } + INDEXLIST_3LOOP_MAKE_LIST; } m_len = counts[iend]; @@ -85,9 +83,7 @@ void INDEXLIST_3LOOP::runOpenMPVariant(VariantID vid) }; auto indexlist_make_list_lam = [=](Index_type i) { - if (counts[i] != counts[i+1]) { - list[counts[i]] = i; - } + INDEXLIST_3LOOP_MAKE_LIST; }; startTimer(); diff --git a/src/basic/INDEXLIST_3LOOP-Seq.cpp b/src/basic/INDEXLIST_3LOOP-Seq.cpp index 0c9d02426..ffe87ba57 100644 --- a/src/basic/INDEXLIST_3LOOP-Seq.cpp +++ b/src/basic/INDEXLIST_3LOOP-Seq.cpp @@ -55,9 +55,7 @@ void INDEXLIST_3LOOP::runSeqVariant(VariantID vid) } for (Index_type i = ibegin; i < iend; ++i ) { - if (counts[i] != counts[i+1]) { - list[counts[i]] = i; - } + INDEXLIST_3LOOP_MAKE_LIST; } m_len = counts[iend]; @@ -80,9 +78,7 @@ void INDEXLIST_3LOOP::runSeqVariant(VariantID vid) }; auto indexlist_make_list_lam = [=](Index_type i) { - if (counts[i] != counts[i+1]) { - list[counts[i]] = i; - } + INDEXLIST_3LOOP_MAKE_LIST; }; startTimer(); diff --git a/src/basic/INDEXLIST_3LOOP.hpp b/src/basic/INDEXLIST_3LOOP.hpp index 4c1315fac..03bfc206a 100644 --- a/src/basic/INDEXLIST_3LOOP.hpp +++ b/src/basic/INDEXLIST_3LOOP.hpp @@ -28,9 +28,9 @@ #define INDEXLIST_3LOOP_CONDITIONAL \ x[i] < 0.0 -#define INDEXLIST_3LOOP_BODY \ - if (INDEXLIST_3LOOP_CONDITIONAL) { \ - list[count++] = i ; \ +#define INDEXLIST_3LOOP_MAKE_LIST \ + if (counts[i] != counts[i+1]) { \ + list[counts[i]] = i ; \ } From c60540f36836569f2058482096f8b91c9ae51603 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 7 Jul 2021 14:16:29 -0700 Subject: [PATCH 017/392] Update reference implementation in INDEXLIST_3LOOP --- src/basic/INDEXLIST_3LOOP.hpp | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/basic/INDEXLIST_3LOOP.hpp b/src/basic/INDEXLIST_3LOOP.hpp index 03bfc206a..8974ee3bf 100644 --- a/src/basic/INDEXLIST_3LOOP.hpp +++ b/src/basic/INDEXLIST_3LOOP.hpp @@ -9,13 +9,24 @@ /// /// INDEXLIST_3LOOP kernel reference implementation: /// +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// counts[i] = (x[i] < 0.0) ? 
1 : 0; +/// } +/// /// Index_type count = 0; +/// for (Index_type i = ibegin; i < iend+1; ++i ) { +/// Index_type inc = counts[i]; +/// counts[i] = count; +/// count += inc; +/// } +/// /// for (Index_type i = ibegin; i < iend; ++i ) { -/// if (x[i] < 0.0) { \ -/// list[count++] = i ; \ +/// if (counts[i] != counts[i+1]) { +/// list[counts[i]] = i; /// } /// } -/// Index_type len = count; +/// +/// Index_type len = counts[iend]; /// #ifndef RAJAPerf_Basic_INDEXLIST_3LOOP_HPP From c041d45a3cd90c9a066addd98e4a28731b87ad00 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 7 Jul 2021 14:26:13 -0700 Subject: [PATCH 018/392] correct INDEXLIST_3LOOP ItsPerRep --- src/basic/INDEXLIST_3LOOP.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic/INDEXLIST_3LOOP.cpp b/src/basic/INDEXLIST_3LOOP.cpp index c75d0aae9..a89314754 100644 --- a/src/basic/INDEXLIST_3LOOP.cpp +++ b/src/basic/INDEXLIST_3LOOP.cpp @@ -26,7 +26,7 @@ INDEXLIST_3LOOP::INDEXLIST_3LOOP(const RunParams& params) setProblemSize( getRunSize() ); - setItsPerRep( getProblemSize() ); + setItsPerRep( 3 * getProblemSize() + 1 ); setKernelsPerRep(3); setBytesPerRep( (1*sizeof(Int_type) + 0*sizeof(Int_type)) * getRunSize() + (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getRunSize() + From 2cd2eacf2d52841fb5cc99875d118de1da061ee5 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 7 Jul 2021 14:32:26 -0700 Subject: [PATCH 019/392] Ignore unused vars in conditionally compiled omp5 code --- src/basic/INDEXLIST-OMP.cpp | 1 + src/common/RAJAPerfSuite.hpp | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/src/basic/INDEXLIST-OMP.cpp b/src/basic/INDEXLIST-OMP.cpp index 0dfb56e57..c7b200e12 100644 --- a/src/basic/INDEXLIST-OMP.cpp +++ b/src/basic/INDEXLIST-OMP.cpp @@ -88,6 +88,7 @@ void INDEXLIST::runOpenMPVariant(VariantID vid) #endif default : { + ignore_unused(run_reps, ibegin, iend, x, list); std::cout << "\n INDEXLIST : Unknown variant id = " << vid << std::endl; } diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 78aff9e01..124060aa9 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -320,6 +320,16 @@ const std::string& getFeatureName(FeatureID vid); */ KernelBase* getKernelObject(KernelID kid, const RunParams& run_params); +/*! + ******************************************************************************* + * + * \brief Empty function used to squash compiler warnings for unused variables. + * + ******************************************************************************* + */ +template < typename... Ts > +inline void ignore_unused(Ts&&...) 
{ } + } // closing brace for rajaperf namespace #endif // closing endif for header file include guard From 68ac598ee06b1db9a8de55a460a7c8edb481c295 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 7 Jul 2021 14:32:37 -0700 Subject: [PATCH 020/392] Fix comment in INDEXLIST --- src/basic/INDEXLIST.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/basic/INDEXLIST.hpp b/src/basic/INDEXLIST.hpp index db0dbba2b..f9c09058f 100644 --- a/src/basic/INDEXLIST.hpp +++ b/src/basic/INDEXLIST.hpp @@ -11,8 +11,8 @@ /// /// Index_type count = 0; /// for (Index_type i = ibegin; i < iend; ++i ) { -/// if (x[i] < 0.0) { \ -/// list[count++] = i ; \ +/// if (x[i] < 0.0) { +/// list[count++] = i ; /// } /// } /// Index_type len = count; From 19c58fb5297f938201251323b93fe85558e36443 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 7 Jul 2021 14:36:44 -0700 Subject: [PATCH 021/392] Fix SCAN FLOPsPerRep --- src/algorithm/SCAN.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/algorithm/SCAN.cpp b/src/algorithm/SCAN.cpp index b36495117..aa4695fb0 100644 --- a/src/algorithm/SCAN.cpp +++ b/src/algorithm/SCAN.cpp @@ -29,7 +29,7 @@ SCAN::SCAN(const RunParams& params) setItsPerRep( getProblemSize() ); setKernelsPerRep(1); setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getRunSize() ); - setFLOPsPerRep(1); + setFLOPsPerRep(1 * getRunSize()); setUsesFeature(Scan); From bed9e96e052c650898bc16ec2d3c877697458be3 Mon Sep 17 00:00:00 2001 From: Mike Date: Wed, 7 Jul 2021 16:11:28 -0700 Subject: [PATCH 022/392] Adding -sp to release builds. --- Dockerfile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index 02434f94e..53e3c9416 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,7 +12,7 @@ WORKDIR /home/axom/workspace RUN ls RUN mkdir build && cd build && cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_WARNINGS=On -DENABLE_OPENMP=On -DRAJA_DEPRECATED_TESTS=On .. RUN cd build && make -j 16 -RUN cd build && ./bin/raja-perf.exe +RUN cd build && ./bin/raja-perf.exe -sp FROM axom/compilers:gcc-5 AS gcc5-debug ENV GTEST_COLOR=1 @@ -28,7 +28,7 @@ COPY --chown=axom:axom . /home/axom/workspace WORKDIR /home/axom/workspace RUN mkdir build && cd build && cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_WARNINGS=On -DENABLE_OPENMP=On -DRAJA_ENABLE_RUNTIME_PLUGINS=On .. RUN cd build && make -j 16 -RUN cd build && ./bin/raja-perf.exe +RUN cd build && ./bin/raja-perf.exe -sp FROM axom/compilers:gcc-7 AS gcc7 ENV GTEST_COLOR=1 @@ -36,7 +36,7 @@ COPY --chown=axom:axom . /home/axom/workspace WORKDIR /home/axom/workspace RUN mkdir build && cd build && cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_WARNINGS=On -DENABLE_OPENMP=On .. RUN cd build && make -j 16 -RUN cd build && ./bin/raja-perf.exe +RUN cd build && ./bin/raja-perf.exe -sp FROM axom/compilers:gcc-8 AS gcc8 ENV GTEST_COLOR=1 @@ -44,7 +44,7 @@ COPY --chown=axom:axom . /home/axom/workspace WORKDIR /home/axom/workspace RUN mkdir build && cd build && cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_WARNINGS=On -DENABLE_OPENMP=On -DRAJA_ENABLE_BOUNDS_CHECK=ON .. RUN cd build && make -j 16 -RUN cd build && ./bin/raja-perf.exe +RUN cd build && ./bin/raja-perf.exe -sp FROM axom/compilers:clang-9 AS clang9 ENV GTEST_COLOR=1 @@ -52,7 +52,7 @@ COPY --chown=axom:axom . /home/axom/workspace WORKDIR /home/axom/workspace RUN mkdir build && cd build && cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_FLAGS=-fmodules -DENABLE_OPENMP=On .. 
RUN cd build && make -j 16 -RUN cd build && ./bin/raja-perf.exe +RUN cd build && ./bin/raja-perf.exe -sp FROM axom/compilers:clang-9 AS clang9-debug ENV GTEST_COLOR=1 From 2e98d31a22ee49143a3ef1f225af8c9145e1e907 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 8 Jul 2021 11:18:23 -0700 Subject: [PATCH 023/392] Add base cuda INDEXLIST implementation This is a somewhat naive implementation where each block waits for the previous to complete. This may be made better by reading multiple previous blocks and completing parts of the grid scan multiple times. It would also help to reduce the number of communicating blocks by assigning more work to a single block. Note that blocks are assigned as we come across them with atomics instead of via blockIdx so blocks only wait on other blocks that are already running. --- src/basic/INDEXLIST-Cuda.cpp | 206 ++++++++++++++++++++++++++++++++++- src/basic/INDEXLIST.cpp | 2 + 2 files changed, 207 insertions(+), 1 deletion(-) diff --git a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp index 36f70b6f4..8dae5f44e 100644 --- a/src/basic/INDEXLIST-Cuda.cpp +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -21,9 +21,213 @@ namespace rajaperf namespace basic { + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#define INDEXLIST_DATA_SETUP_CUDA \ + allocAndInitCudaDeviceData(x, m_x, iend); \ + allocAndInitCudaDeviceData(list, m_list, iend); + +#define INDEXLIST_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_list, list, iend); \ + deallocCudaDeviceData(x); \ + deallocCudaDeviceData(list); + +struct pair +{ + Index_type first, second; +}; + +__device__ pair block_scan(const Index_type inc) +{ + extern __shared__ volatile Index_type s_thread_counts[ ]; + + Index_type val = inc; + s_thread_counts[ threadIdx.x ] = val; + __syncthreads(); + + for ( int i = 1; i < blockDim.x; i *= 2 ) { + const bool participate = threadIdx.x & i; + const int prior_id = threadIdx.x & ~(i-1) - 1; + if ( participate ) { + val = s_thread_counts[ prior_id ] + s_thread_counts[ threadIdx.x ]; + s_thread_counts[ threadIdx.x ] = val; + } + __syncthreads(); + } + + Index_type prior_val = (threadIdx.x > 0) ? s_thread_counts[threadIdx.x-1] : 0; + __syncthreads(); + + return pair { prior_val, val }; +} + +__device__ pair grid_scan(const int block_id, + const Index_type inc, + Index_type* block_counts, + Index_type* grid_counts, + unsigned* block_readys) +{ + const bool first_block = (block_id == 0); + const bool last_block = (block_id+1 == gridDim.x); + const bool first_thread = (threadIdx.x == 0); + const bool last_thread = (threadIdx.x+1 == blockDim.x); + + pair count = block_scan(inc); + + if (last_thread) { + if (first_block) { + block_counts[block_id] = count.second; // write inclusive scan result for block + grid_counts[block_id] = count.second; // write inclusive scan result for grid through block + __threadfence(); // ensure block_counts, grid_counts ready (release) + atomicExch(&block_readys[block_id], 2u); // write block_counts, grid_counts are ready + } else { + block_counts[block_id] = count.second; // write inclusive scan result for block + __threadfence(); // ensure block_counts ready (release) + atomicExch(&block_readys[block_id], 1u); // write block_counts is ready + } + } + + if (!first_block) { + // extern __shared__ volatile int s_block_readys[ ]; // reusing shared memory + + // const int num_participating_threads = (block_id <= blockDim.x) ? 
block_id : blockDim.x; + + // int prior_block_ready = 0; + // if (threadIdx.x < num_participating_threads) { + // prior_block_ready = block_readys[block_id-1 - threadIdx.x]; + // } + // s_block_readys[threadIdx.x] = prior_block_ready; + // __syncthreads(); + + __shared__ volatile Index_type s_prev_block_count; + + if (first_thread) { + while (atomicCAS(&block_readys[block_id-1], 11u, 11u) != 2u); // check if block_counts is ready + __threadfence(); // ensure block_counts ready (acquire) + s_prev_block_count = grid_counts[block_id-1]; + } + __syncthreads(); + + Index_type prev_block_count = s_prev_block_count; + + count.first = prev_block_count + count.first; + count.second = prev_block_count + count.second; + + if (last_thread) { + grid_counts[block_id] = count.second; // write inclusive scan result for grid through block + __threadfence(); // ensure block_counts ready (release) + atomicExch(&block_readys[block_id], 2u); // write block_counts is ready + } + __syncthreads(); + } + + if (last_block) { + for (int i = threadIdx.x; i < gridDim.x; ++i) { + block_readys[i] = 0u; // last block resets readys to 0 (for next kernel to reuse) + } + } + + return count; +} + +__device__ int get_block_id(unsigned* block_id_inc) +{ + __shared__ volatile unsigned s_block_id; + if (threadIdx.x == 0) { + s_block_id = atomicInc(block_id_inc, gridDim.x-1); + } + __syncthreads(); + unsigned block_id = s_block_id; + __syncthreads(); + return static_cast(block_id); +} + +__global__ void indexlist(Real_ptr x, + Int_ptr list, + Index_type* block_counts, + Index_type* grid_counts, + unsigned* block_readys, + unsigned* block_id_inc, + Index_type* len, + Index_type iend) +{ + const int block_id = get_block_id(block_id_inc); + + Index_type i = block_id * blockDim.x + threadIdx.x; + Index_type inc = 0; + if (i < iend) { + if (INDEXLIST_CONDITIONAL) { + inc = 1; + } + } + + pair count = grid_scan(block_id, inc, block_counts, grid_counts, block_readys); + + if (i < iend) { + if (count.first != count.second) { + list[count.first] = i; + } + if (i == iend-1) { + *len = count.second; + } + } +} + void INDEXLIST::runCudaVariant(VariantID vid) { - { + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + INDEXLIST_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + INDEXLIST_DATA_SETUP_CUDA; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT((iend-ibegin), block_size); + + Index_type* len; + allocCudaPinnedData(len, 1); + Index_type* block_counts; + allocCudaDeviceData(block_counts, grid_size); + Index_type* grid_counts; + allocCudaDeviceData(grid_counts, grid_size); + unsigned* block_readys; + allocCudaDeviceData(block_readys, grid_size); + cudaErrchk( cudaMemset(block_readys, 0, sizeof(unsigned)*grid_size) ); + unsigned* block_id_inc; + allocCudaDeviceData(block_id_inc, grid_size); + cudaErrchk( cudaMemset(block_id_inc, 0, sizeof(unsigned)) ); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + indexlist<<>>( + x+ibegin, list+ibegin, + block_counts, grid_counts, block_readys, block_id_inc, + len, iend-ibegin ); + cudaErrchk( cudaGetLastError() ); + + cudaErrchk( cudaDeviceSynchronize() ); + m_len = *len; + + } + stopTimer(); + + deallocCudaPinnedData(len); + deallocCudaDeviceData(block_counts); + deallocCudaDeviceData(grid_counts); + deallocCudaDeviceData(block_readys); + deallocCudaDeviceData(block_id_inc); + + INDEXLIST_DATA_TEARDOWN_CUDA; + + } else { std::cout << "\n INDEXLIST : Unknown variant id = " << vid << std::endl; } } diff 
--git a/src/basic/INDEXLIST.cpp b/src/basic/INDEXLIST.cpp index e5e6ebc60..ac04deb64 100644 --- a/src/basic/INDEXLIST.cpp +++ b/src/basic/INDEXLIST.cpp @@ -44,6 +44,8 @@ INDEXLIST::INDEXLIST(const RunParams& params) setVariantDefined( Base_OpenMP ); setVariantDefined( Lambda_OpenMP ); #endif + + setVariantDefined( Base_CUDA ); } INDEXLIST::~INDEXLIST() From 3a15abb2ebb85554e0c15e1ca19467aaf8c53cef Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Fri, 9 Jul 2021 08:44:27 -0700 Subject: [PATCH 024/392] adding initial yml and .sh scripts for CI --- .gitlab-ci.yml | 113 ++++++++++++++++++++++++++ .gitlab/corona-jobs.yml | 21 +++++ .gitlab/corona-templates.yml | 32 ++++++++ .gitlab/lassen-jobs.yml | 85 ++++++++++++++++++++ .gitlab/lassen-templates.yml | 29 +++++++ .gitlab/ruby-jobs.yml | 48 +++++++++++ .gitlab/ruby-templates.yml | 54 +++++++++++++ scripts/gitlab/build_and_test.sh | 131 +++++++++++++++++++++++++++++++ 8 files changed, 513 insertions(+) create mode 100644 .gitlab-ci.yml create mode 100644 .gitlab/corona-jobs.yml create mode 100644 .gitlab/corona-templates.yml create mode 100644 .gitlab/lassen-jobs.yml create mode 100644 .gitlab/lassen-templates.yml create mode 100644 .gitlab/ruby-jobs.yml create mode 100644 .gitlab/ruby-templates.yml create mode 100755 scripts/gitlab/build_and_test.sh diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 000000000..4794d752e --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,113 @@ +############################################################################## +# Copyright (c) 2016-2020, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +############################################################################### +# General GitLab pipelines configurations for supercomputers and Linux clusters +# at Lawrence Livermore National Laboratory (LLNL). +# +# This entire pipeline is LLNL-specific +############################################################################## + +# We define the following GitLab pipeline variables: +# +# GIT_SUBMODULE_STRATEGY: +# Tells Gitlab to recursively update the submodules when cloning umpire + +# ALLOC_NAME: +# On LLNL's ruby, this pipeline creates only one allocation shared among jobs +# in order to save time and resources. This allocation has to be uniquely named +# so that we are sure to retrieve it. + +# BUILD_ROOT: +# The path to the shared resources between all jobs. The BUILD_ROOT is unique to +# the pipeline, preventing any form of concurrency with other pipelines. This +# also means that the BUILD_ROOT directory will never be cleaned. + +# DEFAULT_TIME: +# Default time to let the Lassen jobs run will be 30 minutes. However, if it is +# a job that requires more time, it will be overwritten in the lassen template +# file. +# TODO: add a clean-up mechanism + +variables: + GIT_SUBMODULE_STRATEGY: recursive + ALLOC_NAME: ${CI_PROJECT_NAME}_ci_${CI_PIPELINE_ID} + BUILD_ROOT: ${CI_PROJECT_DIR} + DEFAULT_TIME: 30 + +# Normally, stages are blocking in Gitlab. However, using the keyword "needs" we +# can express dependencies between job that break the ordering of stages, in +# favor of a DAG. +# In practice r_*, l_* and b_* stages are independently run and start immediately. 
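# As a minimal sketch of the DAG behavior described above (the job name and
# SPEC value here are placeholders, not jobs defined in this pipeline), a job
# can opt out of stage ordering by declaring an empty "needs" list, so the
# scheduler may start it as soon as the pipeline begins instead of waiting for
# earlier stages to finish:
#
#   example_lassen_job:
#     stage: l_build_and_test
#     variables:
#       SPEC: "%gcc@8.3.1"
#     needs: []        # no upstream jobs, so this job is not blocked by prior stages
#     extends: .build_and_test_on_lassen
#
# The corona template below uses the same "needs: []" idiom for its build jobs.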
+ +stages: + - r_allocate_resources + - r_build_and_test + - r_release_resources + - l_build_and_test + - b_build_and_test + - c_build_and_test + - multi_project + +# This is the rules that drives the activation of "advanced" jobs. All advanced +# jobs will share this through a template mechanism. +.advanced_pipeline: + rules: + - if: '$CI_COMMIT_BRANCH == "main" || $CI_COMMIT_BRANCH == "develop" || $ALL_TARGETS == "ON"' #run only if ... + +# These are also templates (.name) that define project specific build commands. +# If an allocation exist with the name defined in this pipeline, the job will +# use it (slurm specific). +.build_toss_3_x86_64_ib_script: + script: + - echo ${ALLOC_NAME} + - export JOBID=$(squeue -h --name=${ALLOC_NAME} --format=%A) + - echo ${JOBID} + - srun $( [[ -n "${JOBID}" ]] && echo "--jobid=${JOBID}" ) -t ${DEFAULT_TIME} -N 1 scripts/gitlab/build_and_test.sh + artifacts: + reports: + junit: junit.xml + +.build_toss_3_x86_64_ib_corona_script: + script: + - srun -p mi60 -t 30 -N 1 scripts/gitlab/build_and_test.sh + +# Lassen and Butte use a different job scheduler (spectrum lsf) that does not +# allow pre-allocation the same way slurm does. +.build_blueos_3_ppc64le_ib_script: + script: + - lalloc 1 -W ${DEFAULT_TIME} scripts/gitlab/build_and_test.sh + artifacts: + reports: + junit: junit.xml + +.build_blueos_3_ppc64le_ib_p9_script: + extends: .build_blueos_3_ppc64le_ib_script + +# If testing develop branch, trigger CHAI pipeline with this version of RAJA. +# TODO: Once spack allows to clone a specific commit on demand, then point to the exact commit. +# This will prevent from sticking to a branch (here develop). +# To turn back on chai trigger, add '$CI_COMMIT_BRANCH == "develop" to rule. +trigger-chai: + stage: multi_project + rules: + - if: '$MULTI_PROJECT == "ON"' #run only if ... + variables: + UPDATE_RAJA: develop + trigger: + project: radiuss/chai + branch: develop + strategy: depend + +# This is where jobs are included. +include: + - local: .gitlab/ruby-templates.yml + - local: .gitlab/ruby-jobs.yml + - local: .gitlab/lassen-templates.yml + - local: .gitlab/lassen-jobs.yml + - local: .gitlab/corona-templates.yml + - local: .gitlab/corona-jobs.yml diff --git a/.gitlab/corona-jobs.yml b/.gitlab/corona-jobs.yml new file mode 100644 index 000000000..2b6167389 --- /dev/null +++ b/.gitlab/corona-jobs.yml @@ -0,0 +1,21 @@ +############################################################################# +## Copyright (c) 2016-21, Lawrence Livermore National Security, LLC +## and RAJA project contributors. See the RAJA/COPYRIGHT file for details. 
+## +## SPDX-License-Identifier: (BSD-3-Clause) +############################################################################## + +hip_4_0_gcc_8_1_0 (build and test on corona): + variables: + SPEC: "+hip~openmp %gcc@8.1.0 cxxflags='-finline-functions -finline-limit=20000' cflags='-finline-functions -finline-limit=20000' ^hip@4.0.0" + extends: .build_and_test_on_corona + +hip_4_1_gcc_8_1_0 (build and test on corona): + variables: + SPEC: "+hip~openmp %gcc@8.1.0 cxxflags='-finline-functions -finline-limit=20000' cflags='-finline-functions -finline-limit=20000' ^hip@4.1.0" + extends: .build_and_test_on_corona + +hip_4_1_clang_9_0_0 (build and test on corona): + variables: + SPEC: "+hip~openmp %clang@9.0.0 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 ^hip@4.1.0" + extends: .build_and_test_on_corona diff --git a/.gitlab/corona-templates.yml b/.gitlab/corona-templates.yml new file mode 100644 index 000000000..9a6405724 --- /dev/null +++ b/.gitlab/corona-templates.yml @@ -0,0 +1,32 @@ +############################################################################# +## Copyright (c) 2016-21, Lawrence Livermore National Security, LLC +## and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +## +## SPDX-License-Identifier: (BSD-3-Clause) +############################################################################## + +### +# This is the share configuration of jobs for corona + +#### +# In pre-build phase, allocate a node for builds +.on_corona: + tags: + - shell + - corona + rules: + - if: '$ON_CORONA == "OFF"' #run except if ... + when: never + - if: '$CI_JOB_NAME =~ /release_resources/' + when: always + - when: on_success + +### +# Generic corona build job, extending build script +.build_and_test_on_corona: + stage: c_build_and_test + extends: [.build_toss_3_x86_64_ib_corona_script, .on_corona] + needs: [] + +.build_and_test_on_corona_advanced: + extends: [.build_and_test_on_corona, .advanced_pipeline] diff --git a/.gitlab/lassen-jobs.yml b/.gitlab/lassen-jobs.yml new file mode 100644 index 000000000..95e20a891 --- /dev/null +++ b/.gitlab/lassen-jobs.yml @@ -0,0 +1,85 @@ +############################################################################## +## Copyright (c) 2016-21, Lawrence Livermore National Security, LLC +## and RAJA project contributors. See the RAJA/COPYRIGHT file for details. 
+## +## SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +########### +## CPU ONLY +########### + +ibm_clang_9: + variables: + SPEC: "%clang@9.0.0ibm" + extends: .build_and_test_on_lassen + +ibm_clang_9_gcc_8: + variables: + SPEC: "%clang@9.0.0ibm cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1" + extends: .build_and_test_on_lassen + +gcc_8_3_1: + variables: + SPEC: "%gcc@8.3.1 cxxflags='-finline-functions -finline-limit=20000' cflags='-finline-functions -finline-limit=20000'" + extends: .build_and_test_on_lassen + +xl_16_1_1_7: + variables: + SPEC: "%xl@16.1.1.7 cxxflags='-qthreaded -std=c++14 -O3 -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036'" + DEFAULT_TIME: 50 + extends: .build_and_test_on_lassen + +xl_16_1_1_7_gcc_8_3_1: + variables: + SPEC: "%xl@16.1.1.7 cxxflags='--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O3 -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036' cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1" + DEFAULT_TIME: 50 + extends: .build_and_test_on_lassen + +########## +## CUDA +########### + +ibm_clang_9_cuda: + variables: + SPEC: "+cuda+allow-untested-versions cuda_arch=70 %clang@9.0.0ibm ^cuda@10.1.168" + extends: .build_and_test_on_lassen + +ibm_clang_9_gcc_8_cuda: + variables: + SPEC: "+cuda %clang@9.0.0ibm cuda_arch=70 +allow-untested-versions cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 ^cuda@10.1.168" + extends: .build_and_test_on_lassen + +gcc_8_3_1_cuda: + variables: + SPEC: "+cuda %gcc@8.3.1 cuda_arch=70 ^cuda@10.1.168" + extends: .build_and_test_on_lassen + +xl_16_1_1_7_cuda: + variables: + SPEC: "+cuda %xl@16.1.1.7 cuda_arch=70 ^cuda@10.1.168 ^cmake@3.14.5" + DEFAULT_TIME: 60 + allow_failure: true + extends: .build_and_test_on_lassen + +xl_16_1_1_7_gcc_8_3_1_cuda_11: + variables: + SPEC: "+cuda %xl@16.1.1.7 cuda_arch=70 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 ^cuda@11.0.2 ^cmake@3.14.5" + DEFAULT_TIME: 60 + allow_failure: true + extends: .build_and_test_on_lassen + +########## +## EXTRAS +########### + +clang_9_0_0_libcpp (build and test on lassen): + variables: + SPEC: "%clang@9.0.0+libcpp" + extends: .build_and_test_on_lassen + +clang_9_0_0_memleak (build and test on lassen): + variables: + SPEC: "%clang@9.0.0 cxxflags=-fsanitize=address" + ASAN_OPTIONS: "detect_leaks=1" + extends: .build_and_test_on_lassen diff --git a/.gitlab/lassen-templates.yml b/.gitlab/lassen-templates.yml new file mode 100644 index 000000000..aa3027b48 --- /dev/null +++ b/.gitlab/lassen-templates.yml @@ -0,0 +1,29 @@ +############################################################################## +## Copyright (c) 2016-21, Lawrence Livermore National Security, LLC +## and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +## +## SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +#### +# Shared configuration of jobs for lassen +.on_lassen: + variables: + tags: + - shell + - lassen + rules: + - if: '$CI_COMMIT_BRANCH =~ /_lnone/ || $ON_LASSEN == "OFF"' #run except if ... 
+ when: never + - when: on_success + +.build_and_test_on_lassen: + stage: l_build_and_test + extends: [.build_blueos_3_ppc64le_ib_p9_script, .on_lassen] + needs: [] + +# Note: .build_and_test_on_lassen_advanced inherits from +# .build_and_test_on_lassen and .advanced_pileline. +# In particular, the rules section will be merged. Careful when changing rules. +.build_and_test_on_lassen_advanced: + extends: [.build_and_test_on_lassen, .advanced_pipeline] diff --git a/.gitlab/ruby-jobs.yml b/.gitlab/ruby-jobs.yml new file mode 100644 index 000000000..2109043d4 --- /dev/null +++ b/.gitlab/ruby-jobs.yml @@ -0,0 +1,48 @@ +############################################################################## +## Copyright (c) 2016-21, Lawrence Livermore National Security, LLC +## and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +## +## SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +clang_10: + variables: + SPEC: "%clang@10.0.1" + extends: .build_and_test_on_ruby + +clang_9: + variables: + SPEC: "%clang@9.0.0" + extends: .build_and_test_on_ruby + +gcc_8_1_0: + variables: + SPEC: "%gcc@8.1.0" + DEFAULT_TIME: 60 + extends: .build_and_test_on_ruby + +icpc_17_0_2: + variables: + SPEC: "%intel@17.0.2" + DEFAULT_TIME: 40 + extends: .build_and_test_on_ruby + +icpc_18_0_2: + variables: + SPEC: " tests=none %intel@18.0.2" + DEFAULT_TIME: 40 + extends: .build_and_test_on_ruby + +icpc_19_1_0: + variables: + SPEC: "%intel@19.1.0" + DEFAULT_TIME: 40 + extends: .build_and_test_on_ruby + +# EXTRAS + +gcc_4_9_3: + variables: + SPEC: "%gcc@4.9.3" + DEFAULT_TIME: 60 + extends: .build_and_test_on_ruby diff --git a/.gitlab/ruby-templates.yml b/.gitlab/ruby-templates.yml new file mode 100644 index 000000000..ae4079c67 --- /dev/null +++ b/.gitlab/ruby-templates.yml @@ -0,0 +1,54 @@ +############################################################################## +## Copyright (c) 2016-21, Lawrence Livermore National Security, LLC +## and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +## +## SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +#### +# This is the shared configuration of jobs for ruby + +#### +# In pre-build phase, allocate a node for builds +.on_ruby: + tags: + - shell + - ruby + rules: + - if: '$CI_COMMIT_BRANCH =~ /_qnone/ || $ON_RUBY == "OFF"' #run except if ... 
+ when: never + - if: '$CI_JOB_NAME =~ /release_resources/' + when: always + - when: on_success + +### +# In pre-build phase, allocate a node for builds +# NOTE: Not specifying 'salloc -c 56' should allocate the max number of CPU cores +allocate_resources (on ruby): + variables: + GIT_STRATEGY: none + extends: .on_ruby + stage: r_allocate_resources + script: + - salloc -N 1 -p pdebug -t 45 --no-shell --job-name=${ALLOC_NAME} + +### +# In post-build phase, deallocate resources +# Note : make sure this is run even on build phase failure +release_resources (on ruby): + variables: + GIT_STRATEGY: none + extends: .on_ruby + stage: r_release_resources + script: + - export JOBID=$(squeue -h --name=${ALLOC_NAME} --format=%A) + - ([[ -n "${JOBID}" ]] && scancel ${JOBID}) + +### +# Generic ruby build job, extending build script +.build_and_test_on_ruby: + extends: [.build_toss_3_x86_64_ib_script, .on_ruby] + stage: r_build_and_test + +.build_and_test_on_ruby_advanced: + extends: [.build_and_test_on_ruby, .advanced_pipeline] diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh new file mode 100755 index 000000000..b880ffa97 --- /dev/null +++ b/scripts/gitlab/build_and_test.sh @@ -0,0 +1,131 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2016-21, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +set -o errexit +set -o nounset + +option=${1:-""} +hostname="$(hostname)" +truehostname=${hostname//[0-9]/} +project_dir="$(pwd)" + +build_root=${BUILD_ROOT:-""} +hostconfig=${HOST_CONFIG:-""} +spec=${SPEC:-""} +job_unique_id=${CI_JOB_ID:-""} + +sys_type=${SYS_TYPE:-""} +py_env_path=${PYTHON_ENVIRONMENT_PATH:-""} + +# Dependencies +date +if [[ "${option}" != "--build-only" && "${option}" != "--test-only" ]] +then + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~~~~~ Building Dependencies" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + + if [[ -z ${spec} ]] + then + echo "SPEC is undefined, aborting..." + exit 1 + fi + + prefix_opt="" + + if [[ -d /dev/shm ]] + then + prefix="/dev/shm/${hostname}" + if [[ -z ${job_unique_id} ]]; then + job_unique_id=manual_job_$(date +%s) + while [[ -d ${prefix}/${job_unique_id} ]] ; do + sleep 1 + job_unique_id=manual_job_$(date +%s) + done + fi + + prefix="${prefix}/${job_unique_id}" + mkdir -p ${prefix} + prefix_opt="--prefix=${prefix}" + fi + + python scripts/uberenv/uberenv.py --spec="${spec}" ${prefix_opt} + +fi +date + +# Host config file +if [[ -z ${hostconfig} ]] +then + # If no host config file was provided, we assume it was generated. + # This means we are looking of a unique one in project dir. + hostconfigs=( $( ls "${project_dir}/"hc-*.cmake ) ) + if [[ ${#hostconfigs[@]} == 1 ]] + then + hostconfig_path=${hostconfigs[0]} + echo "Found host config file: ${hostconfig_path}" + elif [[ ${#hostconfigs[@]} == 0 ]] + then + echo "No result for: ${project_dir}/hc-*.cmake" + echo "Spack generated host-config not found." + exit 1 + else + echo "More than one result for: ${project_dir}/hc-*.cmake" + echo "${hostconfigs[@]}" + echo "Please specify one with HOST_CONFIG variable" + exit 1 + fi +else + # Using provided host-config file. 
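+    # For example (hypothetical file name), a pre-generated host-config can be
+    # selected when invoking this script:
+    #   HOST_CONFIG=hc-ruby-gcc-8.1.0.cmake scripts/gitlab/build_and_test.sh
+    # The assignment below assumes that file lives under host-configs/ in the
+    # project directory.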
+ hostconfig_path="${project_dir}/host-configs/${hostconfig}" +fi + +# Build Directory +if [[ -z ${build_root} ]] +then + build_root=$(pwd) +fi + +build_dir="${build_root}/build_${hostconfig//.cmake/}" + +# Build +if [[ "${option}" != "--deps-only" && "${option}" != "--test-only" ]] +then + date + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~ Host-config: ${hostconfig_path}" + echo "~ Build Dir: ${build_dir}" + echo "~ Project Dir: ${project_dir}" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~~~~ ENV ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~~~~~ Building RAJA PerfSuite" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + + # Map CPU core allocations + declare -A core_counts=(["lassen"]=40 ["ruby"]=28 ["corona"]=32) + + # If building, then delete everything first + # NOTE: 'cmake --build . -j core_counts' attempts to reduce individual build resources. + # If core_counts does not contain hostname, then will default to '-j ', which should + # use max cores. + rm -rf ${build_dir} 2>/dev/null + mkdir -p ${build_dir} && cd ${build_dir} + + date + cmake \ + -C ${hostconfig_path} \ + ${project_dir} + cmake --build . -j ${core_counts[$truehostname]} + date +fi From b507e7b2f90867ee07fa6251448da7ba99f86bb9 Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Fri, 9 Jul 2021 12:38:48 -0700 Subject: [PATCH 025/392] adding the uberenv and radiuss stuff --- .gitmodules | 3 + scripts/radiuss-spack-configs/COPYRIGHT | 16 + scripts/radiuss-spack-configs/LICENSE | 21 + scripts/radiuss-spack-configs/NOTICE | 21 + scripts/radiuss-spack-configs/README.md | 47 + .../blueos_3_ppc64le_ib/compilers.yaml | 209 +++++ .../blueos_3_ppc64le_ib/packages.yaml | 55 ++ .../blueos_3_ppc64le_ib_p9 | 1 + scripts/radiuss-spack-configs/config.yaml | 80 ++ .../darwin/compilers.yaml | 65 ++ .../darwin/packages.yaml | 25 + .../toss_3_x86_64_ib/compilers.yaml | 290 +++++++ .../toss_3_x86_64_ib/packages.yaml | 90 ++ scripts/uberenv/LICENSE | 64 ++ scripts/uberenv/Makefile | 6 + scripts/uberenv/README.md | 19 + scripts/uberenv/docs/sphinx/conf.py | 324 +++++++ scripts/uberenv/docs/sphinx/index.rst | 194 +++++ scripts/uberenv/gen_spack_env_script.py | 128 +++ scripts/uberenv/packages/chai/package.py | 243 ++++++ scripts/uberenv/packages/hip/package.py | 54 ++ .../packages/raja_perfsuite/package.py | 338 ++++++++ scripts/uberenv/project.json | 10 + scripts/uberenv/spack_configs | 1 + scripts/uberenv/uberenv.py | 800 ++++++++++++++++++ 25 files changed, 3104 insertions(+) create mode 100644 scripts/radiuss-spack-configs/COPYRIGHT create mode 100644 scripts/radiuss-spack-configs/LICENSE create mode 100644 scripts/radiuss-spack-configs/NOTICE create mode 100644 scripts/radiuss-spack-configs/README.md create mode 100644 scripts/radiuss-spack-configs/blueos_3_ppc64le_ib/compilers.yaml create mode 100644 scripts/radiuss-spack-configs/blueos_3_ppc64le_ib/packages.yaml create mode 120000 scripts/radiuss-spack-configs/blueos_3_ppc64le_ib_p9 create mode 100644 scripts/radiuss-spack-configs/config.yaml create mode 100644 scripts/radiuss-spack-configs/darwin/compilers.yaml create mode 100644 scripts/radiuss-spack-configs/darwin/packages.yaml create mode 100644 scripts/radiuss-spack-configs/toss_3_x86_64_ib/compilers.yaml create mode 100644 scripts/radiuss-spack-configs/toss_3_x86_64_ib/packages.yaml create mode 100644 scripts/uberenv/LICENSE 
create mode 100644 scripts/uberenv/Makefile create mode 100644 scripts/uberenv/README.md create mode 100644 scripts/uberenv/docs/sphinx/conf.py create mode 100644 scripts/uberenv/docs/sphinx/index.rst create mode 100644 scripts/uberenv/gen_spack_env_script.py create mode 100644 scripts/uberenv/packages/chai/package.py create mode 100644 scripts/uberenv/packages/hip/package.py create mode 100644 scripts/uberenv/packages/raja_perfsuite/package.py create mode 100644 scripts/uberenv/project.json create mode 120000 scripts/uberenv/spack_configs create mode 100755 scripts/uberenv/uberenv.py diff --git a/.gitmodules b/.gitmodules index 13f05ecd3..babe9cd39 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,6 @@ [submodule "tpl/RAJA"] path = tpl/RAJA url = https://github.com/LLNL/RAJA.git +[submodule "scripts/radiuss-spack-configs"] + path = scripts/radiuss-spack-configs + url = https://github.com/LLNL/radiuss-spack-configs diff --git a/scripts/radiuss-spack-configs/COPYRIGHT b/scripts/radiuss-spack-configs/COPYRIGHT new file mode 100644 index 000000000..627879f05 --- /dev/null +++ b/scripts/radiuss-spack-configs/COPYRIGHT @@ -0,0 +1,16 @@ +Intellectual Property Notice +------------------------------ + +RADIUSS Spack Config is licensed under the MIT license (LICENSE). + +Copyrights and patents in the RADIUSS Spack Config project are retained by +contributors. No copyright assignment is required to contribute to RADIUSS +Spack Config. + + +SPDX usage +------------ + +Individual files contain SPDX tags instead of the full license text. +This enables machine processing of license information based on the SPDX +License Identifiers that are available here: https://spdx.org/licenses/ diff --git a/scripts/radiuss-spack-configs/LICENSE b/scripts/radiuss-spack-configs/LICENSE new file mode 100644 index 000000000..3af858be7 --- /dev/null +++ b/scripts/radiuss-spack-configs/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018, Lawrence Livermore National Security, LLC + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/scripts/radiuss-spack-configs/NOTICE b/scripts/radiuss-spack-configs/NOTICE new file mode 100644 index 000000000..3737d5a86 --- /dev/null +++ b/scripts/radiuss-spack-configs/NOTICE @@ -0,0 +1,21 @@ +This work was produced under the auspices of the U.S. Department of +Energy by Lawrence Livermore National Laboratory under Contract +DE-AC52-07NA27344. + +This work was prepared as an account of work sponsored by an agency of +the United States Government. 
Neither the United States Government nor +Lawrence Livermore National Security, LLC, nor any of their employees +makes any warranty, expressed or implied, or assumes any legal liability +or responsibility for the accuracy, completeness, or usefulness of any +information, apparatus, product, or process disclosed, or represents that +its use would not infringe privately owned rights. + +Reference herein to any specific commercial product, process, or service +by trade name, trademark, manufacturer, or otherwise does not necessarily +constitute or imply its endorsement, recommendation, or favoring by the +United States Government or Lawrence Livermore National Security, LLC. + +The views and opinions of authors expressed herein do not necessarily +state or reflect those of the United States Government or Lawrence +Livermore National Security, LLC, and shall not be used for advertising +or product endorsement purposes. diff --git a/scripts/radiuss-spack-configs/README.md b/scripts/radiuss-spack-configs/README.md new file mode 100644 index 000000000..82df80706 --- /dev/null +++ b/scripts/radiuss-spack-configs/README.md @@ -0,0 +1,47 @@ +# RADIUSS Spack Config + +The RADIUSS project promotes and supports key High Performance Computing (HPC) open-source software developed at the LLNL. These tools and libraries cover a wide range of features a team would need to develop a modern simulation code targeting HPC plaftorms. + +Radiuss Spack Config allows project to share a set of compilers and packages configurations for several machines. + +## Getting Started + +This project may be used as a submodule. + +### Installing + +This project requires no installation. + +## Contributing + +Please read [CONTRIBUTING.md](https://github.com/LLNL/radiuss-ci/CONTRIBUTING.md) for details on our code of conduct, and the process for submitting pull requests to us. + +## Versioning + +version: 1.0.0 + +TODO: Not even sure how to handle versioning here. + +## Authors + +Adrien M Bernede + +See also the list of [contributors](https://github.com/LLNL/radiuss-ci/contributors) who participated in this project. + +## License + +This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details + +All new contributions must be made under the MIT License. + +See [LICENSE](https://github.com/LLNL/radiuss-ci/blob/master/LICENSE), +[COPYRIGHT](https://github.com/LLNL/radiuss-ci/blob/master/COPYRIGHT), and +[NOTICE](https://github.com/LLNL/radiuss-ci/blob/master/NOTICE) for details. 
+ +SPDX-License-Identifier: (MIT) + +LLNL-CODE-793462 + +## Acknowledgments + + diff --git a/scripts/radiuss-spack-configs/blueos_3_ppc64le_ib/compilers.yaml b/scripts/radiuss-spack-configs/blueos_3_ppc64le_ib/compilers.yaml new file mode 100644 index 000000000..7b8a2a4e1 --- /dev/null +++ b/scripts/radiuss-spack-configs/blueos_3_ppc64le_ib/compilers.yaml @@ -0,0 +1,209 @@ +compilers: +- compiler: + spec: clang@3.9.1 + paths: + cc: /usr/tcetmp/packages/clang/clang-3.9.1/bin/clang + cxx: /usr/tcetmp/packages/clang/clang-3.9.1/bin/clang++ + f77: /usr/tce/packages/xl/xl-beta-2019.06.20/bin/xlf2003_r + fc: /usr/tce/packages/xl/xl-beta-2019.06.20/bin/xlf2003_r + flags: {} + operating_system: rhel7 + target: ppc64le + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: clang@4.0.0 + paths: + cc: /usr/tcetmp/packages/clang/clang-4.0.0/bin/clang + cxx: /usr/tcetmp/packages/clang/clang-4.0.0/bin/clang++ + f77: /usr/tce/packages/xl/xl-beta-2019.06.20/bin/xlf2003_r + fc: /usr/tce/packages/xl/xl-beta-2019.06.20/bin/xlf2003_r + flags: {} + operating_system: rhel7 + target: ppc64le + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: clang@9.0.0 + paths: + cc: /usr/tce/packages/clang/clang-9.0.0/bin/clang + cxx: /usr/tce/packages/clang/clang-9.0.0/bin/clang++ + f77: /usr/tce/packages/xl/xl-beta-2019.06.20/bin/xlf2003_r + fc: /usr/tce/packages/xl/xl-beta-2019.06.20/bin/xlf2003_r + flags: {} + operating_system: rhel7 + target: ppc64le + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: clang@9.0.0ibm + paths: + cc: /usr/tce/packages/clang/clang-ibm-2019.10.03/bin/clang + cxx: /usr/tce/packages/clang/clang-ibm-2019.10.03/bin/clang++ + fc: /usr/tce/packages/xl/xl-2020.03.18/bin/xlf2003_r + f77: /usr/tce/packages/xl/xl-2020.03.18/bin/xlf_r + flags: {} + operating_system: rhel7 + target: ppc64le + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: clang@10.0.1ibm + paths: + cc: /usr/tce/packages/clang/clang-ibm-10.0.1/bin/clang + cxx: /usr/tce/packages/clang/clang-ibm-10.0.1/bin/clang++ + fc: /usr/tce/packages/xl/xl-2020.09.17/bin/xlf2003_r + f77: /usr/tce/packages/xl/xl-2020.09.17/bin/xlf_r + flags: {} + operating_system: rhel7 + target: ppc64le + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: clang@coral2018.08.08 + paths: + cc: /usr/tce/packages/clang/clang-coral-2018.08.08/bin/clang + cxx: /usr/tce/packages/clang/clang-coral-2018.08.08/bin/clang++ + f77: /usr/tce/packages/xl/xl-beta-2019.06.20/bin/xlf2003_r + fc: /usr/tce/packages/xl/xl-beta-2019.06.20/bin/xlf2003_r + flags: {} + operating_system: rhel7 + target: ppc64le + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: clang@default + paths: + cc: clang + cxx: clang++ + f77: /usr/tce/packages/xl/xl-beta-2019.06.20/bin/xlf2003_r + fc: /usr/tce/packages/xl/xl-beta-2019.06.20/bin/xlf2003_r + flags: {} + operating_system: rhel7 + target: ppc64le + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: gcc@8.3.1 + paths: + cc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gcc + cxx: /usr/tce/packages/gcc/gcc-8.3.1/bin/g++ + f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + flags: {} + operating_system: rhel7 + target: ppc64le + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: gcc@4.9.3 + paths: + cc: /usr/tce/packages/gcc/gcc-4.9.3/bin/gcc + cxx: /usr/tce/packages/gcc/gcc-4.9.3/bin/g++ + f77: /usr/tce/packages/gcc/gcc-4.9.3/bin/gfortran + fc: 
/usr/tce/packages/gcc/gcc-4.9.3/bin/gfortran + flags: {} + operating_system: rhel7 + target: ppc64le + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: gcc@default + paths: + cc: gcc + cxx: g++ + f77: gfortran + fc: gfortran + flags: {} + operating_system: rhel7 + target: ppc64le + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: xl@default + paths: + cc: xlc + cxx: xlc++ + f77: xlf2003 + fc: xlf2003 + flags: {} + operating_system: rhel7 + target: ppc64le + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: xl@beta2019.06.20 + paths: + cc: /usr/tce/packages/xl/xl-beta-2019.06.20/bin/xlc + cxx: /usr/tce/packages/xl/xl-beta-2019.06.20/bin/xlc++ + f77: /usr/tce/packages/xl/xl-beta-2019.06.20/bin/xlf2003_r + fc: /usr/tce/packages/xl/xl-beta-2019.06.20/bin/xlf2003_r + flags: {} + operating_system: rhel7 + target: ppc64le + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: xl@16.1.1.7 + paths: + cc: /usr/tce/packages/xl/xl-2020.03.18/bin/xlc_r + cxx: /usr/tce/packages/xl/xl-2020.03.18/bin/xlC_r + fc: /usr/tce/packages/xl/xl-2020.03.18/bin/xlf2003_r + f77: /usr/tce/packages/xl/xl-2020.03.18/bin/xlf_r + flags: {} + operating_system: rhel7 + target: ppc64le + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: pgi@default + paths: + cc: pgcc + cxx: pgc++ + f77: pgfortran + fc: pgfortran + flags: {} + operating_system: rhel7 + target: ppc64le + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: pgi@19.10 + paths: + cc: /usr/tce/packages/pgi/pgi-19.10/bin/pgcc + cxx: /usr/tce/packages/pgi/pgi-19.10/bin/pgc++ + f77: /usr/tce/packages/pgi/pgi-19.10/bin/pgfortran + fc: /usr/tce/packages/pgi/pgi-19.10/bin/pgfortran + flags: {} + operating_system: rhel7 + target: ppc64le + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: pgi@20.4 + paths: + cc: /usr/tce/packages/pgi/pgi-20.4/bin/pgcc + cxx: /usr/tce/packages/pgi/pgi-20.4/bin/pgc++ + f77: /usr/tce/packages/pgi/pgi-20.4/bin/pgfortran + fc: /usr/tce/packages/pgi/pgi-20.4/bin/pgf90 + flags: {} + operating_system: rhel7 + target: ppc64le + modules: [] + environment: {} + extra_rpaths: [] diff --git a/scripts/radiuss-spack-configs/blueos_3_ppc64le_ib/packages.yaml b/scripts/radiuss-spack-configs/blueos_3_ppc64le_ib/packages.yaml new file mode 100644 index 000000000..1fe54dcfd --- /dev/null +++ b/scripts/radiuss-spack-configs/blueos_3_ppc64le_ib/packages.yaml @@ -0,0 +1,55 @@ +packages: + all: + # This defaults us to machine specific flags of ivybridge which allows + # us to run on broadwell as well + target: [ppc64le] + compiler: [gcc, pgi, clang, xl] + cmake: + version: [3.18.0, 3.14.5] + buildable: false + externals: + - spec: cmake@3.14.5 + prefix: /usr/tce/packages/cmake/cmake-3.14.5 + - spec: cmake@3.18.0 + prefix: /usr/tce/packages/cmake/cmake-3.18.0 + cuda: + version: [11.0.2, 10.1.243, 10.1.168, 9.2.148, 8.0] + buildable: false + externals: + - spec: cuda@11.0.2 + prefix: /usr/tce/packages/cuda/cuda-11.0.2 + - spec: cuda@10.1.243 + prefix: /usr/tce/packages/cuda/cuda-10.1.243 + - spec: cuda@10.1.168 + prefix: /usr/tce/packages/cuda/cuda-10.1.168 + - spec: cuda@9.2.148 + prefix: /usr/tce/packages/cuda/cuda-9.2.148 + - spec: cuda@8.0 + prefix: /usr/tce/packages/cuda/cuda-8.0 + spectrum-mpi: + externals: + - spec: spectrum-mpi@10.3.1.03rtm0%pgi@19.10 + prefix: /usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-pgi-19.10 + - spec: spectrum-mpi@10.3.1.03rtm0%pgi@20.4 + prefix: 
/usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-pgi-20.4 + - spec: spectrum-mpi@10.3.1.03rtm0%gcc@8.3.1 + prefix: /usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-gcc-8.3.1 + - spec: spectrum-mpi@10.3.1.03rtm0%gcc@4.9.3 + prefix: /usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-gcc-4.9.3 + - spec: spectrum-mpi@10.3.1.03rtm0%clang@9.0.0 + prefix: /usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-clang-9.0.0 + - spec: spectrum-mpi@10.3.1.03rtm0%clang@9.0.0ibm + prefix: /usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-clang-ibm-2019.10.03 + - spec: spectrum-mpi@10.3.1.03rtm0%clang@10.0.1ibm + prefix: /usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-clang-ibm-10.0.1 + - spec: spectrum-mpi@10.3.1.03rtm0%xl@16.1.1.7 + prefix: /usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-xl-2020.03.18 + - spec: spectrum-mpi@10.3.1.03rtm0%xl@beta2019.06.20 + prefix: /usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-xl-beta-2019.06.20 + buildable: false + python: + buildable: false + version: [3.8.2] + externals: + - spec: python@3.8.2 + prefix: /usr/tce/packages/python/python-3.8.2 diff --git a/scripts/radiuss-spack-configs/blueos_3_ppc64le_ib_p9 b/scripts/radiuss-spack-configs/blueos_3_ppc64le_ib_p9 new file mode 120000 index 000000000..f06fef9d5 --- /dev/null +++ b/scripts/radiuss-spack-configs/blueos_3_ppc64le_ib_p9 @@ -0,0 +1 @@ +blueos_3_ppc64le_ib \ No newline at end of file diff --git a/scripts/radiuss-spack-configs/config.yaml b/scripts/radiuss-spack-configs/config.yaml new file mode 100644 index 000000000..2095112ff --- /dev/null +++ b/scripts/radiuss-spack-configs/config.yaml @@ -0,0 +1,80 @@ +# ------------------------------------------------------------------------- +# This is the default spack configuration file. +# +# Settings here are versioned with Spack and are intended to provide +# sensible defaults out of the box. Spack maintainers should edit this +# file to keep it current. +# +# Users can override these settings by editing the following files. +# +# Per-spack-instance settings (overrides defaults): +# $SPACK_ROOT/etc/spack/config.yaml +# +# Per-user settings (overrides default and site settings): +# ~/.spack/config.yaml +# ------------------------------------------------------------------------- +config: + # This is the path to the root of the Spack install tree. + # You can use $spack here to refer to the root of the spack instance. + install_tree: $spack/.. + + # install directory layout + install_path_scheme: "${COMPILERNAME}-${COMPILERVER}/${PACKAGE}-${VERSION}" + +# Locations where templates should be found + template_dirs: + - $spack/templates + + # Locations where different types of modules should be installed. + module_roots: + tcl: $spack/share/spack/modules + lmod: $spack/share/spack/lmod + + + # Temporary locations Spack can try to use for builds. + # + # Spack will use the first one it finds that exists and is writable. + # You can use $tempdir to refer to the system default temp directory + # (as returned by tempfile.gettempdir()). + # + # A value of $spack/var/spack/stage indicates that Spack should run + # builds directly inside its install directory without staging them in + # temporary space. + # + # The build stage can be purged with `spack purge --stage`. + build_stage: + # skipping tempdir b/c running mpi tests fails with local fs + # - $tempdir + - $spack/../builds + + + # Cache directory already downloaded source tarballs and archived + # repositories. 
This can be purged with `spack purge --downloads`. + source_cache: $spack/var/spack/cache + + + # Cache directory for miscellaneous files, like the package index. + # This can be purged with `spack purge --misc-cache` + misc_cache: .spack/misccache + + + # If this is false, tools like curl that use SSL will not verify + # certifiates. (e.g., curl will use use the -k option) + verify_ssl: true + + + # If set to true, Spack will always check checksums after downloading + # archives. If false, Spack skips the checksum step. + checksum: true + + + # If set to true, `spack install` and friends will NOT clean + # potentially harmful variables from the build environment. Use wisely. + dirty: false + + + # The default number of jobs to use when running `make` in parallel. + # If set to 4, for example, `spack install` will run `make -j4`. + # If not set, all available cores are used by default. + # for uberenv, limit build_jobs to 8 + build_jobs: 8 diff --git a/scripts/radiuss-spack-configs/darwin/compilers.yaml b/scripts/radiuss-spack-configs/darwin/compilers.yaml new file mode 100644 index 000000000..ed5cbf020 --- /dev/null +++ b/scripts/radiuss-spack-configs/darwin/compilers.yaml @@ -0,0 +1,65 @@ +compilers: +- compiler: + environment: {} + extra_rpaths: [] + flags: {} + modules: [] + operating_system: elcapitan + paths: + cc: /usr/bin/clang + cxx: /usr/bin/clang++ + f77: /usr/local/bin/gfortran + fc: /usr/local/bin/gfortran + spec: clang@7.3.0-apple +- compiler: + environment: {} + extra_rpaths: [] + flags: {} + modules: [] + operating_system: sierra + paths: + cc: /usr/bin/clang + cxx: /usr/bin/clang++ + f77: /usr/local/bin/gfortran + fc: /usr/local/bin/gfortran + spec: clang@8.0.0-apple + target: x86_64 +- compiler: + environment: {} + extra_rpaths: [] + flags: {} + modules: [] + operating_system: highsierra + paths: + cc: /usr/bin/clang + cxx: /usr/bin/clang++ + f77: /usr/local/bin/gfortran + fc: /usr/local/bin/gfortran + spec: clang@9.0.0-apple + target: x86_64 +- compiler: + environment: {} + extra_rpaths: [] + flags: {} + modules: [] + operating_system: mojave + paths: + cc: /usr/bin/clang + cxx: /usr/bin/clang++ + f77: /usr/local/bin/gfortran + fc: /usr/local/bin/gfortran + spec: clang@10.0.0-apple + target: x86_64 +- compiler: + environment: {} + extra_rpaths: [] + flags: {} + modules: [] + operating_system: mojave + paths: + cc: /usr/local/opt/llvm/bin/clang + cxx: /usr/local/opt/llvm/bin/clang++ + f77: /usr/local/bin/gfortran + fc: /usr/local/bin/gfortran + spec: clang@10.0.0 + target: x86_64 diff --git a/scripts/radiuss-spack-configs/darwin/packages.yaml b/scripts/radiuss-spack-configs/darwin/packages.yaml new file mode 100644 index 000000000..6e965957c --- /dev/null +++ b/scripts/radiuss-spack-configs/darwin/packages.yaml @@ -0,0 +1,25 @@ + +# ------------------------------------------------------------------------- +# This file controls default concretization preferences for Spack. +# +# Settings here are versioned with Spack and are intended to provide +# sensible defaults out of the box. Spack maintainers should edit this +# file to keep it current. +# +# Users can override these settings by editing the following files. 
+# +# Per-spack-instance settings (overrides defaults): +# $SPACK_ROOT/etc/spack/packages.yaml +# +# Per-user settings (overrides default and site settings): +# ~/.spack/packages.yaml +# ------------------------------------------------------------------------- +packages: + all: + compiler: [clang] + +# cmake: +# version: [3.17.2] +# paths: +# cmake@3.17.2: /usr/local/Cellar/cmake/3.17.2 +# buildable: false diff --git a/scripts/radiuss-spack-configs/toss_3_x86_64_ib/compilers.yaml b/scripts/radiuss-spack-configs/toss_3_x86_64_ib/compilers.yaml new file mode 100644 index 000000000..e4edc3329 --- /dev/null +++ b/scripts/radiuss-spack-configs/toss_3_x86_64_ib/compilers.yaml @@ -0,0 +1,290 @@ +compilers: +- compiler: + spec: clang@3.9.1 + paths: + cc: /usr/tce/packages/clang/clang-3.9.1/bin/clang + cxx: /usr/tce/packages/clang/clang-3.9.1/bin/clang++ + f77: /usr/tce/packages/gcc/gcc-4.9.3/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-4.9.3/bin/gfortran + flags: {} + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: clang@4.0.0 + paths: + cc: /usr/tce/packages/clang/clang-4.0.0/bin/clang + cxx: /usr/tce/packages/clang/clang-4.0.0/bin/clang++ + f77: /usr/tce/packages/gcc/gcc-4.9.3/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-4.9.3/bin/gfortran + flags: {} + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: clang@6.0.0 + paths: + cc: /usr/tce/packages/clang/clang-6.0.0/bin/clang + cxx: /usr/tce/packages/clang/clang-6.0.0/bin/clang++ + f77: /usr/tce/packages/gcc/gcc-8.1.0/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-8.1.0/bin/gfortran + flags: {} + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: clang@9.0.0 + paths: + cc: /usr/tce/packages/clang/clang-9.0.0/bin/clang + cxx: /usr/tce/packages/clang/clang-9.0.0/bin/clang++ + f77: /usr/tce/packages/gcc/gcc-8.1.0/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-8.1.0/bin/gfortran + flags: {} + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: clang@10.0.1 + paths: + cc: /usr/tce/packages/clang/clang-10.0.1/bin/clang + cxx: /usr/tce/packages/clang/clang-10.0.1/bin/clang++ + f77: /usr/tce/packages/gcc/gcc-8.1.0/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-8.1.0/bin/gfortran + flags: {} + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: gcc@4.9.3 + paths: + cc: /usr/tce/packages/gcc/gcc-4.9.3/bin/gcc + cxx: /usr/tce/packages/gcc/gcc-4.9.3/bin/g++ + f77: /usr/tce/packages/gcc/gcc-4.9.3/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-4.9.3/bin/gfortran + flags: {} + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: intel@16.0.4 + paths: + cc: /usr/tce/packages/intel/intel-16.0.4/bin/icc + cxx: /usr/tce/packages/intel/intel-16.0.4/bin/icpc + f77: /usr/tce/packages/intel/intel-16.0.4/bin/ifort + fc: /usr/tce/packages/intel/intel-16.0.4/bin/ifort + flags: + cflags: -gcc-name=/usr/tce/packages/gcc/gcc-4.9.3/bin/gcc + cxxflags: -gcc-name=/usr/tce/packages/gcc/gcc-4.9.3/bin/g++ + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: intel@17.0.2 + paths: + cc: /usr/tce/packages/intel/intel-17.0.2/bin/icc + cxx: /usr/tce/packages/intel/intel-17.0.2/bin/icpc + f77: /usr/tce/packages/intel/intel-17.0.2/bin/ifort + fc: 
/usr/tce/packages/intel/intel-17.0.2/bin/ifort + flags: + cflags: -gcc-name=/usr/tce/packages/gcc/gcc-4.9.3/bin/gcc + cxxflags: -gcc-name=/usr/tce/packages/gcc/gcc-4.9.3/bin/g++ + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: intel@18.0.0 + paths: + cc: /usr/tce/packages/intel/intel-18.0.0/bin/icc + cxx: /usr/tce/packages/intel/intel-18.0.0/bin/icpc + f77: /usr/tce/packages/intel/intel-18.0.0/bin/ifort + fc: /usr/tce/packages/intel/intel-18.0.0/bin/ifort + flags: + cflags: -gcc-name=/usr/tce/packages/gcc/gcc-7.3.0/bin/gcc + cxxflags: -gcc-name=/usr/tce/packages/gcc/gcc-7.3.0/bin/g++ + fflags: -gcc-name=/usr/tce/packages/gcc/gcc-7.3.0/bin/gcc + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: intel@18.0.2 + paths: + cc: /usr/tce/packages/intel/intel-18.0.2/bin/icc + cxx: /usr/tce/packages/intel/intel-18.0.2/bin/icpc + f77: /usr/tce/packages/intel/intel-18.0.2/bin/ifort + fc: /usr/tce/packages/intel/intel-18.0.2/bin/ifort + flags: + cflags: -gcc-name=/usr/tce/packages/gcc/gcc-7.3.0/bin/gcc + cxxflags: -gcc-name=/usr/tce/packages/gcc/gcc-7.3.0/bin/g++ + fflags: -gcc-name=/usr/tce/packages/gcc/gcc-7.3.0/bin/gcc + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: intel@19.0.4 + paths: + cc: /usr/tce/packages/intel/intel-19.0.4/bin/icc + cxx: /usr/tce/packages/intel/intel-19.0.4/bin/icpc + f77: /usr/tce/packages/intel/intel-19.0.4/bin/ifort + fc: /usr/tce/packages/intel/intel-19.0.4/bin/ifort + flags: + cflags: -gcc-name=/usr/tce/packages/gcc/gcc-7.1.0/bin/gcc + cxxflags: -gcc-name=/usr/tce/packages/gcc/gcc-7.1.0/bin/g++ + fflags: -gcc-name=/usr/tce/packages/gcc/gcc-7.1.0/bin/g++ + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: intel@19.1.0 + paths: + cc: /usr/tce/packages/intel/intel-19.1.0/bin/icc + cxx: /usr/tce/packages/intel/intel-19.1.0/bin/icpc + f77: /usr/tce/packages/intel/intel-19.1.0/bin/ifort + fc: /usr/tce/packages/intel/intel-19.1.0/bin/ifort + flags: + cflags: -gcc-name=/usr/tce/packages/gcc/gcc-8.1.0/bin/gcc + cxxflags: -gcc-name=/usr/tce/packages/gcc/gcc-8.1.0/bin/g++ + fflags: -gcc-name=/usr/tce/packages/gcc/gcc-8.1.0/bin/gcc + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: pgi@17.10 + paths: + cc: /usr/tce/packages/pgi/pgi-17.10/bin/pgcc + cxx: /usr/tce/packages/pgi/pgi-17.10/bin/pgc++ + f77: /usr/tce/packages/pgi/pgi-17.10/bin/pgf77 + fc: /usr/tce/packages/pgi/pgi-17.10/bin/pgf95 + flags: {} + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: pgi@18.5 + paths: + cc: /usr/tce/packages/pgi/pgi-18.5/bin/pgcc + cxx: /usr/tce/packages/pgi/pgi-18.5/bin/pgc++ + f77: /usr/tce/packages/pgi/pgi-18.5/bin/pgf77 + fc: /usr/tce/packages/pgi/pgi-18.5/bin/pgf95 + flags: {} + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: pgi@19.4 + paths: + cc: /usr/tce/packages/pgi/pgi-19.4/bin/pgcc + cxx: /usr/tce/packages/pgi/pgi-19.4/bin/pgc++ + f77: /usr/tce/packages/pgi/pgi-19.4/bin/pgfortran + fc: /usr/tce/packages/pgi/pgi-19.4/bin/pgfortran + flags: {} + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: pgi@19.7 + paths: + cc: /usr/tce/packages/pgi/pgi-19.7/bin/pgcc + cxx: 
/usr/tce/packages/pgi/pgi-19.7/bin/pgc++ + f77: /usr/tce/packages/pgi/pgi-19.7/bin/pgfortran + fc: /usr/tce/packages/pgi/pgi-19.7/bin/pgf95 + flags: {} + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: pgi@20.1 + paths: + cc: /usr/tce/packages/pgi/pgi-20.1/bin/pgcc + cxx: /usr/tce/packages/pgi/pgi-20.1/bin/pgc++ + f77: /usr/tce/packages/pgi/pgi-20.1/bin/pgfortran + fc: /usr/tce/packages/pgi/pgi-20.1/bin/pgf95 + flags: {} + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: gcc@6.1.0 + paths: + cc: /usr/tce/packages/gcc/gcc-6.1.0/bin/gcc + cxx: /usr/tce/packages/gcc/gcc-6.1.0/bin/g++ + f77: /usr/tce/packages/gcc/gcc-6.1.0/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-6.1.0/bin/gfortran + flags: {} + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: gcc@7.1.0 + paths: + cc: /usr/tce/packages/gcc/gcc-7.1.0/bin/gcc + cxx: /usr/tce/packages/gcc/gcc-7.1.0/bin/g++ + f77: /usr/tce/packages/gcc/gcc-7.1.0/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-7.1.0/bin/gfortran + flags: {} + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: gcc@7.3.0 + paths: + cc: /usr/tce/packages/gcc/gcc-7.3.0/bin/gcc + cxx: /usr/tce/packages/gcc/gcc-7.3.0/bin/g++ + f77: /usr/tce/packages/gcc/gcc-7.3.0/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-7.3.0/bin/gfortran + flags: {} + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: gcc@8.1.0 + paths: + cc: /usr/tce/packages/gcc/gcc-8.1.0/bin/gcc + cxx: /usr/tce/packages/gcc/gcc-8.1.0/bin/g++ + f77: /usr/tce/packages/gcc/gcc-8.1.0/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-8.1.0/bin/gfortran + flags: {} + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] diff --git a/scripts/radiuss-spack-configs/toss_3_x86_64_ib/packages.yaml b/scripts/radiuss-spack-configs/toss_3_x86_64_ib/packages.yaml new file mode 100644 index 000000000..96462e950 --- /dev/null +++ b/scripts/radiuss-spack-configs/toss_3_x86_64_ib/packages.yaml @@ -0,0 +1,90 @@ +packages: + all: + # This defaults us to machine specific flags of ivybridge which allows + # us to run on broadwell as well + target: [ivybridge] + compiler: [gcc, intel, pgi, clang] + cmake: + version: [3.14.5] + buildable: false + + externals: + - spec: cmake + prefix: /usr/tce/packages/cmake/cmake-3.14.5 + cuda: + version: [10.1.168] + buildable: false + + externals: + - spec: cuda@10.1.168 + prefix: /usr/tce/packages/cuda/cuda-10.1.168 + hip: + version: [4.0.0, 4.1.0] + buildable: false + externals: + - spec: hip@4.0.0 + prefix: /opt/rocm-4.0.0/hip + - spec: hip@4.1.0 + prefix: /opt/rocm-4.1.0/hip + llvm-amdgpu: + version: [4.0.0, 4.1.0] + buildable: false + externals: + - spec: llvm-amdgpu@4.0.0 + prefix: /opt/rocm-4.0.0/llvm + - spec: llvm-amdgpu@4.1.0 + prefix: /opt/rocm-4.1.0/llvm + hsa-rocr-dev: + version: [4.0.0, 4.1.0] + buildable: false + externals: + - spec: hsa-rocr-dev@4.0.0 + prefix: /opt/rocm-4.0.0/ + - spec: hsa-rocr-dev@4.1.0 + prefix: /opt/rocm-4.1.0/ + rocminfo: + version: [4.0.0, 4.1.0] + buildable: false + externals: + - spec: rocminfo@4.0.0 + prefix: /opt/rocm-4.0.0/ + - spec: rocminfo@4.1.0 + prefix: /opt/rocm-4.1.0/ + rocm-device-libs: + version: [4.0.0, 4.1.0] + buildable: false + externals: + - spec: rocm-device-libs@4.0.0 + prefix: /opt/rocm-4.0.0/ + - spec: 
rocm-device-libs@4.1.0 + prefix: /opt/rocm-4.1.0/ + mvapich2: + externals: + - spec: mvapich2@2.3.1%clang@10.0.0~cuda~debug~regcache~wrapperrpath ch3_rank_bits=32 + file_systems=lustre,nfs,ufs process_managers=slurm + prefix: /usr/tce/packages/mvapich2/mvapich2-2.3-clang-10.0.0 + - spec: mvapich2@2.3.1%clang@9.0.0~cuda~debug~regcache~wrapperrpath ch3_rank_bits=32 + file_systems=lustre,nfs,ufs process_managers=slurm + prefix: /usr/tce/packages/mvapich2/mvapich2-2.3-clang-9.0.0 + - spec: mvapich2@2.3.1%pgi@19.7~cuda~debug~regcache~wrapperrpath ch3_rank_bits=32 + file_systems=lustre,nfs,ufs process_managers=slurm + prefix: /usr/tce/packages/mvapich2/mvapich2-2.3-pgi-19.7 + - spec: mvapich2@2.3.1%pgi@20.1~cuda~debug~regcache~wrapperrpath ch3_rank_bits=32 + file_systems=lustre,nfs,ufs process_managers=slurm + prefix: /usr/tce/packages/mvapich2/mvapich2-2.3-pgi-20.1 + - spec: mvapich2@2.3.1%intel@19.1.0.166~cuda~debug~regcache~wrapperrpath ch3_rank_bits=32 + file_systems=lustre,nfs,ufs process_managers=slurm + prefix: /usr/tce/packages/mvapich2/mvapich2-2.3-intel-19.1.0 + - spec: mvapich2@2.3.1%intel@18.0.2~cuda~debug~regcache~wrapperrpath ch3_rank_bits=32 + file_systems=lustre,nfs,ufs process_managers=slurm + prefix: /usr/tce/packages/mvapich2/mvapich2-2.3-intel-18.0.2 + - spec: mvapich2@2.3.1%intel@17.0.2~cuda~debug~regcache~wrapperrpath ch3_rank_bits=32 + file_systems=lustre,nfs,ufs process_managers=slurm + prefix: /usr/tce/packages/mvapich2/mvapich2-2.3-intel-17.0.2 + - spec: mvapich2@2.3.1%gcc@8.1.0~cuda~debug~regcache~wrapperrpath ch3_rank_bits=32 + file_systems=lustre,nfs,ufs process_managers=slurm + prefix: /usr/tce/packages/mvapich2/mvapich2-2.3-gcc-8.1.0 + - spec: mvapich2@2.3.1%gcc@4.9.3~cuda~debug~regcache~wrapperrpath ch3_rank_bits=32 + file_systems=lustre,nfs,ufs process_managers=slurm + prefix: /usr/tce/packages/mvapich2/mvapich2-2.3-gcc-4.9.3 + buildable: false diff --git a/scripts/uberenv/LICENSE b/scripts/uberenv/LICENSE new file mode 100644 index 000000000..fcd00312e --- /dev/null +++ b/scripts/uberenv/LICENSE @@ -0,0 +1,64 @@ +Copyright (c) 2014-2018, Lawrence Livermore National Security, LLC. + +Produced at the Lawrence Livermore National Laboratory + +LLNL-CODE-666778 + +All rights reserved. + +This file is part of Conduit. + +For details, see: http://software.llnl.gov/conduit/. + +Please also read conduit/LICENSE + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, + this list of conditions and the disclaimer below. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the disclaimer (as noted below) in the + documentation and/or other materials provided with the distribution. + +* Neither the name of the LLNS/LLNL nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, +LLC, THE U.S. 
DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +Additional BSD Notice + + 1. This notice is required to be provided under our contract with the U.S. + Department of Energy (DOE). This work was produced at Lawrence + Livermore National Laboratory under Contract No. DE-AC52-07NA27344 with + the DOE. + + 2. Neither the United States Government nor Lawrence Livermore National + Security, LLC nor any of their employees, makes any warranty, express + or implied, or assumes any liability or responsibility for the + accuracy, completeness, or usefulness of any information, apparatus, + product, or process disclosed, or represents that its use would not + infringe privately-owned rights. + + 3. Also, reference herein to any specific commercial products, process, + or services by trade name, trademark, manufacturer or otherwise does + not necessarily constitute or imply its endorsement, recommendation, + or favoring by the United States Government or Lawrence Livermore + National Security, LLC. The views and opinions of authors expressed + herein do not necessarily state or reflect those of the United + States Government or Lawrence Livermore National Security, LLC, and + shall not be used for advertising or product endorsement purposes. + diff --git a/scripts/uberenv/Makefile b/scripts/uberenv/Makefile new file mode 100644 index 000000000..2760762d1 --- /dev/null +++ b/scripts/uberenv/Makefile @@ -0,0 +1,6 @@ + +default: + sphinx-build -E -a -b html docs/sphinx/ _docs_html + +clean: + rm -rf _docs_html \ No newline at end of file diff --git a/scripts/uberenv/README.md b/scripts/uberenv/README.md new file mode 100644 index 000000000..82d682017 --- /dev/null +++ b/scripts/uberenv/README.md @@ -0,0 +1,19 @@ +# uberenv +Automates using Spack (https://www.spack.io/) to build and deploy software. + +Uberenv is a short python script that helps automate using Spack to build +third-party dependencies for development and to deploy Spack packages. + +Uberenv was released as part of the Conduit (https://github.com/LLNL/conduit/). It is included in-source in several projects, this repo is used to hold the latest reference version. + +For more details, see Uberenv's documention: + +https://uberenv.readthedocs.io + +You can also find details about how it is used in Conduit's documentation: + +https://llnl-conduit.readthedocs.io/en/latest/building.html#building-conduit-and-third-party-dependencies + +Conduit's source repo also serves as an example for uberenv and spack configuration files, etc: + +https://github.com/LLNL/conduit/tree/master/scripts/uberenv diff --git a/scripts/uberenv/docs/sphinx/conf.py b/scripts/uberenv/docs/sphinx/conf.py new file mode 100644 index 000000000..a8475c7b8 --- /dev/null +++ b/scripts/uberenv/docs/sphinx/conf.py @@ -0,0 +1,324 @@ +# -*- coding: utf-8 -*- +# +############################################################################### +# Copyright (c) 2015-2019, Lawrence Livermore National Security, LLC. +# +# Produced at the Lawrence Livermore National Laboratory +# +# LLNL-CODE-666778 +# +# All rights reserved. 
+# +# This file is part of Conduit. +# +# For details, see: http://software.llnl.gov/conduit/. +# +# Please also read conduit/LICENSE +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the disclaimer below. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the disclaimer (as noted below) in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the name of the LLNS/LLNL nor the names of its contributors may +# be used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, +# LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +############################################################################### +# +# Uberenv documentation build configuration file, created by +# sphinx-quickstart on Thu Oct 16 11:23:46 2014. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys +import os + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +#sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.doctest', + 'sphinx.ext.todo', + 'sphinx.ext.coverage', + 'sphinx.ext.mathjax' +] + +# try to add the breathe extension +try: + import breathe + extensions.append('breathe') +except: + pass + +# Add any paths that contain templates here, relative to this directory. +# templates_path = ['@CMAKE_CURRENT_SOURCE_DIR@/_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. 
+project = u'Uberenv' +copyright = u'Copyright (c) 2015-2019, LLNS' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = 'current' +# The full version, including alpha/beta/rc tags. +release = 'current' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +#language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['_build'] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +#keep_warnings = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'sphinx_rtd_theme' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +html_theme_options = { 'logo_only' : True } + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +# html_static_path = ['@CMAKE_CURRENT_SOURCE_DIR@/_static'] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. +#html_extra_path = [] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. 
+#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Output file base name for HTML help builder. +htmlhelp_basename = 'Uberenvdoc' + + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). +#'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +#'preamble': '', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + ('index', 'Uberenv.tex', u'Uberenv Documentation', + u'LLNS', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + ('index', 'uberenv', u'Uberenv Documentation', + [u'LLNS'], 1) +] + +# If true, show URL addresses after external links. +#man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ('index', 'Uberenv', u'Uberenv Documentation', + u'LLNS', 'Uberenv', 'Automates using spack to build and deploy software.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +#texinfo_appendices = [] + +# If false, no module index is generated. +#texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +#texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. 
+#texinfo_no_detailmenu = False + + +# try to use the read the docs theme +try: + import sphinx_rtd_theme + html_theme = "sphinx_rtd_theme" + html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] +except: + pass diff --git a/scripts/uberenv/docs/sphinx/index.rst b/scripts/uberenv/docs/sphinx/index.rst new file mode 100644 index 000000000..457ec596d --- /dev/null +++ b/scripts/uberenv/docs/sphinx/index.rst @@ -0,0 +1,194 @@ +.. ############################################################################ +.. # Copyright (c) 2014-2018, Lawrence Livermore National Security, LLC. +.. # +.. # Produced at the Lawrence Livermore National Laboratory +.. # +.. # LLNL-CODE-666778 +.. # +.. # All rights reserved. +.. # +.. # This file is part of Conduit. +.. # +.. # For details, see: http://software.llnl.gov/conduit/. +.. # +.. # Please also read conduit/LICENSE +.. # +.. # Redistribution and use in source and binary forms, with or without +.. # modification, are permitted provided that the following conditions are met: +.. # +.. # * Redistributions of source code must retain the above copyright notice, +.. # this list of conditions and the disclaimer below. +.. # +.. # * Redistributions in binary form must reproduce the above copyright notice, +.. # this list of conditions and the disclaimer (as noted below) in the +.. # documentation and/or other materials provided with the distribution. +.. # +.. # * Neither the name of the LLNS/LLNL nor the names of its contributors may +.. # be used to endorse or promote products derived from this software without +.. # specific prior written permission. +.. # +.. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +.. # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.. # ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, +.. # LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY +.. # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.. # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.. # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.. # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +.. # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +.. # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +.. # POSSIBILITY OF SUCH DAMAGE. +.. # +.. ############################################################################ + +.. _building_with_uberenv: + +Uberenv +~~~~~~~~~~~~~~~ + +**Uberenv** automates using `Spack `_ to build and deploy software. + +Many projects leverage `Spack `_ to help build the software dependencies needed to develop and deploy their projects on HPC systems. Uberenv is a python script that helps automate using Spack to build +third-party dependencies for development and to deploy Spack packages. + +Uberenv was released as part of Conduit (https://github.com/LLNL/conduit/). It is included in-source in several projects. The +https://github.com/llnl/uberenv/ repo is used to hold the latest reference version of Uberenv. + + +uberenv.py +~~~~~~~~~~~~~~~~~~~~~ + +``uberenv.py`` is a single file python script that automates fetching Spack, building and installing third party dependencies, and can optionally install packages as well. 
To automate the full install process, ``uberenv.py`` uses a target Spack package along with extra settings such as Spack compiler and external third party package details for common HPC platforms. + +``uberenv.py`` is included directly in a project's source code repo in the folder: ``scripts/uberenv/`` +This folder is also used to store extra Spack and Uberenv configuration files unique to the target project. ``uberenv.py`` uses a ``project.json`` file to specify project details, including the target Spack package name and which Spack repo is used. Conduit's source repo serves as an example for Uberenv and Spack configuration files, etc: + +https://github.com/LLNL/conduit/tree/master/scripts/uberenv + + +``uberenv.py`` is developed by LLNL in support of the `Ascent `_, Axom, and `Conduit `_ projects. + + +Command Line Options +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Build configuration +------------------- + +``uberenv.py`` has a few options that allow you to control how dependencies are built: + + ======================= ============================================== ================================================ + Option Description Default + ======================= ============================================== ================================================ + ``--prefix`` Destination directory ``uberenv_libs`` + ``--spec`` Spack spec linux: **%gcc** + osx: **%clang** + ``--spack-config-dir`` Folder with Spack settings files linux: (empty) + osx: ``scripts/uberenv/spack_configs/darwin/`` + ``-k`` Ignore SSL Errors **False** + ``--install`` Fully install target, not just dependencies **False** + ``--run_tests`` Invoke tests during build and against install **False** + ``--project-json`` File for project specific settings ``project.json`` + ======================= ============================================== ================================================ + +The ``-k`` option exists for sites where SSL certificate interception undermines fetching +from github and https hosted source tarballs. When enabled, ``uberenv.py`` clones Spack using: + +.. code:: bash + + git -c http.sslVerify=false clone https://github.com/llnl/spack.git + +And passes ``-k`` to any Spack commands that may fetch via https. + + +Default invocation on Linux: + +.. code:: bash + + python scripts/uberenv/uberenv.py --prefix uberenv_libs \ + --spec %gcc + +Default invocation on OSX: + +.. code:: bash + + python scripts/uberenv/uberenv.py --prefix uberenv_libs \ + --spec %clang \ + --spack-config-dir scripts/uberenv/spack_configs/darwin/ + + +Use the ``--install`` option to install the target package (not just its development dependencies): + +.. code:: bash + + python scripts/uberenv/uberenv.py --install + + +If the target Spack package supports Spack's testing hooks, you can run tests during the build process to validate the build and install, using the ``--run_tests`` option: + +.. code:: bash + + python scripts/uberenv/uberenv.py --install \ + --run_tests + +For details on Spack's spec syntax, see the `Spack Specs & dependencies `_ documentation. + + +Uberenv looks for configuration yaml files under ``scripts/uberenv/spack_config/{platform}`` or you can use the **--spack-config-dir** option to specify a directory with compiler and packages yaml files to use with Spack. See the `Spack Compiler Configuration `_ +and `Spack System Packages +`_ +documentation for details. + +.. 
note::
+   The bootstrapping process ignores ``~/.spack/compilers.yaml`` to avoid conflicts
+   and surprises from a user's specific Spack settings on HPC platforms.
+
+When run, ``uberenv.py`` checks out a specific version of Spack from github as ``spack`` in the
+destination directory. It then uses Spack to build and install the target package's dependencies into
+``spack/opt/spack/``. Finally, the target package generates a host-config file ``{hostname}.cmake``, which is
+copied to the destination directory. This file specifies the compiler settings and paths to all of the dependencies.
+
+
+Project configuration
+---------------------
+
+Part of the configuration can also be specified using a json file. By default, it is named ``project.json`` and some settings can be overridden on the command line:
+
+ ==================== ========================== ================================================ =======================================
+ Setting              Option                     Description                                      Default
+ ==================== ========================== ================================================ =======================================
+ package_name         ``--package-name``         Spack package name                               **None**
+ package_version      **None**                   Spack package version                            **None**
+ package_final_phase  ``--package-final-phase``  Controls after which phase Spack should stop     **None**
+ package_source_dir   ``--package-source-dir``   Controls the source directory Spack should use   **None**
+ spack_url            **None**                   URL from which to download Spack                 ``https://github.com/spack/spack.git``
+ spack_commit         **None**                   Spack commit to check out                        **None**
+ spack_activate       **None**                   Spack packages to activate                       **None**
+ ==================== ========================== ================================================ =======================================
+
+
+Optimization
+------------
+
+``uberenv.py`` also features options to optimize the installation:
+
+ ==================== ============================================== ================================================
+ Option               Description                                    Default
+ ==================== ============================================== ================================================
+ ``--mirror``         Location of a Spack mirror                     **None**
+ ``--create-mirror``  Creates a Spack mirror at specified location   **None**
+ ``--upstream``       Location of a Spack upstream                   **None**
+ ==================== ============================================== ================================================
+
+
+Project Settings
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+A few notes on using ``uberenv.py`` in a new project:
+
+* For an example of how to craft a ``project.json`` file for a target project, see: `Conduit's project.json file `_
+
+* ``uberenv.py`` hot-copies ``packages`` into the cloned Spack install; this allows you to easily version control any Spack package overrides necessary
+
+
diff --git a/scripts/uberenv/gen_spack_env_script.py b/scripts/uberenv/gen_spack_env_script.py
new file mode 100644
index 000000000..a1e6ba5d0
--- /dev/null
+++ b/scripts/uberenv/gen_spack_env_script.py
@@ -0,0 +1,128 @@
+###############################################################################
+# Copyright (c) 2015-2019, Lawrence Livermore National Security, LLC.
+#
+# Produced at the Lawrence Livermore National Laboratory
+#
+# LLNL-CODE-716457
+#
+# All rights reserved.
+#
+# This file is part of Ascent.
+#
+# For details, see: http://ascent.readthedocs.io/.
+# +# Please also read ascent/LICENSE +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the disclaimer below. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the disclaimer (as noted below) in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the name of the LLNS/LLNL nor the names of its contributors may +# be used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, +# LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +############################################################################### +import os +import sys +import subprocess + +from os.path import join as pjoin + +# if you have bad luck with spack load, this +# script is for you! +# +# Looks for subdir: spack or uberenv_libs/spack +# queries spack for given package names and +# creates a bash script that adds those to your path +# +# +# usage: +# python gen_spack_env_script.py [spack_pkg_1 spack_pkg_2 ...] +# + +def sexe(cmd,ret_output=False,echo = True): + """ Helper for executing shell commands. 
""" + if echo: + print("[exe: {}]".format(cmd)) + if ret_output: + p = subprocess.Popen(cmd, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + res = p.communicate()[0] + res = res.decode('utf8') + return p.returncode,res + else: + return subprocess.call(cmd,shell=True) + + +def spack_exe(spath=None): + if spath is None: + to_try = [pjoin("uberenv_libs","spack"), "spack"] + for p in to_try: + abs_p = os.path.abspath(p) + print("[looking for spack directory at: {}]".format(abs_p)) + if os.path.isdir(abs_p): + print("[FOUND spack directory at: {}]".format(abs_p)) + return os.path.abspath(pjoin(abs_p,"bin","spack")) + print("[ERROR: failed to find spack directory!]") + sys.exit(-1) + else: + spack_exe = os.path.abspath(spath,"bin","spack") + if not os.path.isfile(spack_exec): + print("[ERROR: failed to find spack directory at spath={}]").format(spath) + sys.exit(-1) + return spack_exe + +def find_pkg(pkg_name): + r,rout = sexe(spack_exe() + " find -p " + pkg_name,ret_output = True) + print(rout) + for l in rout.split("\n"): + print(l) + lstrip = l.strip() + if not lstrip == "" and \ + not lstrip.startswith("==>") and \ + not lstrip.startswith("--"): + return {"name": pkg_name, "path": l.split()[-1]} + print("[ERROR: failed to find package named '{}']".format(pkg_name)) + sys.exit(-1) + +def path_cmd(pkg): + return('export PATH={}:$PATH\n'.format((pjoin(pkg["path"],"bin")))) + +def write_env_script(pkgs): + ofile = open("s_env.sh","w") + for p in pkgs: + print("[found {} at {}]".format(p["name"],p["path"])) + ofile.write("# {}\n".format(p["name"])) + ofile.write(path_cmd(p)) + print("[created {}]".format(os.path.abspath("s_env.sh"))) + +def main(): + pkgs = [find_pkg(pkg) for pkg in sys.argv[1:]] + if len(pkgs) > 0: + write_env_script(pkgs) + else: + print("usage: python gen_spack_env_script.py spack_pkg_1 spack_pkg_2 ...") + +if __name__ == "__main__": + main() diff --git a/scripts/uberenv/packages/chai/package.py b/scripts/uberenv/packages/chai/package.py new file mode 100644 index 000000000..b20b1ac9f --- /dev/null +++ b/scripts/uberenv/packages/chai/package.py @@ -0,0 +1,243 @@ +# Copyright 2013-2020 Lawrence Livermore National Security, LLC and other +# Spack Project Developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) + + +from spack import * + +import socket +import os + +from os import environ as env +from os.path import join as pjoin + +def cmake_cache_entry(name, value, comment=""): + """Generate a string for a cmake cache variable""" + + return 'set(%s "%s" CACHE PATH "%s")\n\n' % (name,value,comment) + + +def cmake_cache_string(name, string, comment=""): + """Generate a string for a cmake cache variable""" + + return 'set(%s "%s" CACHE STRING "%s")\n\n' % (name,string,comment) + + +def cmake_cache_option(name, boolean_value, comment=""): + """Generate a string for a cmake configuration option""" + + value = "ON" if boolean_value else "OFF" + return 'set(%s %s CACHE BOOL "%s")\n\n' % (name,value,comment) + + +def get_spec_path(spec, package_name, path_replacements = {}, use_bin = False) : + """Extracts the prefix path for the given spack package + path_replacements is a dictionary with string replacements for the path. 
+ """ + + if not use_bin: + path = spec[package_name].prefix + else: + path = spec[package_name].prefix.bin + + path = os.path.realpath(path) + + for key in path_replacements: + path = path.replace(key, path_replacements[key]) + + return path + + +class Chai(CMakePackage, CudaPackage): + """ + Copy-hiding array interface for data migration between memory spaces + """ + + homepage = "https://github.com/LLNL/CHAI" + git = "https://github.com/LLNL/CHAI.git" + + version('develop', branch='develop', submodules='True') + version('main', branch='main', submodules='True') + version('2.1.1', tag='v2.1.1', submodules='True') + version('2.1.0', tag='v2.1.0', submodules='True') + version('2.0.0', tag='v2.0.0', submodules='True') + version('1.2.0', tag='v1.2.0', submodules='True') + version('1.1.0', tag='v1.1.0', submodules='True') + version('1.0', tag='v1.0', submodules='True') + + variant('shared', default=True, description='Build Shared Libs') + + depends_on('cmake@3.8:', type='build') + depends_on('umpire') + + depends_on('cmake@3.9:', type='build', when="+cuda") + depends_on('umpire+cuda', when="+cuda") + + phases = ['hostconfig', 'cmake', 'build','install'] + + def _get_sys_type(self, spec): + sys_type = str(spec.architecture) + # if on llnl systems, we can use the SYS_TYPE + if "SYS_TYPE" in env: + sys_type = env["SYS_TYPE"] + return sys_type + + def _get_host_config_path(self, spec): + var='' + if '+cuda' in spec: + var= '-'.join([var,'cuda']) + + host_config_path = "hc-%s-%s-%s%s-%s.cmake" % (socket.gethostname().rstrip('1234567890'), + self._get_sys_type(spec), + spec.compiler, + var, + spec.dag_hash()) + dest_dir = self.stage.source_path + host_config_path = os.path.abspath(pjoin(dest_dir, host_config_path)) + return host_config_path + + def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): + """ + This method creates a 'host-config' file that specifies + all of the options used to configure and build CHAI. + + For more details about 'host-config' files see: + http://software.llnl.gov/conduit/building.html + + Note: + The `py_site_pkgs_dir` arg exists to allow a package that + subclasses this package provide a specific site packages + dir when calling this function. `py_site_pkgs_dir` should + be an absolute path or `None`. + + This is necessary because the spack `site_packages_dir` + var will not exist in the base class. For more details + on this issue see: https://github.com/spack/spack/issues/6261 + """ + + ####################### + # Compiler Info + ####################### + c_compiler = env["SPACK_CC"] + cpp_compiler = env["SPACK_CXX"] + + # Even though we don't have fortran code in our project we sometimes + # use the Fortran compiler to determine which libstdc++ to use + f_compiler = "" + if "SPACK_FC" in env.keys(): + # even if this is set, it may not exist + # do one more sanity check + if os.path.isfile(env["SPACK_FC"]): + f_compiler = env["SPACK_FC"] + + ####################################################################### + # By directly fetching the names of the actual compilers we appear + # to doing something evil here, but this is necessary to create a + # 'host config' file that works outside of the spack install env. 
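+        # In practice this means the generated host-config records the real
+        # compiler paths exported by Spack (SPACK_CC / SPACK_CXX above) rather
+        # than Spack's compiler wrapper scripts.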
+ ####################################################################### + + sys_type = self._get_sys_type(spec) + + ############################################## + # Find and record what CMake is used + ############################################## + + cmake_exe = spec['cmake'].command.path + cmake_exe = os.path.realpath(cmake_exe) + + host_config_path = self._get_host_config_path(spec) + cfg = open(host_config_path, "w") + cfg.write("###################\n".format("#" * 60)) + cfg.write("# Generated host-config - Edit at own risk!\n") + cfg.write("###################\n".format("#" * 60)) + cfg.write("# Copyright (c) 2020, Lawrence Livermore National Security, LLC and\n") + cfg.write("# other CHAI Project Developers. See the top-level LICENSE file for\n") + cfg.write("# details.\n") + cfg.write("#\n") + cfg.write("# SPDX-License-Identifier: (BSD-3-Clause) \n") + cfg.write("###################\n\n".format("#" * 60)) + + cfg.write("#------------------\n".format("-" * 60)) + cfg.write("# SYS_TYPE: {0}\n".format(sys_type)) + cfg.write("# Compiler Spec: {0}\n".format(spec.compiler)) + cfg.write("# CMake executable path: %s\n" % cmake_exe) + cfg.write("#------------------\n\n".format("-" * 60)) + + ####################### + # Compiler Settings + ####################### + + cfg.write("#------------------\n".format("-" * 60)) + cfg.write("# Compilers\n") + cfg.write("#------------------\n\n".format("-" * 60)) + cfg.write(cmake_cache_entry("CMAKE_C_COMPILER", c_compiler)) + cfg.write(cmake_cache_entry("CMAKE_CXX_COMPILER", cpp_compiler)) + + # use global spack compiler flags + cflags = ' '.join(spec.compiler_flags['cflags']) + if cflags: + cfg.write(cmake_cache_entry("CMAKE_C_FLAGS", cflags)) + + cxxflags = ' '.join(spec.compiler_flags['cxxflags']) + if cxxflags: + cfg.write(cmake_cache_entry("CMAKE_CXX_FLAGS", cxxflags)) + + if ("gfortran" in f_compiler) and ("clang" in cpp_compiler): + libdir = pjoin(os.path.dirname( + os.path.dirname(f_compiler)), "lib") + flags = "" + for _libpath in [libdir, libdir + "64"]: + if os.path.exists(_libpath): + flags += " -Wl,-rpath,{0}".format(_libpath) + description = ("Adds a missing libstdc++ rpath") + if flags: + cfg.write(cmake_cache_entry("BLT_EXE_LINKER_FLAGS", flags, + description)) + + if "+cuda" in spec: + cfg.write("#------------------{0}\n".format("-" * 60)) + cfg.write("# Cuda\n") + cfg.write("#------------------{0}\n\n".format("-" * 60)) + + cfg.write(cmake_cache_option("ENABLE_CUDA", True)) + + cudatoolkitdir = spec['cuda'].prefix + cfg.write(cmake_cache_entry("CUDA_TOOLKIT_ROOT_DIR", + cudatoolkitdir)) + cudacompiler = "${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc" + cfg.write(cmake_cache_entry("CMAKE_CUDA_COMPILER", + cudacompiler)) + + if not spec.satisfies('cuda_arch=none'): + cuda_arch = spec.variants['cuda_arch'].value + cuda_arch = "sm_{0}".format(cuda_arch[0]) + flag = '-arch {0}'.format(cuda_arch) + cfg.write(cmake_cache_string("CUDA_ARCH",cuda_arch)) + cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS", flag)) + + else: + cfg.write(cmake_cache_option("ENABLE_CUDA", False)) + + # shared vs static libs + cfg.write(cmake_cache_option("BUILD_SHARED_LIBS","+shared" in spec)) + + umpire_conf_path = spec['umpire'].prefix + "/share/umpire/cmake" + cfg.write(cmake_cache_entry("umpire_DIR",umpire_conf_path)) + + ####################### + # Close and save + ####################### + cfg.write("\n") + cfg.close() + + print("OUT: host-config file {0}".format(host_config_path)) + + def cmake_args(self): + spec = self.spec + host_config_path = 
self._get_host_config_path(spec) + + options = [] + options.extend(['-C', host_config_path]) + + return options diff --git a/scripts/uberenv/packages/hip/package.py b/scripts/uberenv/packages/hip/package.py new file mode 100644 index 000000000..e63317ec5 --- /dev/null +++ b/scripts/uberenv/packages/hip/package.py @@ -0,0 +1,54 @@ +from spack import * + + +class Hip(CMakePackage): + """HIP is a C++ Runtime API and Kernel Language that allows developers to + create portable applications for AMD and NVIDIA GPUs from + single source code.""" + + homepage = "https://github.com/ROCm-Developer-Tools/HIP" + url = "https://github.com/ROCm-Developer-Tools/HIP/archive/rocm-3.10.0.tar.gz" + + maintainers = ['srekolam', 'arjun-raj-kuppala'] + + version('3.10.0', sha256='0082c402f890391023acdfd546760f41cb276dffc0ffeddc325999fd2331d4e8') + version('3.9.0', sha256='25ad58691456de7fd9e985629d0ed775ba36a2a0e0b21c086bd96ba2fb0f7ed1') + + depends_on('cmake@3:', type='build') + depends_on('perl@5.10:', type=('build', 'run')) + depends_on('mesa~llvm@18.3:') + + for ver in ['3.9.0', '3.10.0']: + depends_on('rocclr@' + ver, type='build', when='@' + ver) + depends_on('hsakmt-roct@' + ver, type='build', when='@' + ver) + depends_on('hsa-rocr-dev@' + ver, type='link', when='@' + ver) + depends_on('comgr@' + ver, type='build', when='@' + ver) + depends_on('llvm-amdgpu@' + ver, type='build', when='@' + ver) + depends_on('rocm-device-libs@' + ver, type='build', when='@' + ver) + depends_on('rocminfo@' + ver, type='build', when='@' + ver) + + def setup_dependent_package(self, module, dependent_spec): + self.spec.hipcc = join_path(self.prefix.bin, 'hipcc') + + @run_before('install') + def filter_sbang(self): + perl = self.spec['perl'].command + kwargs = {'ignore_absent': False, 'backup': False, 'string': False} + + with working_dir('bin'): + match = '^#!/usr/bin/perl' + substitute = "#!{perl}".format(perl=perl) + files = [ + 'hipify-perl', 'hipcc', 'extractkernel', + 'hipconfig', 'hipify-cmakefile' + ] + filter_file(match, substitute, *files, **kwargs) + + def cmake_args(self): + args = [ + '-DHIP_COMPILER=clang', + '-DHIP_PLATFORM=rocclr', + '-DHSA_PATH={0}'.format(self.spec['hsa-rocr-dev'].prefix), + '-DLIBROCclr_STATIC_DIR={0}/lib'.format(self.spec['rocclr'].prefix) + ] + return args diff --git a/scripts/uberenv/packages/raja_perfsuite/package.py b/scripts/uberenv/packages/raja_perfsuite/package.py new file mode 100644 index 000000000..c4564aad7 --- /dev/null +++ b/scripts/uberenv/packages/raja_perfsuite/package.py @@ -0,0 +1,338 @@ +# Copyright 2013-2020 Lawrence Livermore National Security, LLC and other +# Spack Project Developers. See the top-level COPYRIGHT file for details. 
+# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) + + +from spack import * + +import socket +import os + +from os import environ as env +from os.path import join as pjoin + +import re + +def cmake_cache_entry(name, value, comment=""): + """Generate a string for a cmake cache variable""" + + return 'set(%s "%s" CACHE PATH "%s")\n\n' % (name,value,comment) + + +def cmake_cache_string(name, string, comment=""): + """Generate a string for a cmake cache variable""" + + return 'set(%s "%s" CACHE STRING "%s")\n\n' % (name,string,comment) + + +def cmake_cache_option(name, boolean_value, comment=""): + """Generate a string for a cmake configuration option""" + + value = "ON" if boolean_value else "OFF" + return 'set(%s %s CACHE BOOL "%s")\n\n' % (name,value,comment) + + +def get_spec_path(spec, package_name, path_replacements = {}, use_bin = False) : + """Extracts the prefix path for the given spack package + path_replacements is a dictionary with string replacements for the path. + """ + + if not use_bin: + path = spec[package_name].prefix + else: + path = spec[package_name].prefix.bin + + path = os.path.realpath(path) + + for key in path_replacements: + path = path.replace(key, path_replacements[key]) + + return path + + +class RajaPerfSuite(CMakePackage, CudaPackage): + """RAJA PerfSuite.""" + + homepage = "https://github.com/LLNL/RAJAPerf/" + git = "https://github.com/LLNL/RAJAPerf/" + + version('develop', branch='develop', submodules='True') + version('main', branch='main', submodules='True') + + variant('openmp', default=True, description='Build OpenMP backend') + variant('shared', default=False, description='Build Shared Libs') + variant('libcpp', default=False, description='Uses libc++ instead of libstdc++') + variant('hip', default=False, description='Build with HIP support') + variant('tests', default='basic', values=('none', 'basic', 'benchmarks'), + multi=False, description='Tests to run') + + depends_on('cmake@3.8:', type='build') + depends_on('cmake@3.9:', when='+cuda', type='build') + depends_on('hip', when='+hip') + + conflicts('+openmp', when='+hip') + + phases = ['hostconfig', 'cmake', 'build', 'install'] + + def _get_sys_type(self, spec): + sys_type = str(spec.architecture) + # if on llnl systems, we can use the SYS_TYPE + if "SYS_TYPE" in env: + sys_type = env["SYS_TYPE"] + return sys_type + + def _get_host_config_path(self, spec): + var='' + if '+cuda' in spec: + var= '-'.join([var,'cuda']) + if '+libcpp' in spec: + var='-'.join([var,'libcpp']) + + host_config_path = "hc-%s-%s-%s%s-%s.cmake" % (socket.gethostname().rstrip('1234567890'), + self._get_sys_type(spec), + spec.compiler, + var, + spec.dag_hash()) + dest_dir = self.stage.source_path + host_config_path = os.path.abspath(pjoin(dest_dir, host_config_path)) + return host_config_path + + def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): + """ + This method creates a 'host-config' file that specifies + all of the options used to configure and build Umpire. + + For more details about 'host-config' files see: + http://software.llnl.gov/conduit/building.html + + Note: + The `py_site_pkgs_dir` arg exists to allow a package that + subclasses this package provide a specific site packages + dir when calling this function. `py_site_pkgs_dir` should + be an absolute path or `None`. + + This is necessary because the spack `site_packages_dir` + var will not exist in the base class. 
For more details + on this issue see: https://github.com/spack/spack/issues/6261 + """ + + ####################### + # Compiler Info + ####################### + c_compiler = env["SPACK_CC"] + cpp_compiler = env["SPACK_CXX"] + + # Even though we don't have fortran code in our project we sometimes + # use the Fortran compiler to determine which libstdc++ to use + f_compiler = "" + if "SPACK_FC" in env.keys(): + # even if this is set, it may not exist + # do one more sanity check + if os.path.isfile(env["SPACK_FC"]): + f_compiler = env["SPACK_FC"] + + ####################################################################### + # By directly fetching the names of the actual compilers we appear + # to doing something evil here, but this is necessary to create a + # 'host config' file that works outside of the spack install env. + ####################################################################### + + sys_type = self._get_sys_type(spec) + + ############################################## + # Find and record what CMake is used + ############################################## + + cmake_exe = spec['cmake'].command.path + cmake_exe = os.path.realpath(cmake_exe) + + host_config_path = self._get_host_config_path(spec) + cfg = open(host_config_path, "w") + cfg.write("###################\n".format("#" * 60)) + cfg.write("# Generated host-config - Edit at own risk!\n") + cfg.write("###################\n".format("#" * 60)) + cfg.write("# Copyright (c) 2020, Lawrence Livermore National Security, LLC and\n") + cfg.write("# other Umpire Project Developers. See the top-level LICENSE file for\n") + cfg.write("# details.\n") + cfg.write("#\n") + cfg.write("# SPDX-License-Identifier: (BSD-3-Clause) \n") + cfg.write("###################\n\n".format("#" * 60)) + + cfg.write("#------------------\n".format("-" * 60)) + cfg.write("# SYS_TYPE: {0}\n".format(sys_type)) + cfg.write("# Compiler Spec: {0}\n".format(spec.compiler)) + cfg.write("# CMake executable path: %s\n" % cmake_exe) + cfg.write("#------------------\n\n".format("-" * 60)) + + cfg.write(cmake_cache_string("CMAKE_BUILD_TYPE", spec.variants['build_type'].value)) + + ####################### + # Compiler Settings + ####################### + + cfg.write("#------------------\n".format("-" * 60)) + cfg.write("# Compilers\n") + cfg.write("#------------------\n\n".format("-" * 60)) + cfg.write(cmake_cache_entry("CMAKE_C_COMPILER", c_compiler)) + cfg.write(cmake_cache_entry("CMAKE_CXX_COMPILER", cpp_compiler)) + + # use global spack compiler flags + cflags = ' '.join(spec.compiler_flags['cflags']) + if "+libcpp" in spec: + cflags += ' '.join([cflags,"-DGTEST_HAS_CXXABI_H_=0"]) + if cflags: + cfg.write(cmake_cache_entry("CMAKE_C_FLAGS", cflags)) + + cxxflags = ' '.join(spec.compiler_flags['cxxflags']) + if "+libcpp" in spec: + cxxflags += ' '.join([cxxflags,"-stdlib=libc++ -DGTEST_HAS_CXXABI_H_=0"]) + if cxxflags: + cfg.write(cmake_cache_entry("CMAKE_CXX_FLAGS", cxxflags)) + + # TODO (bernede1@llnl.gov): Is this useful for RAJA? 
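+        # Background: when clang compiles the C++ code but gfortran is the
+        # Fortran compiler, the gfortran install's lib/ and lib64/ directories
+        # would be added as rpaths so a matching libstdc++ is found at run
+        # time; the corresponding cfg.write() below is currently disabled.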
+ if ("gfortran" in f_compiler) and ("clang" in cpp_compiler): + libdir = pjoin(os.path.dirname( + os.path.dirname(f_compiler)), "lib") + flags = "" + for _libpath in [libdir, libdir + "64"]: + if os.path.exists(_libpath): + flags += " -Wl,-rpath,{0}".format(_libpath) + description = ("Adds a missing libstdc++ rpath") + #if flags: + # cfg.write(cmake_cache_string("BLT_EXE_LINKER_FLAGS", flags, + # description)) + + gcc_toolchain_regex = re.compile("--gcc-toolchain=(.*)") + gcc_name_regex = re.compile(".*gcc-name.*") + + using_toolchain = list(filter(gcc_toolchain_regex.match, spec.compiler_flags['cxxflags'])) + if(using_toolchain): + gcc_toolchain_path = gcc_toolchain_regex.match(using_toolchain[0]) + using_gcc_name = list(filter(gcc_name_regex.match, spec.compiler_flags['cxxflags'])) + compilers_using_toolchain = ["pgi", "xl", "icpc"] + if any(compiler in cpp_compiler for compiler in compilers_using_toolchain): + if using_toolchain or using_gcc_name: + cfg.write(cmake_cache_entry("BLT_CMAKE_IMPLICIT_LINK_DIRECTORIES_EXCLUDE", + "/usr/tce/packages/gcc/gcc-4.9.3/lib64;/usr/tce/packages/gcc/gcc-4.9.3/gnu/lib64/gcc/powerpc64le-unknown-linux-gnu/4.9.3;/usr/tce/packages/gcc/gcc-4.9.3/gnu/lib64;/usr/tce/packages/gcc/gcc-4.9.3/lib64/gcc/x86_64-unknown-linux-gnu/4.9.3")) + + compilers_using_cxx14 = ["intel-17", "intel-18", "xl"] + if any(compiler in cpp_compiler for compiler in compilers_using_cxx14): + cfg.write(cmake_cache_entry("BLT_CXX_STD", "c++14")) + + if "+cuda" in spec: + cfg.write("#------------------{0}\n".format("-" * 60)) + cfg.write("# Cuda\n") + cfg.write("#------------------{0}\n\n".format("-" * 60)) + + cfg.write(cmake_cache_option("ENABLE_CUDA", True)) + + cudatoolkitdir = spec['cuda'].prefix + cfg.write(cmake_cache_entry("CUDA_TOOLKIT_ROOT_DIR", + cudatoolkitdir)) + cudacompiler = "${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc" + cfg.write(cmake_cache_entry("CMAKE_CUDA_COMPILER", + cudacompiler)) + + if ("xl" in cpp_compiler): + cfg.write(cmake_cache_entry("CMAKE_CUDA_FLAGS", "-Xcompiler -O3 -Xcompiler -qxlcompatmacros -Xcompiler -qalias=noansi " + + "-Xcompiler -qsmp=omp -Xcompiler -qhot -Xcompiler -qnoeh -Xcompiler -qsuppress=1500-029 " + + "-Xcompiler -qsuppress=1500-036 -Xcompiler -qsuppress=1500-030")) + cuda_release_flags = "-O3" + cuda_reldebinf_flags = "-O3 -g" + cuda_debug_flags = "-O0 -g" + + cfg.write(cmake_cache_string("BLT_CXX_STD", "c++14")) + else: + cuda_release_flags = "-O3 -Xcompiler -Ofast -Xcompiler -finline-functions -Xcompiler -finline-limit=20000" + cuda_reldebinf_flags = "-O3 -g -Xcompiler -Ofast -Xcompiler -finline-functions -Xcompiler -finline-limit=20000" + cuda_debug_flags = "-O0 -g -Xcompiler -O0 -Xcompiler -finline-functions -Xcompiler -finline-limit=20000" + + cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_RELEASE", cuda_release_flags)) + cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_RELWITHDEBINFO", cuda_reldebinf_flags)) + cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_DEBUG", cuda_debug_flags)) + + if not spec.satisfies('cuda_arch=none'): + cuda_arch = spec.variants['cuda_arch'].value + cfg.write(cmake_cache_string("CUDA_ARCH", 'sm_{0}'.format(cuda_arch[0]))) + + else: + cfg.write(cmake_cache_option("ENABLE_CUDA", False)) + + if "+hip" in spec: + cfg.write("#------------------{0}\n".format("-" * 60)) + cfg.write("# HIP\n") + cfg.write("#------------------{0}\n\n".format("-" * 60)) + + cfg.write(cmake_cache_option("ENABLE_HIP", True)) + + hip_root = spec['hip'].prefix + rocm_root = hip_root + "/.." 
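+            # Note: this assumes the usual ROCm layout where the hip prefix
+            # lives inside the ROCm install, so llvm/, hsa/, and lib64/ are
+            # siblings of hip and reachable through rocm_root (see the include
+            # and link paths written below).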
+ cfg.write(cmake_cache_entry("HIP_ROOT_DIR", + hip_root)) + cfg.write(cmake_cache_entry("HIP_CLANG_PATH", + rocm_root + '/llvm/bin')) + cfg.write(cmake_cache_entry("HIP_HIPCC_FLAGS", + '--amdgpu-target=gfx906')) + cfg.write(cmake_cache_entry("HIP_RUNTIME_INCLUDE_DIRS", + "{0}/include;{0}/../hsa/include".format(hip_root))) + hip_link_flags = "-Wl,--disable-new-dtags -L{0}/lib -L{0}/../lib64 -L{0}/../lib -Wl,-rpath,{0}/lib:{0}/../lib:{0}/../lib64 -lamdhip64 -lhsakmt -lhsa-runtime64".format(hip_root) + if ('%gcc' in spec) or (using_toolchain): + if ('%gcc' in spec): + gcc_bin = os.path.dirname(self.compiler.cxx) + gcc_prefix = join_path(gcc_bin, '..') + else: + gcc_prefix = gcc_toolchain_path.group(1) + cfg.write(cmake_cache_entry("HIP_CLANG_FLAGS", + "--gcc-toolchain={0}".format(gcc_prefix))) + cfg.write(cmake_cache_entry("CMAKE_EXE_LINKER_FLAGS", + hip_link_flags + " -Wl,-rpath {}/lib64".format(gcc_prefix))) + else: + cfg.write(cmake_cache_entry("CMAKE_EXE_LINKER_FLAGS", hip_link_flags)) + + else: + cfg.write(cmake_cache_option("ENABLE_HIP", False)) + + cfg.write("#------------------{0}\n".format("-" * 60)) + cfg.write("# Other\n") + cfg.write("#------------------{0}\n\n".format("-" * 60)) + + cfg.write(cmake_cache_string("RAJA_RANGE_ALIGN", "4")) + cfg.write(cmake_cache_string("RAJA_RANGE_MIN_LENGTH", "32")) + cfg.write(cmake_cache_string("RAJA_DATA_ALIGN", "64")) + + cfg.write(cmake_cache_option("RAJA_HOST_CONFIG_LOADED", True)) + + # shared vs static libs + cfg.write(cmake_cache_option("BUILD_SHARED_LIBS","+shared" in spec)) + cfg.write(cmake_cache_option("ENABLE_OPENMP","+openmp" in spec)) + + # Note 1: Work around spack adding -march=ppc64le to SPACK_TARGET_ARGS + # which is used by the spack compiler wrapper. This can go away when + # BLT removes -Werror from GTest flags + # Note 2: Tests are either built if variant is set, or if run-tests + # option is passed. 
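+        # Net effect: ENABLE_TESTS is ON unless the spec has 'tests=none'
+        # (or we are on clang/ppc64le, where testing is disabled entirely),
+        # ENABLE_BENCHMARKS is ON only for 'tests=benchmarks', and passing
+        # --run_tests to uberenv also turns tests on via self.run_tests.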
+ if self.spec.satisfies('%clang target=ppc64le:'): + cfg.write(cmake_cache_option("ENABLE_TESTS",False)) + if 'tests=benchmarks' in spec or not 'tests=none' in spec: + print("MSG: no testing supported on %clang target=ppc64le:") + else: + cfg.write(cmake_cache_option("ENABLE_BENCHMARKS", 'tests=benchmarks' in spec)) + cfg.write(cmake_cache_option("ENABLE_TESTS", not 'tests=none' in spec or self.run_tests)) + + ####################### + # Close and save + ####################### + cfg.write("\n") + cfg.close() + + print("OUT: host-config file {0}".format(host_config_path)) + + def cmake_args(self): + spec = self.spec + host_config_path = self._get_host_config_path(spec) + + options = [] + options.extend(['-C', host_config_path]) + + return options diff --git a/scripts/uberenv/project.json b/scripts/uberenv/project.json new file mode 100644 index 000000000..1ea0482ce --- /dev/null +++ b/scripts/uberenv/project.json @@ -0,0 +1,10 @@ +{ +"package_name" : "raja", +"package_version" : "develop", +"package_final_phase" : "hostconfig", +"package_source_dir" : "../..", +"spack_url": "https://github.com/davidbeckingsale/spack", +"spack_branch": "feature/allow-untested-cuda-versions", +"spack_commit": "f96e256bee1948aa030916aae0c1b2645230fb9f", +"spack_activate" : {} +} diff --git a/scripts/uberenv/spack_configs b/scripts/uberenv/spack_configs new file mode 120000 index 000000000..17d3bf7a4 --- /dev/null +++ b/scripts/uberenv/spack_configs @@ -0,0 +1 @@ +../radiuss-spack-configs \ No newline at end of file diff --git a/scripts/uberenv/uberenv.py b/scripts/uberenv/uberenv.py new file mode 100755 index 000000000..7761be2e9 --- /dev/null +++ b/scripts/uberenv/uberenv.py @@ -0,0 +1,800 @@ +#!/bin/sh +"exec" "python" "-u" "-B" "$0" "$@" +############################################################################### +# Copyright (c) 2014-2020, Lawrence Livermore National Security, LLC. +# +# Produced at the Lawrence Livermore National Laboratory +# +# LLNL-CODE-666778 +# +# All rights reserved. +# +# This file is part of Conduit. +# +# For details, see https://lc.llnl.gov/conduit/. +# +# Please also read conduit/LICENSE +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the disclaimer below. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the disclaimer (as noted below) in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the name of the LLNS/LLNL nor the names of its contributors may +# be used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, +# LLC, THE U.S. 
DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +############################################################################### + +""" + file: uberenv.py + + description: automates using spack to install a project. + +""" + +import os +import sys +import subprocess +import shutil +import socket +import platform +import json +import datetime +import glob +import re + +from optparse import OptionParser + +from os import environ as env +from os.path import join as pjoin + + +def sexe(cmd,ret_output=False,echo=False): + """ Helper for executing shell commands. """ + if echo: + print("[exe: {}]".format(cmd)) + if ret_output: + p = subprocess.Popen(cmd, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + out = p.communicate()[0] + out = out.decode('utf8') + return p.returncode,out + else: + return subprocess.call(cmd,shell=True) + + +def parse_args(): + "Parses args from command line" + parser = OptionParser() + parser.add_option("--install", + action="store_true", + dest="install", + default=False, + help="Install `package_name`, not just its dependencies.") + + # where to install + parser.add_option("--prefix", + dest="prefix", + default="uberenv_libs", + help="destination directory") + + # what compiler to use + parser.add_option("--spec", + dest="spec", + default=None, + help="spack compiler spec") + + # optional location of spack mirror + parser.add_option("--mirror", + dest="mirror", + default=None, + help="spack mirror directory") + + # flag to create mirror + parser.add_option("--create-mirror", + action="store_true", + dest="create_mirror", + default=False, + help="Create spack mirror") + + # optional location of spack upstream + parser.add_option("--upstream", + dest="upstream", + default=None, + help="add an external spack instance as upstream") + + # this option allows a user to explicitly to select a + # group of spack settings files (compilers.yaml , packages.yaml) + parser.add_option("--spack-config-dir", + dest="spack_config_dir", + default=None, + help="dir with spack settings files (compilers.yaml, packages.yaml, etc)") + + # overrides package_name + parser.add_option("--package-name", + dest="package_name", + default=None, + help="override the default package name") + + # controls after which package phase spack should stop + parser.add_option("--package-final-phase", + dest="package_final_phase", + default=None, + help="override the default phase after which spack should stop") + + # controls source_dir spack should use to build the package + parser.add_option("--package-source-dir", + dest="package_source_dir", + default=None, + help="override the default source dir spack should use") + + # a file that holds settings for a specific project + # using uberenv.py + parser.add_option("--project-json", + dest="project_json", + default=pjoin(uberenv_script_dir(),"project.json"), + help="uberenv project settings json file") + + # flag to use insecure curl + git + parser.add_option("-k", + action="store_true", + dest="ignore_ssl_errors", + default=False, + help="Ignore SSL Errors") + + # option to 
force a spack pull + parser.add_option("--pull", + action="store_true", + dest="spack_pull", + default=False, + help="Pull if spack repo already exists") + + # option to force for clean of packages specified to + # be cleaned in the project.json + parser.add_option("--clean", + action="store_true", + dest="spack_clean", + default=False, + help="Force uninstall of packages specified in project.json") + + # option to tell spack to run tests + parser.add_option("--run_tests", + action="store_true", + dest="run_tests", + default=False, + help="Invoke build tests during spack install") + + # option to init osx sdk env flags + parser.add_option("--macos-sdk-env-setup", + action="store_true", + dest="macos_sdk_env_setup", + default=False, + help="Set several env vars to select OSX SDK settings." + "This was necessary for older versions of macOS " + " but can cause issues with macOS versions >= 10.13. " + " so it is disabled by default.") + + + ############### + # parse args + ############### + opts, extras = parser.parse_args() + # we want a dict b/c the values could + # be passed without using optparse + opts = vars(opts) + if not opts["spack_config_dir"] is None: + opts["spack_config_dir"] = os.path.abspath(opts["spack_config_dir"]) + if not os.path.isdir(opts["spack_config_dir"]): + print("[ERROR: invalid spack config dir: {} ]".format(opts["spack_config_dir"])) + sys.exit(-1) + # if rel path is given for the mirror, we need to evaluate here -- before any + # chdirs to avoid confusion related to what it is relative to. + # (it should be relative to where uberenv is run from, so it matches what you expect + # from shell completion, etc) + if not opts["mirror"] is None: + if not opts["mirror"].startswith("http") and not os.path.isabs(opts["mirror"]): + opts["mirror"] = os.path.abspath(opts["mirror"]) + return opts, extras + + +def uberenv_script_dir(): + # returns the directory of the uberenv.py script + return os.path.dirname(os.path.abspath(__file__)) + +def load_json_file(json_file): + # reads json file + return json.load(open(json_file)) + +def is_darwin(): + return "darwin" in platform.system().lower() + +def is_windows(): + return "windows" in platform.system().lower() + +class UberEnv(): + """ Base class for package manager """ + + def __init__(self, opts, extra_opts): + self.opts = opts + self.extra_opts = extra_opts + + # load project settings + self.project_opts = load_json_file(opts["project_json"]) + print("[uberenv project settings: {}]".format(str(self.project_opts))) + print("[uberenv options: {}]".format(str(self.opts))) + + def setup_paths_and_dirs(self): + self.uberenv_path = os.path.dirname(os.path.realpath(__file__)) + + def set_from_args_or_json(self,setting): + try: + setting_value = self.project_opts[setting] + except (KeyError): + print("ERROR: {} must at least be defined in project.json".format(setting)) + raise + else: + if self.opts[setting]: + setting_value = self.opts[setting] + return setting_value + + def set_from_json(self,setting): + try: + setting_value = self.project_opts[setting] + except (KeyError): + print("ERROR: {} must at least be defined in project.json".format(setting)) + raise + return setting_value + + def detect_platform(self): + # find supported sets of compilers.yaml, packages,yaml + res = None + if is_darwin(): + res = "darwin" + elif "SYS_TYPE" in os.environ.keys(): + sys_type = os.environ["SYS_TYPE"].lower() + res = sys_type + return res + + +class SpackEnv(UberEnv): + """ Helper to clone spack and install libraries on MacOS an Linux """ + + def 
__init__(self, opts, extra_opts): + UberEnv.__init__(self,opts,extra_opts) + + self.pkg_name = self.set_from_args_or_json("package_name") + self.pkg_version = self.set_from_json("package_version") + self.pkg_final_phase = self.set_from_args_or_json("package_final_phase") + self.pkg_src_dir = self.set_from_args_or_json("package_source_dir") + + self.spec_hash = "" + self.use_install = False + + # Some additional setup for macos + if is_darwin(): + if opts["macos_sdk_env_setup"]: + # setup osx deployment target and sdk settings + setup_osx_sdk_env_vars() + else: + print("[skipping MACOSX env var setup]") + + # setup default spec + if opts["spec"] is None: + if is_darwin(): + opts["spec"] = "%clang" + else: + opts["spec"] = "%gcc" + self.opts["spec"] = "@{}{}".format(self.pkg_version,opts["spec"]) + elif not opts["spec"].startswith("@"): + self.opts["spec"] = "@{}{}".format(self.pkg_version,opts["spec"]) + else: + self.opts["spec"] = "{}".format(opts["spec"]) + + print("[spack spec: {}]".format(self.opts["spec"])) + + def setup_paths_and_dirs(self): + # get the current working path, and the glob used to identify the + # package files we want to hot-copy to spack + + UberEnv.setup_paths_and_dirs(self) + + self.pkgs = pjoin(self.uberenv_path, "packages","*") + + # setup destination paths + self.dest_dir = os.path.abspath(self.opts["prefix"]) + self.dest_spack = pjoin(self.dest_dir,"spack") + print("[installing to: {0}]".format(self.dest_dir)) + + # print a warning if the dest path already exists + if not os.path.isdir(self.dest_dir): + os.mkdir(self.dest_dir) + else: + print("[info: destination '{}' already exists]".format(self.dest_dir)) + + if os.path.isdir(self.dest_spack): + print("[info: destination '{}' already exists]".format(self.dest_spack)) + + self.pkg_src_dir = os.path.join(self.uberenv_path,self.pkg_src_dir) + if not os.path.isdir(self.pkg_src_dir): + print("[ERROR: package_source_dir '{}' does not exist]".format(self.pkg_src_dir)) + sys.exit(-1) + + + def find_spack_pkg_path_from_hash(self, pkg_name, pkg_hash): + res, out = sexe("spack/bin/spack find -p /{}".format(pkg_hash), ret_output = True) + for l in out.split("\n"): + if l.startswith(pkg_name): + return {"name": pkg_name, "path": l.split()[-1]} + print("[ERROR: failed to find package named '{}']".format(pkg_name)) + sys.exit(-1) + + def find_spack_pkg_path(self, pkg_name, spec = ""): + res, out = sexe("spack/bin/spack find -p " + pkg_name + spec,ret_output = True) + for l in out.split("\n"): + # TODO: at least print a warning when several choices exist. This will + # pick the first in the list. 
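+            # Each matching line of `spack find -p` looks roughly like
+            # (illustrative only, the path below is hypothetical):
+            #   raja@develop    /path/to/uberenv_libs/spack/opt/spack/...
+            # so the install prefix is taken as the last whitespace-separated
+            # token of the first line that starts with the package name.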
+ if l.startswith(pkg_name): + return {"name": pkg_name, "path": l.split()[-1]} + print("[ERROR: failed to find package named '{}']".format(pkg_name)) + sys.exit(-1) + + # Extract the first line of the full spec + def read_spack_full_spec(self,pkg_name,spec): + res, out = sexe("spack/bin/spack spec " + pkg_name + " " + spec, ret_output=True) + for l in out.split("\n"): + if l.startswith(pkg_name) and l.count("@") > 0 and l.count("arch=") > 0: + return l.strip() + + def clone_repo(self): + if not os.path.isdir(self.dest_spack): + + # compose clone command for the dest path, spack url and branch + print("[info: cloning spack develop branch from github]") + + os.chdir(self.dest_dir) + + clone_opts = ("-c http.sslVerify=false " + if self.opts["ignore_ssl_errors"] else "") + + spack_url = self.project_opts.get("spack_url", "https://github.com/spack/spack.git") + spack_branch = self.project_opts.get("spack_branch", "develop") + + clone_cmd = "git {0} clone --single-branch --depth=1 -b {1} {2}".format(clone_opts, spack_branch,spack_url) + sexe(clone_cmd, echo=True) + + if "spack_commit" in self.project_opts: + # optionally, check out a specific commit + os.chdir(pjoin(self.dest_dir,"spack")) + sha1 = self.project_opts["spack_commit"] + res, current_sha1 = sexe("git log -1 --pretty=%H", ret_output=True) + if sha1 != current_sha1: + print("[info: using spack commit {}]".format(sha1)) + sexe("git stash", echo=True) + sexe("git fetch --depth=1 origin {0}".format(sha1),echo=True) + sexe("git checkout {0}".format(sha1),echo=True) + + if self.opts["spack_pull"]: + # do a pull to make sure we have the latest + os.chdir(pjoin(self.dest_dir,"spack")) + sexe("git stash", echo=True) + sexe("git pull", echo=True) + + def config_dir(self): + """ path to compilers.yaml, which we will use for spack's compiler setup""" + spack_config_dir = self.opts["spack_config_dir"] + if spack_config_dir is None: + uberenv_plat = self.detect_platform() + if not uberenv_plat is None: + spack_config_dir = os.path.abspath(pjoin(self.uberenv_path,"spack_configs",uberenv_plat)) + return spack_config_dir + + + def disable_spack_config_scopes(self,spack_dir): + # disables all config scopes except "defaults", which we will + # force our settings into + spack_lib_config = pjoin(spack_dir,"lib","spack","spack","config.py") + print("[disabling config scope (except defaults) in: {}]".format(spack_lib_config)) + cfg_script = open(spack_lib_config).read() + for cfg_scope_stmt in ["('system', os.path.join(spack.paths.system_etc_path, 'spack')),", + "('site', os.path.join(spack.paths.etc_path, 'spack')),", + "('user', spack.paths.user_config_path)"]: + cfg_script = cfg_script.replace(cfg_scope_stmt, + "#DISABLED BY UBERENV: " + cfg_scope_stmt) + open(spack_lib_config,"w").write(cfg_script) + + + def patch(self): + + cfg_dir = self.config_dir() + spack_dir = self.dest_spack + + # force spack to use only "defaults" config scope + self.disable_spack_config_scopes(spack_dir) + spack_etc_defaults_dir = pjoin(spack_dir,"etc","spack","defaults") + + # copy in "defaults" config.yaml + config_yaml = os.path.abspath(pjoin(self.uberenv_path,"spack_configs","config.yaml")) + sexe("cp {} {}/".format(config_yaml, spack_etc_defaults_dir ), echo=True) + + # copy in other settings per platform + if not cfg_dir is None: + print("[copying uberenv compiler and packages settings from {0}]".format(cfg_dir)) + + config_yaml = pjoin(cfg_dir,"config.yaml") + compilers_yaml = pjoin(cfg_dir,"compilers.yaml") + packages_yaml = pjoin(cfg_dir,"packages.yaml") + + if 
os.path.isfile(config_yaml): + sexe("cp {} {}/".format(config_yaml , spack_etc_defaults_dir ), echo=True) + + if os.path.isfile(compilers_yaml): + sexe("cp {} {}/".format(compilers_yaml, spack_etc_defaults_dir ), echo=True) + + if os.path.isfile(packages_yaml): + sexe("cp {} {}/".format(packages_yaml, spack_etc_defaults_dir ), echo=True) + else: + # let spack try to auto find compilers + sexe("spack/bin/spack compiler find", echo=True) + + # hot-copy our packages into spack + if self.pkgs: + dest_spack_pkgs = pjoin(spack_dir,"var","spack","repos","builtin","packages") + print("[copying patched packages from {0}]".format(self.pkgs)) + sexe("cp -Rf {} {}".format(self.pkgs,dest_spack_pkgs)) + + + def clean_build(self): + # clean out any temporary spack build stages + cln_cmd = "spack/bin/spack clean " + res = sexe(cln_cmd, echo=True) + + # clean out any spack cached stuff + cln_cmd = "spack/bin/spack clean --all" + res = sexe(cln_cmd, echo=True) + + # check if we need to force uninstall of selected packages + if self.opts["spack_clean"]: + if self.project_opts.has_key("spack_clean_packages"): + for cln_pkg in self.project_opts["spack_clean_packages"]: + if not self.find_spack_pkg_path(cln_pkg) is None: + unist_cmd = "spack/bin/spack uninstall -f -y --all --dependents " + cln_pkg + res = sexe(unist_cmd, echo=True) + + def show_info(self): + # prints install status and 32 characters hash + options="--install-status --very-long" + spec_cmd = "spack/bin/spack spec {0} {1}{2}".format(options,self.pkg_name,self.opts["spec"]) + + res, out = sexe(spec_cmd, ret_output=True, echo=True) + print(out) + + #Check if spec is already installed + for line in out.split("\n"): + # Example of matching line: ("status" "hash" "package"...) + # [+] hf3cubkgl74ryc3qwen73kl4yfh2ijgd serac@develop%clang@10.0.0-apple~debug~devtools~glvis arch=darwin-mojave-x86_64 + if re.match(r"^(\[\+\]| - ) [a-z0-9]{32} " + re.escape(self.pkg_name), line): + self.spec_hash = line.split(" ")[1] + # if spec already installed + if line.startswith("[+]"): + pkg_path = self.find_spack_pkg_path_from_hash(self.pkg_name,self.spec_hash) + install_path = pkg_path["path"] + # testing that the path exists is mandatory until Spack team fixes + # https://github.com/spack/spack/issues/16329 + if os.path.isdir(install_path): + print("[Warning: {} {} has already been installed in {}]".format(self.pkg_name, self.opts["spec"],install_path)) + print("[Warning: Uberenv will proceed using this directory]".format(self.pkg_name)) + self.use_install = True + + return res + + def install(self): + # use the uberenv package to trigger the right builds + # and build an host-config.cmake file + + if not self.use_install: + install_cmd = "spack/bin/spack " + if self.opts["ignore_ssl_errors"]: + install_cmd += "-k " + if not self.opts["install"]: + install_cmd += "dev-build --quiet -d {} -u {} ".format(self.pkg_src_dir,self.pkg_final_phase) + else: + install_cmd += "install " + if self.opts["run_tests"]: + install_cmd += "--test=root " + install_cmd += self.pkg_name + self.opts["spec"] + res = sexe(install_cmd, echo=True) + + if res != 0: + print("[ERROR: failure of spack install/dev-build]") + return res + + full_spec = self.read_spack_full_spec(self.pkg_name,self.opts["spec"]) + if "spack_activate" in self.project_opts: + print("[activating dependent packages]") + # get the full spack spec for our project + pkg_names = self.project_opts["spack_activate"].keys() + for pkg_name in pkg_names: + pkg_spec_requirements = self.project_opts["spack_activate"][pkg_name] + 
activate=True + for req in pkg_spec_requirements: + if req not in full_spec: + activate=False + break + if activate: + activate_cmd = "spack/bin/spack activate " + pkg_name + sexe(activate_cmd, echo=True) + # note: this assumes package extends python when +python + # this may fail in general cases + if self.opts["install"] and "+python" in full_spec: + activate_cmd = "spack/bin/spack activate /" + self.spec_hash + sexe(activate_cmd, echo=True) + # if user opt'd for an install, we want to symlink the final + # install to an easy place: + if self.opts["install"] or self.use_install: + pkg_path = self.find_spack_pkg_path_from_hash(self.pkg_name, self.spec_hash) + if self.pkg_name != pkg_path["name"]: + print("[ERROR: Could not find install of {}]".format(self.pkg_name)) + return -1 + else: + # Symlink host-config file + hc_glob = glob.glob(pjoin(pkg_path["path"],"*.cmake")) + if len(hc_glob) > 0: + hc_path = hc_glob[0] + hc_fname = os.path.split(hc_path)[1] + if os.path.islink(hc_fname): + os.unlink(hc_fname) + elif os.path.isfile(hc_fname): + sexe("rm -f {}".format(hc_fname)) + print("[symlinking host config file to {}]".format(pjoin(self.dest_dir,hc_fname))) + os.symlink(hc_path,hc_fname) + + # Symlink install directory + if self.opts["install"]: + pkg_lnk_dir = "{}-install".format(self.pkg_name) + if os.path.islink(pkg_lnk_dir): + os.unlink(pkg_lnk_dir) + print("") + print("[symlinking install to {}]".format(pjoin(self.dest_dir,pkg_lnk_dir))) + os.symlink(pkg_path["path"],os.path.abspath(pkg_lnk_dir)) + print("") + print("[install complete!]") + # otherwise we are in the "only dependencies" case and the host-config + # file has to be copied from the to-be-deleted spack-build dir. + else: + pattern = "*{}.cmake".format(self.pkg_name) + build_dir = pjoin(self.pkg_src_dir,"spack-build") + hc_glob = glob.glob(pjoin(build_dir,pattern)) + if len(hc_glob) > 0: + hc_path = hc_glob[0] + hc_fname = os.path.split(hc_path)[1] + if os.path.islink(hc_fname): + os.unlink(hc_fname) + print("[copying host config file to {}]".format(pjoin(self.dest_dir,hc_fname))) + sexe("cp {} {}".format(hc_path,hc_fname)) + print("[removing project build directory {}]".format(pjoin(build_dir))) + sexe("rm -rf {}".format(build_dir)) + + def get_mirror_path(self): + mirror_path = self.opts["mirror"] + if not mirror_path: + print("[--create-mirror requires a mirror directory]") + sys.exit(-1) + return mirror_path + + def create_mirror(self): + """ + Creates a spack mirror for pkg_name at mirror_path. + """ + + mirror_path = self.get_mirror_path() + + mirror_cmd = "spack/bin/spack " + if self.opts["ignore_ssl_errors"]: + mirror_cmd += "-k " + mirror_cmd += "mirror create -d {} --dependencies {}{}".format(mirror_path, + self.pkg_name, + self.opts["spec"]) + return sexe(mirror_cmd, echo=True) + + def find_spack_mirror(self, mirror_name): + """ + Returns the path of a defaults scoped spack mirror with the + given name, or None if no mirror exists. + """ + res, out = sexe("spack/bin/spack mirror list", ret_output=True) + mirror_path = None + for mirror in out.split('\n'): + if mirror: + parts = mirror.split() + if parts[0] == mirror_name: + mirror_path = parts[1] + return mirror_path + + def use_mirror(self): + """ + Configures spack to use a mirror at a given path.
+ """ + mirror_name = self.pkg_name + mirror_path = self.get_mirror_path() + existing_mirror_path = self.find_spack_mirror(mirror_name) + + if existing_mirror_path and mirror_path != existing_mirror_path: + # Existing mirror has a different URL; remove it and re-add below + print("[removing existing spack mirror `{}` @ {}]".format(mirror_name, + existing_mirror_path)) + # + # Note: In this case, spack says it removes the mirror, but we still + # get errors when we try to add a new one, sounds like a bug + # + sexe("spack/bin/spack mirror remove --scope=defaults {} ".format(mirror_name), + echo=True) + existing_mirror_path = None + if not existing_mirror_path: + # Add if not already there + sexe("spack/bin/spack mirror add --scope=defaults {} {}".format( + mirror_name, mirror_path), echo=True) + print("[using mirror {}]".format(mirror_path)) + + def find_spack_upstream(self, upstream_name): + """ + Returns the path of a defaults scoped spack upstream with the + given name, or None if no upstream exists. + """ + upstream_path = None + + res, out = sexe('spack/bin/spack config get upstreams', ret_output=True) + if out and ("upstreams:" in out): + out = out.replace(' ', '') + out = out.replace('install_tree:', '') + out = out.replace(':', '') + out = out.splitlines() + out = out[1:] + upstreams = dict(zip(out[::2], out[1::2])) + + for name in upstreams.keys(): + if name == upstream_name: + upstream_path = upstreams[name] + + return upstream_path + + def use_spack_upstream(self): + """ + Configures spack to use an upstream at a given path. + """ + upstream_path = self.opts["upstream"] + if not upstream_path: + print("[--create-upstream requires an upstream directory]") + sys.exit(-1) + upstream_path = os.path.abspath(upstream_path) + upstream_name = self.pkg_name + existing_upstream_path = self.find_spack_upstream(upstream_name) + if (not existing_upstream_path) or (upstream_path != os.path.abspath(existing_upstream_path)): + # Existing upstream is missing or has a different path; rewrite the config file + print("[removing existing spack upstream configuration file]") + sexe("rm spack/etc/spack/defaults/upstreams.yaml") + with open('spack/etc/spack/defaults/upstreams.yaml','w+') as upstreams_cfg_file: + upstreams_cfg_file.write("upstreams:\n") + upstreams_cfg_file.write(" {}:\n".format(upstream_name)) + upstreams_cfg_file.write(" install_tree: {}\n".format(upstream_path)) + + +def find_osx_sdks(): + """ + Finds installed osx sdks, returns dict mapping version to file system path + """ + res = {} + sdks = glob.glob("/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX*.sdk") + for sdk in sdks: + sdk_base = os.path.split(sdk)[1] + ver = sdk_base[len("MacOSX"):sdk_base.rfind(".")] + res[ver] = sdk + return res + +def setup_osx_sdk_env_vars(): + """ + Sets MACOSX_DEPLOYMENT_TARGET and SDKROOT env vars based on the installed osx sdks + """ + # find current osx version (10.11.6) + dep_tgt = platform.mac_ver()[0] + # sdk file names use short version (ex: 10.11) + dep_tgt_short = dep_tgt[:dep_tgt.rfind(".")] + # find installed sdks, ideally we want the sdk that matches the current os + sdk_root = None + sdks = find_osx_sdks() + if dep_tgt_short in sdks.keys(): + # matches our osx, use this one + sdk_root = sdks[dep_tgt_short] + elif len(sdks) > 0: + # for now, choose first one: + dep_tgt = list(sdks.keys())[0] + sdk_root = sdks[dep_tgt] + else: + # no valid sdks, error out + print("[ERROR: Could not find OSX SDK @ /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/]") + sys.exit(-1) + +
env["MACOSX_DEPLOYMENT_TARGET"] = dep_tgt + env["SDKROOT"] = sdk_root + print("[setting MACOSX_DEPLOYMENT_TARGET to {}]".format(env["MACOSX_DEPLOYMENT_TARGET"])) + print("[setting SDKROOT to {}]".format(env["SDKROOT"])) + + + +def main(): + """ + Clones and runs a package manager to set up third-party libs. + Also creates a host-config.cmake file that can be used by our project. + """ + + # parse args from command line + opts, extra_opts = parse_args() + + # Initialize the environment + env = SpackEnv(opts, extra_opts) + + # Setup the necessary paths and directories + env.setup_paths_and_dirs() + + # Clone the package manager + env.clone_repo() + + os.chdir(env.dest_dir) + + # Patch the package manager, as necessary + env.patch() + + # Clean the build + env.clean_build() + + # Show the spec for what will be built + env.show_info() + + + ########################################################## + # we now have an instance of spack configured how we + # need it to build our tpls. At this point there are two + # possible next steps: + # + # *) create a mirror of the packages + # OR + # *) build + # + ########################################################## + if opts["create_mirror"]: + return env.create_mirror() + else: + if opts["mirror"] is not None: + env.use_mirror() + + if opts["upstream"] is not None: + env.use_spack_upstream() + + res = env.install() + + return res + +if __name__ == "__main__": + sys.exit(main()) + + From 58154b1a5eee038b323a62ba69bd5bbdc7a5ba63 Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Mon, 19 Jul 2021 14:57:25 -0700 Subject: [PATCH 026/392] adding a PR template note --- .github/PULL_REQUEST_TEMPLATE.md | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 .github/PULL_REQUEST_TEMPLATE.md diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 000000000..d864de65f --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,10 @@ +# Summary (Write a short headline summary of PR) + +- This PR is a (refactoring, bugfix, feature, something else) +- It does the following (modify list as needed): + - Modifies/refactors (class or method) (how?) + - Fixes (issue number(s)) + - Adds (specific feature) at the request of (project or person) + +*IMPORTANT NOTE! Remember to comment "LGTM" after pushing a commit to trigger the GitLab CI.
+Otherwise the CI will not run and the PR will never pass all the required tests!* From a14974593fef46ad51ae9d6117f4187a2bbd7398 Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Mon, 19 Jul 2021 15:44:03 -0700 Subject: [PATCH 027/392] making radiuss and uberenv submodules, adding spack_packages --- .gitmodules | 5 +- scripts/radiuss-spack-configs | 1 + scripts/radiuss-spack-configs/COPYRIGHT | 16 - scripts/radiuss-spack-configs/LICENSE | 21 - scripts/radiuss-spack-configs/NOTICE | 21 - scripts/radiuss-spack-configs/README.md | 47 - .../blueos_3_ppc64le_ib/compilers.yaml | 209 ----- .../blueos_3_ppc64le_ib/packages.yaml | 55 -- .../blueos_3_ppc64le_ib_p9 | 1 - scripts/radiuss-spack-configs/config.yaml | 80 -- .../darwin/compilers.yaml | 65 -- .../darwin/packages.yaml | 25 - .../toss_3_x86_64_ib/compilers.yaml | 290 ------- .../toss_3_x86_64_ib/packages.yaml | 90 -- .../raja_perf}/package.py | 23 +- scripts/uberenv | 1 + scripts/uberenv/LICENSE | 64 -- scripts/uberenv/Makefile | 6 - scripts/uberenv/README.md | 19 - scripts/uberenv/docs/sphinx/conf.py | 324 ------ scripts/uberenv/docs/sphinx/index.rst | 194 ----- scripts/uberenv/gen_spack_env_script.py | 128 --- scripts/uberenv/packages/chai/package.py | 243 ------ scripts/uberenv/packages/hip/package.py | 54 -- scripts/uberenv/project.json | 10 - scripts/uberenv/spack_configs | 1 - scripts/uberenv/uberenv.py | 800 ------------------ 27 files changed, 25 insertions(+), 2768 deletions(-) create mode 160000 scripts/radiuss-spack-configs delete mode 100644 scripts/radiuss-spack-configs/COPYRIGHT delete mode 100644 scripts/radiuss-spack-configs/LICENSE delete mode 100644 scripts/radiuss-spack-configs/NOTICE delete mode 100644 scripts/radiuss-spack-configs/README.md delete mode 100644 scripts/radiuss-spack-configs/blueos_3_ppc64le_ib/compilers.yaml delete mode 100644 scripts/radiuss-spack-configs/blueos_3_ppc64le_ib/packages.yaml delete mode 120000 scripts/radiuss-spack-configs/blueos_3_ppc64le_ib_p9 delete mode 100644 scripts/radiuss-spack-configs/config.yaml delete mode 100644 scripts/radiuss-spack-configs/darwin/compilers.yaml delete mode 100644 scripts/radiuss-spack-configs/darwin/packages.yaml delete mode 100644 scripts/radiuss-spack-configs/toss_3_x86_64_ib/compilers.yaml delete mode 100644 scripts/radiuss-spack-configs/toss_3_x86_64_ib/packages.yaml rename scripts/{uberenv/packages/raja_perfsuite => spack_packages/raja_perf}/package.py (93%) create mode 160000 scripts/uberenv delete mode 100644 scripts/uberenv/LICENSE delete mode 100644 scripts/uberenv/Makefile delete mode 100644 scripts/uberenv/README.md delete mode 100644 scripts/uberenv/docs/sphinx/conf.py delete mode 100644 scripts/uberenv/docs/sphinx/index.rst delete mode 100644 scripts/uberenv/gen_spack_env_script.py delete mode 100644 scripts/uberenv/packages/chai/package.py delete mode 100644 scripts/uberenv/packages/hip/package.py delete mode 100644 scripts/uberenv/project.json delete mode 120000 scripts/uberenv/spack_configs delete mode 100755 scripts/uberenv/uberenv.py diff --git a/.gitmodules b/.gitmodules index babe9cd39..047c42160 100644 --- a/.gitmodules +++ b/.gitmodules @@ -6,4 +6,7 @@ url = https://github.com/LLNL/RAJA.git [submodule "scripts/radiuss-spack-configs"] path = scripts/radiuss-spack-configs - url = https://github.com/LLNL/radiuss-spack-configs + url = https://github.com/LLNL/radiuss-spack-configs.git [submodule "scripts/uberenv"] + path = scripts/uberenv + url = https://github.com/LLNL/uberenv.git diff --git a/scripts/radiuss-spack-configs
b/scripts/radiuss-spack-configs new file mode 160000 index 000000000..292b30f98 --- /dev/null +++ b/scripts/radiuss-spack-configs @@ -0,0 +1 @@ +Subproject commit 292b30f981d325bbbba069d552bf4febdfdce938 diff --git a/scripts/radiuss-spack-configs/COPYRIGHT b/scripts/radiuss-spack-configs/COPYRIGHT deleted file mode 100644 index 627879f05..000000000 --- a/scripts/radiuss-spack-configs/COPYRIGHT +++ /dev/null @@ -1,16 +0,0 @@ -Intellectual Property Notice ------------------------------- - -RADIUSS Spack Config is licensed under the MIT license (LICENSE). - -Copyrights and patents in the RADIUSS Spack Config project are retained by -contributors. No copyright assignment is required to contribute to RADIUSS -Spack Config. - - -SPDX usage ------------- - -Individual files contain SPDX tags instead of the full license text. -This enables machine processing of license information based on the SPDX -License Identifiers that are available here: https://spdx.org/licenses/ diff --git a/scripts/radiuss-spack-configs/LICENSE b/scripts/radiuss-spack-configs/LICENSE deleted file mode 100644 index 3af858be7..000000000 --- a/scripts/radiuss-spack-configs/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2018, Lawrence Livermore National Security, LLC - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/scripts/radiuss-spack-configs/NOTICE b/scripts/radiuss-spack-configs/NOTICE deleted file mode 100644 index 3737d5a86..000000000 --- a/scripts/radiuss-spack-configs/NOTICE +++ /dev/null @@ -1,21 +0,0 @@ -This work was produced under the auspices of the U.S. Department of -Energy by Lawrence Livermore National Laboratory under Contract -DE-AC52-07NA27344. - -This work was prepared as an account of work sponsored by an agency of -the United States Government. Neither the United States Government nor -Lawrence Livermore National Security, LLC, nor any of their employees -makes any warranty, expressed or implied, or assumes any legal liability -or responsibility for the accuracy, completeness, or usefulness of any -information, apparatus, product, or process disclosed, or represents that -its use would not infringe privately owned rights. - -Reference herein to any specific commercial product, process, or service -by trade name, trademark, manufacturer, or otherwise does not necessarily -constitute or imply its endorsement, recommendation, or favoring by the -United States Government or Lawrence Livermore National Security, LLC. 
- -The views and opinions of authors expressed herein do not necessarily -state or reflect those of the United States Government or Lawrence -Livermore National Security, LLC, and shall not be used for advertising -or product endorsement purposes. diff --git a/scripts/radiuss-spack-configs/README.md b/scripts/radiuss-spack-configs/README.md deleted file mode 100644 index 82df80706..000000000 --- a/scripts/radiuss-spack-configs/README.md +++ /dev/null @@ -1,47 +0,0 @@ -# RADIUSS Spack Config - -The RADIUSS project promotes and supports key High Performance Computing (HPC) open-source software developed at the LLNL. These tools and libraries cover a wide range of features a team would need to develop a modern simulation code targeting HPC plaftorms. - -Radiuss Spack Config allows project to share a set of compilers and packages configurations for several machines. - -## Getting Started - -This project may be used as a submodule. - -### Installing - -This project requires no installation. - -## Contributing - -Please read [CONTRIBUTING.md](https://github.com/LLNL/radiuss-ci/CONTRIBUTING.md) for details on our code of conduct, and the process for submitting pull requests to us. - -## Versioning - -version: 1.0.0 - -TODO: Not even sure how to handle versioning here. - -## Authors - -Adrien M Bernede - -See also the list of [contributors](https://github.com/LLNL/radiuss-ci/contributors) who participated in this project. - -## License - -This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details - -All new contributions must be made under the MIT License. - -See [LICENSE](https://github.com/LLNL/radiuss-ci/blob/master/LICENSE), -[COPYRIGHT](https://github.com/LLNL/radiuss-ci/blob/master/COPYRIGHT), and -[NOTICE](https://github.com/LLNL/radiuss-ci/blob/master/NOTICE) for details. 
- -SPDX-License-Identifier: (MIT) - -LLNL-CODE-793462 - -## Acknowledgments - - diff --git a/scripts/radiuss-spack-configs/blueos_3_ppc64le_ib/compilers.yaml b/scripts/radiuss-spack-configs/blueos_3_ppc64le_ib/compilers.yaml deleted file mode 100644 index 7b8a2a4e1..000000000 --- a/scripts/radiuss-spack-configs/blueos_3_ppc64le_ib/compilers.yaml +++ /dev/null @@ -1,209 +0,0 @@ -compilers: -- compiler: - spec: clang@3.9.1 - paths: - cc: /usr/tcetmp/packages/clang/clang-3.9.1/bin/clang - cxx: /usr/tcetmp/packages/clang/clang-3.9.1/bin/clang++ - f77: /usr/tce/packages/xl/xl-beta-2019.06.20/bin/xlf2003_r - fc: /usr/tce/packages/xl/xl-beta-2019.06.20/bin/xlf2003_r - flags: {} - operating_system: rhel7 - target: ppc64le - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: clang@4.0.0 - paths: - cc: /usr/tcetmp/packages/clang/clang-4.0.0/bin/clang - cxx: /usr/tcetmp/packages/clang/clang-4.0.0/bin/clang++ - f77: /usr/tce/packages/xl/xl-beta-2019.06.20/bin/xlf2003_r - fc: /usr/tce/packages/xl/xl-beta-2019.06.20/bin/xlf2003_r - flags: {} - operating_system: rhel7 - target: ppc64le - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: clang@9.0.0 - paths: - cc: /usr/tce/packages/clang/clang-9.0.0/bin/clang - cxx: /usr/tce/packages/clang/clang-9.0.0/bin/clang++ - f77: /usr/tce/packages/xl/xl-beta-2019.06.20/bin/xlf2003_r - fc: /usr/tce/packages/xl/xl-beta-2019.06.20/bin/xlf2003_r - flags: {} - operating_system: rhel7 - target: ppc64le - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: clang@9.0.0ibm - paths: - cc: /usr/tce/packages/clang/clang-ibm-2019.10.03/bin/clang - cxx: /usr/tce/packages/clang/clang-ibm-2019.10.03/bin/clang++ - fc: /usr/tce/packages/xl/xl-2020.03.18/bin/xlf2003_r - f77: /usr/tce/packages/xl/xl-2020.03.18/bin/xlf_r - flags: {} - operating_system: rhel7 - target: ppc64le - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: clang@10.0.1ibm - paths: - cc: /usr/tce/packages/clang/clang-ibm-10.0.1/bin/clang - cxx: /usr/tce/packages/clang/clang-ibm-10.0.1/bin/clang++ - fc: /usr/tce/packages/xl/xl-2020.09.17/bin/xlf2003_r - f77: /usr/tce/packages/xl/xl-2020.09.17/bin/xlf_r - flags: {} - operating_system: rhel7 - target: ppc64le - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: clang@coral2018.08.08 - paths: - cc: /usr/tce/packages/clang/clang-coral-2018.08.08/bin/clang - cxx: /usr/tce/packages/clang/clang-coral-2018.08.08/bin/clang++ - f77: /usr/tce/packages/xl/xl-beta-2019.06.20/bin/xlf2003_r - fc: /usr/tce/packages/xl/xl-beta-2019.06.20/bin/xlf2003_r - flags: {} - operating_system: rhel7 - target: ppc64le - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: clang@default - paths: - cc: clang - cxx: clang++ - f77: /usr/tce/packages/xl/xl-beta-2019.06.20/bin/xlf2003_r - fc: /usr/tce/packages/xl/xl-beta-2019.06.20/bin/xlf2003_r - flags: {} - operating_system: rhel7 - target: ppc64le - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: gcc@8.3.1 - paths: - cc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gcc - cxx: /usr/tce/packages/gcc/gcc-8.3.1/bin/g++ - f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran - flags: {} - operating_system: rhel7 - target: ppc64le - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: gcc@4.9.3 - paths: - cc: /usr/tce/packages/gcc/gcc-4.9.3/bin/gcc - cxx: /usr/tce/packages/gcc/gcc-4.9.3/bin/g++ - f77: /usr/tce/packages/gcc/gcc-4.9.3/bin/gfortran - fc: 
/usr/tce/packages/gcc/gcc-4.9.3/bin/gfortran - flags: {} - operating_system: rhel7 - target: ppc64le - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: gcc@default - paths: - cc: gcc - cxx: g++ - f77: gfortran - fc: gfortran - flags: {} - operating_system: rhel7 - target: ppc64le - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: xl@default - paths: - cc: xlc - cxx: xlc++ - f77: xlf2003 - fc: xlf2003 - flags: {} - operating_system: rhel7 - target: ppc64le - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: xl@beta2019.06.20 - paths: - cc: /usr/tce/packages/xl/xl-beta-2019.06.20/bin/xlc - cxx: /usr/tce/packages/xl/xl-beta-2019.06.20/bin/xlc++ - f77: /usr/tce/packages/xl/xl-beta-2019.06.20/bin/xlf2003_r - fc: /usr/tce/packages/xl/xl-beta-2019.06.20/bin/xlf2003_r - flags: {} - operating_system: rhel7 - target: ppc64le - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: xl@16.1.1.7 - paths: - cc: /usr/tce/packages/xl/xl-2020.03.18/bin/xlc_r - cxx: /usr/tce/packages/xl/xl-2020.03.18/bin/xlC_r - fc: /usr/tce/packages/xl/xl-2020.03.18/bin/xlf2003_r - f77: /usr/tce/packages/xl/xl-2020.03.18/bin/xlf_r - flags: {} - operating_system: rhel7 - target: ppc64le - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: pgi@default - paths: - cc: pgcc - cxx: pgc++ - f77: pgfortran - fc: pgfortran - flags: {} - operating_system: rhel7 - target: ppc64le - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: pgi@19.10 - paths: - cc: /usr/tce/packages/pgi/pgi-19.10/bin/pgcc - cxx: /usr/tce/packages/pgi/pgi-19.10/bin/pgc++ - f77: /usr/tce/packages/pgi/pgi-19.10/bin/pgfortran - fc: /usr/tce/packages/pgi/pgi-19.10/bin/pgfortran - flags: {} - operating_system: rhel7 - target: ppc64le - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: pgi@20.4 - paths: - cc: /usr/tce/packages/pgi/pgi-20.4/bin/pgcc - cxx: /usr/tce/packages/pgi/pgi-20.4/bin/pgc++ - f77: /usr/tce/packages/pgi/pgi-20.4/bin/pgfortran - fc: /usr/tce/packages/pgi/pgi-20.4/bin/pgf90 - flags: {} - operating_system: rhel7 - target: ppc64le - modules: [] - environment: {} - extra_rpaths: [] diff --git a/scripts/radiuss-spack-configs/blueos_3_ppc64le_ib/packages.yaml b/scripts/radiuss-spack-configs/blueos_3_ppc64le_ib/packages.yaml deleted file mode 100644 index 1fe54dcfd..000000000 --- a/scripts/radiuss-spack-configs/blueos_3_ppc64le_ib/packages.yaml +++ /dev/null @@ -1,55 +0,0 @@ -packages: - all: - # This defaults us to machine specific flags of ivybridge which allows - # us to run on broadwell as well - target: [ppc64le] - compiler: [gcc, pgi, clang, xl] - cmake: - version: [3.18.0, 3.14.5] - buildable: false - externals: - - spec: cmake@3.14.5 - prefix: /usr/tce/packages/cmake/cmake-3.14.5 - - spec: cmake@3.18.0 - prefix: /usr/tce/packages/cmake/cmake-3.18.0 - cuda: - version: [11.0.2, 10.1.243, 10.1.168, 9.2.148, 8.0] - buildable: false - externals: - - spec: cuda@11.0.2 - prefix: /usr/tce/packages/cuda/cuda-11.0.2 - - spec: cuda@10.1.243 - prefix: /usr/tce/packages/cuda/cuda-10.1.243 - - spec: cuda@10.1.168 - prefix: /usr/tce/packages/cuda/cuda-10.1.168 - - spec: cuda@9.2.148 - prefix: /usr/tce/packages/cuda/cuda-9.2.148 - - spec: cuda@8.0 - prefix: /usr/tce/packages/cuda/cuda-8.0 - spectrum-mpi: - externals: - - spec: spectrum-mpi@10.3.1.03rtm0%pgi@19.10 - prefix: /usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-pgi-19.10 - - spec: spectrum-mpi@10.3.1.03rtm0%pgi@20.4 - prefix: 
/usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-pgi-20.4 - - spec: spectrum-mpi@10.3.1.03rtm0%gcc@8.3.1 - prefix: /usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-gcc-8.3.1 - - spec: spectrum-mpi@10.3.1.03rtm0%gcc@4.9.3 - prefix: /usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-gcc-4.9.3 - - spec: spectrum-mpi@10.3.1.03rtm0%clang@9.0.0 - prefix: /usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-clang-9.0.0 - - spec: spectrum-mpi@10.3.1.03rtm0%clang@9.0.0ibm - prefix: /usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-clang-ibm-2019.10.03 - - spec: spectrum-mpi@10.3.1.03rtm0%clang@10.0.1ibm - prefix: /usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-clang-ibm-10.0.1 - - spec: spectrum-mpi@10.3.1.03rtm0%xl@16.1.1.7 - prefix: /usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-xl-2020.03.18 - - spec: spectrum-mpi@10.3.1.03rtm0%xl@beta2019.06.20 - prefix: /usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-xl-beta-2019.06.20 - buildable: false - python: - buildable: false - version: [3.8.2] - externals: - - spec: python@3.8.2 - prefix: /usr/tce/packages/python/python-3.8.2 diff --git a/scripts/radiuss-spack-configs/blueos_3_ppc64le_ib_p9 b/scripts/radiuss-spack-configs/blueos_3_ppc64le_ib_p9 deleted file mode 120000 index f06fef9d5..000000000 --- a/scripts/radiuss-spack-configs/blueos_3_ppc64le_ib_p9 +++ /dev/null @@ -1 +0,0 @@ -blueos_3_ppc64le_ib \ No newline at end of file diff --git a/scripts/radiuss-spack-configs/config.yaml b/scripts/radiuss-spack-configs/config.yaml deleted file mode 100644 index 2095112ff..000000000 --- a/scripts/radiuss-spack-configs/config.yaml +++ /dev/null @@ -1,80 +0,0 @@ -# ------------------------------------------------------------------------- -# This is the default spack configuration file. -# -# Settings here are versioned with Spack and are intended to provide -# sensible defaults out of the box. Spack maintainers should edit this -# file to keep it current. -# -# Users can override these settings by editing the following files. -# -# Per-spack-instance settings (overrides defaults): -# $SPACK_ROOT/etc/spack/config.yaml -# -# Per-user settings (overrides default and site settings): -# ~/.spack/config.yaml -# ------------------------------------------------------------------------- -config: - # This is the path to the root of the Spack install tree. - # You can use $spack here to refer to the root of the spack instance. - install_tree: $spack/.. - - # install directory layout - install_path_scheme: "${COMPILERNAME}-${COMPILERVER}/${PACKAGE}-${VERSION}" - -# Locations where templates should be found - template_dirs: - - $spack/templates - - # Locations where different types of modules should be installed. - module_roots: - tcl: $spack/share/spack/modules - lmod: $spack/share/spack/lmod - - - # Temporary locations Spack can try to use for builds. - # - # Spack will use the first one it finds that exists and is writable. - # You can use $tempdir to refer to the system default temp directory - # (as returned by tempfile.gettempdir()). - # - # A value of $spack/var/spack/stage indicates that Spack should run - # builds directly inside its install directory without staging them in - # temporary space. - # - # The build stage can be purged with `spack purge --stage`. - build_stage: - # skipping tempdir b/c running mpi tests fails with local fs - # - $tempdir - - $spack/../builds - - - # Cache directory already downloaded source tarballs and archived - # repositories. 
This can be purged with `spack purge --downloads`. - source_cache: $spack/var/spack/cache - - - # Cache directory for miscellaneous files, like the package index. - # This can be purged with `spack purge --misc-cache` - misc_cache: .spack/misccache - - - # If this is false, tools like curl that use SSL will not verify - # certifiates. (e.g., curl will use use the -k option) - verify_ssl: true - - - # If set to true, Spack will always check checksums after downloading - # archives. If false, Spack skips the checksum step. - checksum: true - - - # If set to true, `spack install` and friends will NOT clean - # potentially harmful variables from the build environment. Use wisely. - dirty: false - - - # The default number of jobs to use when running `make` in parallel. - # If set to 4, for example, `spack install` will run `make -j4`. - # If not set, all available cores are used by default. - # for uberenv, limit build_jobs to 8 - build_jobs: 8 diff --git a/scripts/radiuss-spack-configs/darwin/compilers.yaml b/scripts/radiuss-spack-configs/darwin/compilers.yaml deleted file mode 100644 index ed5cbf020..000000000 --- a/scripts/radiuss-spack-configs/darwin/compilers.yaml +++ /dev/null @@ -1,65 +0,0 @@ -compilers: -- compiler: - environment: {} - extra_rpaths: [] - flags: {} - modules: [] - operating_system: elcapitan - paths: - cc: /usr/bin/clang - cxx: /usr/bin/clang++ - f77: /usr/local/bin/gfortran - fc: /usr/local/bin/gfortran - spec: clang@7.3.0-apple -- compiler: - environment: {} - extra_rpaths: [] - flags: {} - modules: [] - operating_system: sierra - paths: - cc: /usr/bin/clang - cxx: /usr/bin/clang++ - f77: /usr/local/bin/gfortran - fc: /usr/local/bin/gfortran - spec: clang@8.0.0-apple - target: x86_64 -- compiler: - environment: {} - extra_rpaths: [] - flags: {} - modules: [] - operating_system: highsierra - paths: - cc: /usr/bin/clang - cxx: /usr/bin/clang++ - f77: /usr/local/bin/gfortran - fc: /usr/local/bin/gfortran - spec: clang@9.0.0-apple - target: x86_64 -- compiler: - environment: {} - extra_rpaths: [] - flags: {} - modules: [] - operating_system: mojave - paths: - cc: /usr/bin/clang - cxx: /usr/bin/clang++ - f77: /usr/local/bin/gfortran - fc: /usr/local/bin/gfortran - spec: clang@10.0.0-apple - target: x86_64 -- compiler: - environment: {} - extra_rpaths: [] - flags: {} - modules: [] - operating_system: mojave - paths: - cc: /usr/local/opt/llvm/bin/clang - cxx: /usr/local/opt/llvm/bin/clang++ - f77: /usr/local/bin/gfortran - fc: /usr/local/bin/gfortran - spec: clang@10.0.0 - target: x86_64 diff --git a/scripts/radiuss-spack-configs/darwin/packages.yaml b/scripts/radiuss-spack-configs/darwin/packages.yaml deleted file mode 100644 index 6e965957c..000000000 --- a/scripts/radiuss-spack-configs/darwin/packages.yaml +++ /dev/null @@ -1,25 +0,0 @@ - -# ------------------------------------------------------------------------- -# This file controls default concretization preferences for Spack. -# -# Settings here are versioned with Spack and are intended to provide -# sensible defaults out of the box. Spack maintainers should edit this -# file to keep it current. -# -# Users can override these settings by editing the following files. 
-# -# Per-spack-instance settings (overrides defaults): -# $SPACK_ROOT/etc/spack/packages.yaml -# -# Per-user settings (overrides default and site settings): -# ~/.spack/packages.yaml -# ------------------------------------------------------------------------- -packages: - all: - compiler: [clang] - -# cmake: -# version: [3.17.2] -# paths: -# cmake@3.17.2: /usr/local/Cellar/cmake/3.17.2 -# buildable: false diff --git a/scripts/radiuss-spack-configs/toss_3_x86_64_ib/compilers.yaml b/scripts/radiuss-spack-configs/toss_3_x86_64_ib/compilers.yaml deleted file mode 100644 index e4edc3329..000000000 --- a/scripts/radiuss-spack-configs/toss_3_x86_64_ib/compilers.yaml +++ /dev/null @@ -1,290 +0,0 @@ -compilers: -- compiler: - spec: clang@3.9.1 - paths: - cc: /usr/tce/packages/clang/clang-3.9.1/bin/clang - cxx: /usr/tce/packages/clang/clang-3.9.1/bin/clang++ - f77: /usr/tce/packages/gcc/gcc-4.9.3/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-4.9.3/bin/gfortran - flags: {} - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: clang@4.0.0 - paths: - cc: /usr/tce/packages/clang/clang-4.0.0/bin/clang - cxx: /usr/tce/packages/clang/clang-4.0.0/bin/clang++ - f77: /usr/tce/packages/gcc/gcc-4.9.3/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-4.9.3/bin/gfortran - flags: {} - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: clang@6.0.0 - paths: - cc: /usr/tce/packages/clang/clang-6.0.0/bin/clang - cxx: /usr/tce/packages/clang/clang-6.0.0/bin/clang++ - f77: /usr/tce/packages/gcc/gcc-8.1.0/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-8.1.0/bin/gfortran - flags: {} - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: clang@9.0.0 - paths: - cc: /usr/tce/packages/clang/clang-9.0.0/bin/clang - cxx: /usr/tce/packages/clang/clang-9.0.0/bin/clang++ - f77: /usr/tce/packages/gcc/gcc-8.1.0/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-8.1.0/bin/gfortran - flags: {} - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: clang@10.0.1 - paths: - cc: /usr/tce/packages/clang/clang-10.0.1/bin/clang - cxx: /usr/tce/packages/clang/clang-10.0.1/bin/clang++ - f77: /usr/tce/packages/gcc/gcc-8.1.0/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-8.1.0/bin/gfortran - flags: {} - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: gcc@4.9.3 - paths: - cc: /usr/tce/packages/gcc/gcc-4.9.3/bin/gcc - cxx: /usr/tce/packages/gcc/gcc-4.9.3/bin/g++ - f77: /usr/tce/packages/gcc/gcc-4.9.3/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-4.9.3/bin/gfortran - flags: {} - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: intel@16.0.4 - paths: - cc: /usr/tce/packages/intel/intel-16.0.4/bin/icc - cxx: /usr/tce/packages/intel/intel-16.0.4/bin/icpc - f77: /usr/tce/packages/intel/intel-16.0.4/bin/ifort - fc: /usr/tce/packages/intel/intel-16.0.4/bin/ifort - flags: - cflags: -gcc-name=/usr/tce/packages/gcc/gcc-4.9.3/bin/gcc - cxxflags: -gcc-name=/usr/tce/packages/gcc/gcc-4.9.3/bin/g++ - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: intel@17.0.2 - paths: - cc: /usr/tce/packages/intel/intel-17.0.2/bin/icc - cxx: /usr/tce/packages/intel/intel-17.0.2/bin/icpc - f77: /usr/tce/packages/intel/intel-17.0.2/bin/ifort - fc: 
/usr/tce/packages/intel/intel-17.0.2/bin/ifort - flags: - cflags: -gcc-name=/usr/tce/packages/gcc/gcc-4.9.3/bin/gcc - cxxflags: -gcc-name=/usr/tce/packages/gcc/gcc-4.9.3/bin/g++ - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: intel@18.0.0 - paths: - cc: /usr/tce/packages/intel/intel-18.0.0/bin/icc - cxx: /usr/tce/packages/intel/intel-18.0.0/bin/icpc - f77: /usr/tce/packages/intel/intel-18.0.0/bin/ifort - fc: /usr/tce/packages/intel/intel-18.0.0/bin/ifort - flags: - cflags: -gcc-name=/usr/tce/packages/gcc/gcc-7.3.0/bin/gcc - cxxflags: -gcc-name=/usr/tce/packages/gcc/gcc-7.3.0/bin/g++ - fflags: -gcc-name=/usr/tce/packages/gcc/gcc-7.3.0/bin/gcc - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: intel@18.0.2 - paths: - cc: /usr/tce/packages/intel/intel-18.0.2/bin/icc - cxx: /usr/tce/packages/intel/intel-18.0.2/bin/icpc - f77: /usr/tce/packages/intel/intel-18.0.2/bin/ifort - fc: /usr/tce/packages/intel/intel-18.0.2/bin/ifort - flags: - cflags: -gcc-name=/usr/tce/packages/gcc/gcc-7.3.0/bin/gcc - cxxflags: -gcc-name=/usr/tce/packages/gcc/gcc-7.3.0/bin/g++ - fflags: -gcc-name=/usr/tce/packages/gcc/gcc-7.3.0/bin/gcc - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: intel@19.0.4 - paths: - cc: /usr/tce/packages/intel/intel-19.0.4/bin/icc - cxx: /usr/tce/packages/intel/intel-19.0.4/bin/icpc - f77: /usr/tce/packages/intel/intel-19.0.4/bin/ifort - fc: /usr/tce/packages/intel/intel-19.0.4/bin/ifort - flags: - cflags: -gcc-name=/usr/tce/packages/gcc/gcc-7.1.0/bin/gcc - cxxflags: -gcc-name=/usr/tce/packages/gcc/gcc-7.1.0/bin/g++ - fflags: -gcc-name=/usr/tce/packages/gcc/gcc-7.1.0/bin/g++ - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: intel@19.1.0 - paths: - cc: /usr/tce/packages/intel/intel-19.1.0/bin/icc - cxx: /usr/tce/packages/intel/intel-19.1.0/bin/icpc - f77: /usr/tce/packages/intel/intel-19.1.0/bin/ifort - fc: /usr/tce/packages/intel/intel-19.1.0/bin/ifort - flags: - cflags: -gcc-name=/usr/tce/packages/gcc/gcc-8.1.0/bin/gcc - cxxflags: -gcc-name=/usr/tce/packages/gcc/gcc-8.1.0/bin/g++ - fflags: -gcc-name=/usr/tce/packages/gcc/gcc-8.1.0/bin/gcc - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: pgi@17.10 - paths: - cc: /usr/tce/packages/pgi/pgi-17.10/bin/pgcc - cxx: /usr/tce/packages/pgi/pgi-17.10/bin/pgc++ - f77: /usr/tce/packages/pgi/pgi-17.10/bin/pgf77 - fc: /usr/tce/packages/pgi/pgi-17.10/bin/pgf95 - flags: {} - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: pgi@18.5 - paths: - cc: /usr/tce/packages/pgi/pgi-18.5/bin/pgcc - cxx: /usr/tce/packages/pgi/pgi-18.5/bin/pgc++ - f77: /usr/tce/packages/pgi/pgi-18.5/bin/pgf77 - fc: /usr/tce/packages/pgi/pgi-18.5/bin/pgf95 - flags: {} - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: pgi@19.4 - paths: - cc: /usr/tce/packages/pgi/pgi-19.4/bin/pgcc - cxx: /usr/tce/packages/pgi/pgi-19.4/bin/pgc++ - f77: /usr/tce/packages/pgi/pgi-19.4/bin/pgfortran - fc: /usr/tce/packages/pgi/pgi-19.4/bin/pgfortran - flags: {} - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: pgi@19.7 - paths: - cc: /usr/tce/packages/pgi/pgi-19.7/bin/pgcc - cxx: 
/usr/tce/packages/pgi/pgi-19.7/bin/pgc++ - f77: /usr/tce/packages/pgi/pgi-19.7/bin/pgfortran - fc: /usr/tce/packages/pgi/pgi-19.7/bin/pgf95 - flags: {} - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: pgi@20.1 - paths: - cc: /usr/tce/packages/pgi/pgi-20.1/bin/pgcc - cxx: /usr/tce/packages/pgi/pgi-20.1/bin/pgc++ - f77: /usr/tce/packages/pgi/pgi-20.1/bin/pgfortran - fc: /usr/tce/packages/pgi/pgi-20.1/bin/pgf95 - flags: {} - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: gcc@6.1.0 - paths: - cc: /usr/tce/packages/gcc/gcc-6.1.0/bin/gcc - cxx: /usr/tce/packages/gcc/gcc-6.1.0/bin/g++ - f77: /usr/tce/packages/gcc/gcc-6.1.0/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-6.1.0/bin/gfortran - flags: {} - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: gcc@7.1.0 - paths: - cc: /usr/tce/packages/gcc/gcc-7.1.0/bin/gcc - cxx: /usr/tce/packages/gcc/gcc-7.1.0/bin/g++ - f77: /usr/tce/packages/gcc/gcc-7.1.0/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-7.1.0/bin/gfortran - flags: {} - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: gcc@7.3.0 - paths: - cc: /usr/tce/packages/gcc/gcc-7.3.0/bin/gcc - cxx: /usr/tce/packages/gcc/gcc-7.3.0/bin/g++ - f77: /usr/tce/packages/gcc/gcc-7.3.0/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-7.3.0/bin/gfortran - flags: {} - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: gcc@8.1.0 - paths: - cc: /usr/tce/packages/gcc/gcc-8.1.0/bin/gcc - cxx: /usr/tce/packages/gcc/gcc-8.1.0/bin/g++ - f77: /usr/tce/packages/gcc/gcc-8.1.0/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-8.1.0/bin/gfortran - flags: {} - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] diff --git a/scripts/radiuss-spack-configs/toss_3_x86_64_ib/packages.yaml b/scripts/radiuss-spack-configs/toss_3_x86_64_ib/packages.yaml deleted file mode 100644 index 96462e950..000000000 --- a/scripts/radiuss-spack-configs/toss_3_x86_64_ib/packages.yaml +++ /dev/null @@ -1,90 +0,0 @@ -packages: - all: - # This defaults us to machine specific flags of ivybridge which allows - # us to run on broadwell as well - target: [ivybridge] - compiler: [gcc, intel, pgi, clang] - cmake: - version: [3.14.5] - buildable: false - - externals: - - spec: cmake - prefix: /usr/tce/packages/cmake/cmake-3.14.5 - cuda: - version: [10.1.168] - buildable: false - - externals: - - spec: cuda@10.1.168 - prefix: /usr/tce/packages/cuda/cuda-10.1.168 - hip: - version: [4.0.0, 4.1.0] - buildable: false - externals: - - spec: hip@4.0.0 - prefix: /opt/rocm-4.0.0/hip - - spec: hip@4.1.0 - prefix: /opt/rocm-4.1.0/hip - llvm-amdgpu: - version: [4.0.0, 4.1.0] - buildable: false - externals: - - spec: llvm-amdgpu@4.0.0 - prefix: /opt/rocm-4.0.0/llvm - - spec: llvm-amdgpu@4.1.0 - prefix: /opt/rocm-4.1.0/llvm - hsa-rocr-dev: - version: [4.0.0, 4.1.0] - buildable: false - externals: - - spec: hsa-rocr-dev@4.0.0 - prefix: /opt/rocm-4.0.0/ - - spec: hsa-rocr-dev@4.1.0 - prefix: /opt/rocm-4.1.0/ - rocminfo: - version: [4.0.0, 4.1.0] - buildable: false - externals: - - spec: rocminfo@4.0.0 - prefix: /opt/rocm-4.0.0/ - - spec: rocminfo@4.1.0 - prefix: /opt/rocm-4.1.0/ - rocm-device-libs: - version: [4.0.0, 4.1.0] - buildable: false - externals: - - spec: rocm-device-libs@4.0.0 - prefix: /opt/rocm-4.0.0/ - - spec: 
rocm-device-libs@4.1.0 - prefix: /opt/rocm-4.1.0/ - mvapich2: - externals: - - spec: mvapich2@2.3.1%clang@10.0.0~cuda~debug~regcache~wrapperrpath ch3_rank_bits=32 - file_systems=lustre,nfs,ufs process_managers=slurm - prefix: /usr/tce/packages/mvapich2/mvapich2-2.3-clang-10.0.0 - - spec: mvapich2@2.3.1%clang@9.0.0~cuda~debug~regcache~wrapperrpath ch3_rank_bits=32 - file_systems=lustre,nfs,ufs process_managers=slurm - prefix: /usr/tce/packages/mvapich2/mvapich2-2.3-clang-9.0.0 - - spec: mvapich2@2.3.1%pgi@19.7~cuda~debug~regcache~wrapperrpath ch3_rank_bits=32 - file_systems=lustre,nfs,ufs process_managers=slurm - prefix: /usr/tce/packages/mvapich2/mvapich2-2.3-pgi-19.7 - - spec: mvapich2@2.3.1%pgi@20.1~cuda~debug~regcache~wrapperrpath ch3_rank_bits=32 - file_systems=lustre,nfs,ufs process_managers=slurm - prefix: /usr/tce/packages/mvapich2/mvapich2-2.3-pgi-20.1 - - spec: mvapich2@2.3.1%intel@19.1.0.166~cuda~debug~regcache~wrapperrpath ch3_rank_bits=32 - file_systems=lustre,nfs,ufs process_managers=slurm - prefix: /usr/tce/packages/mvapich2/mvapich2-2.3-intel-19.1.0 - - spec: mvapich2@2.3.1%intel@18.0.2~cuda~debug~regcache~wrapperrpath ch3_rank_bits=32 - file_systems=lustre,nfs,ufs process_managers=slurm - prefix: /usr/tce/packages/mvapich2/mvapich2-2.3-intel-18.0.2 - - spec: mvapich2@2.3.1%intel@17.0.2~cuda~debug~regcache~wrapperrpath ch3_rank_bits=32 - file_systems=lustre,nfs,ufs process_managers=slurm - prefix: /usr/tce/packages/mvapich2/mvapich2-2.3-intel-17.0.2 - - spec: mvapich2@2.3.1%gcc@8.1.0~cuda~debug~regcache~wrapperrpath ch3_rank_bits=32 - file_systems=lustre,nfs,ufs process_managers=slurm - prefix: /usr/tce/packages/mvapich2/mvapich2-2.3-gcc-8.1.0 - - spec: mvapich2@2.3.1%gcc@4.9.3~cuda~debug~regcache~wrapperrpath ch3_rank_bits=32 - file_systems=lustre,nfs,ufs process_managers=slurm - prefix: /usr/tce/packages/mvapich2/mvapich2-2.3-gcc-4.9.3 - buildable: false diff --git a/scripts/uberenv/packages/raja_perfsuite/package.py b/scripts/spack_packages/raja_perf/package.py similarity index 93% rename from scripts/uberenv/packages/raja_perfsuite/package.py rename to scripts/spack_packages/raja_perf/package.py index c4564aad7..00ac78959 100644 --- a/scripts/uberenv/packages/raja_perfsuite/package.py +++ b/scripts/spack_packages/raja_perf/package.py @@ -51,14 +51,29 @@ def get_spec_path(spec, package_name, path_replacements = {}, use_bin = False) : return path -class RajaPerfSuite(CMakePackage, CudaPackage): - """RAJA PerfSuite.""" +class Raja(CMakePackage, CudaPackage): + """RAJA Parallel Framework.""" - homepage = "https://github.com/LLNL/RAJAPerf/" - git = "https://github.com/LLNL/RAJAPerf/" + homepage = "http://software.llnl.gov/RAJA/" + git = "https://github.com/LLNL/RAJA.git" version('develop', branch='develop', submodules='True') version('main', branch='main', submodules='True') + version('0.12.1', tag='v0.12.1', submodules="True") + version('0.12.0', tag='v0.12.0', submodules="True") + version('0.11.0', tag='v0.11.0', submodules="True") + version('0.10.1', tag='v0.10.1', submodules="True") + version('0.10.0', tag='v0.10.0', submodules="True") + version('0.9.0', tag='v0.9.0', submodules="True") + version('0.8.0', tag='v0.8.0', submodules="True") + version('0.7.0', tag='v0.7.0', submodules="True") + version('0.6.0', tag='v0.6.0', submodules="True") + version('0.5.3', tag='v0.5.3', submodules="True") + version('0.5.2', tag='v0.5.2', submodules="True") + version('0.5.1', tag='v0.5.1', submodules="True") + version('0.5.0', tag='v0.5.0', submodules="True") + version('0.4.1', 
tag='v0.4.1', submodules="True") + version('0.4.0', tag='v0.4.0', submodules="True") variant('openmp', default=True, description='Build OpenMP backend') variant('shared', default=False, description='Build Shared Libs') diff --git a/scripts/uberenv b/scripts/uberenv new file mode 160000 index 000000000..105e384f5 --- /dev/null +++ b/scripts/uberenv @@ -0,0 +1 @@ +Subproject commit 105e384f585e2391c42b2def93124a6580319c1c diff --git a/scripts/uberenv/LICENSE b/scripts/uberenv/LICENSE deleted file mode 100644 index fcd00312e..000000000 --- a/scripts/uberenv/LICENSE +++ /dev/null @@ -1,64 +0,0 @@ -Copyright (c) 2014-2018, Lawrence Livermore National Security, LLC. - -Produced at the Lawrence Livermore National Laboratory - -LLNL-CODE-666778 - -All rights reserved. - -This file is part of Conduit. - -For details, see: http://software.llnl.gov/conduit/. - -Please also read conduit/LICENSE - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -* Redistributions of source code must retain the above copyright notice, - this list of conditions and the disclaimer below. - -* Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the disclaimer (as noted below) in the - documentation and/or other materials provided with the distribution. - -* Neither the name of the LLNS/LLNL nor the names of its contributors may - be used to endorse or promote products derived from this software without - specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, -LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY -DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, -STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING -IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - -Additional BSD Notice - - 1. This notice is required to be provided under our contract with the U.S. - Department of Energy (DOE). This work was produced at Lawrence - Livermore National Laboratory under Contract No. DE-AC52-07NA27344 with - the DOE. - - 2. Neither the United States Government nor Lawrence Livermore National - Security, LLC nor any of their employees, makes any warranty, express - or implied, or assumes any liability or responsibility for the - accuracy, completeness, or usefulness of any information, apparatus, - product, or process disclosed, or represents that its use would not - infringe privately-owned rights. - - 3. Also, reference herein to any specific commercial products, process, - or services by trade name, trademark, manufacturer or otherwise does - not necessarily constitute or imply its endorsement, recommendation, - or favoring by the United States Government or Lawrence Livermore - National Security, LLC. 
The views and opinions of authors expressed - herein do not necessarily state or reflect those of the United - States Government or Lawrence Livermore National Security, LLC, and - shall not be used for advertising or product endorsement purposes. - diff --git a/scripts/uberenv/Makefile b/scripts/uberenv/Makefile deleted file mode 100644 index 2760762d1..000000000 --- a/scripts/uberenv/Makefile +++ /dev/null @@ -1,6 +0,0 @@ - -default: - sphinx-build -E -a -b html docs/sphinx/ _docs_html - -clean: - rm -rf _docs_html \ No newline at end of file diff --git a/scripts/uberenv/README.md b/scripts/uberenv/README.md deleted file mode 100644 index 82d682017..000000000 --- a/scripts/uberenv/README.md +++ /dev/null @@ -1,19 +0,0 @@ -# uberenv -Automates using Spack (https://www.spack.io/) to build and deploy software. - -Uberenv is a short python script that helps automate using Spack to build -third-party dependencies for development and to deploy Spack packages. - -Uberenv was released as part of the Conduit (https://github.com/LLNL/conduit/). It is included in-source in several projects, this repo is used to hold the latest reference version. - -For more details, see Uberenv's documention: - -https://uberenv.readthedocs.io - -You can also find details about how it is used in Conduit's documentation: - -https://llnl-conduit.readthedocs.io/en/latest/building.html#building-conduit-and-third-party-dependencies - -Conduit's source repo also serves as an example for uberenv and spack configuration files, etc: - -https://github.com/LLNL/conduit/tree/master/scripts/uberenv diff --git a/scripts/uberenv/docs/sphinx/conf.py b/scripts/uberenv/docs/sphinx/conf.py deleted file mode 100644 index a8475c7b8..000000000 --- a/scripts/uberenv/docs/sphinx/conf.py +++ /dev/null @@ -1,324 +0,0 @@ -# -*- coding: utf-8 -*- -# -############################################################################### -# Copyright (c) 2015-2019, Lawrence Livermore National Security, LLC. -# -# Produced at the Lawrence Livermore National Laboratory -# -# LLNL-CODE-666778 -# -# All rights reserved. -# -# This file is part of Conduit. -# -# For details, see: http://software.llnl.gov/conduit/. -# -# Please also read conduit/LICENSE -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the disclaimer below. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the disclaimer (as noted below) in the -# documentation and/or other materials provided with the distribution. -# -# * Neither the name of the LLNS/LLNL nor the names of its contributors may -# be used to endorse or promote products derived from this software without -# specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, -# LLC, THE U.S. 
DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY -# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, -# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING -# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. -# -############################################################################### -# -# Uberenv documentation build configuration file, created by -# sphinx-quickstart on Thu Oct 16 11:23:46 2014. -# -# This file is execfile()d with the current directory set to its -# containing dir. -# -# Note that not all possible configuration values are present in this -# autogenerated file. -# -# All configuration values have a default; values that are commented out -# serve to show the default. - -import sys -import os - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -#sys.path.insert(0, os.path.abspath('.')) - -# -- General configuration ------------------------------------------------ - -# If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.doctest', - 'sphinx.ext.todo', - 'sphinx.ext.coverage', - 'sphinx.ext.mathjax' -] - -# try to add the breathe extension -try: - import breathe - extensions.append('breathe') -except: - pass - -# Add any paths that contain templates here, relative to this directory. -# templates_path = ['@CMAKE_CURRENT_SOURCE_DIR@/_templates'] - -# The suffix of source filenames. -source_suffix = '.rst' - -# The encoding of source files. -#source_encoding = 'utf-8-sig' - -# The master toctree document. -master_doc = 'index' - -# General information about the project. -project = u'Uberenv' -copyright = u'Copyright (c) 2015-2019, LLNS' - -# The version info for the project you're documenting, acts as replacement for -# |version| and |release|, also used in various other places throughout the -# built documents. -# -# The short X.Y version. -version = 'current' -# The full version, including alpha/beta/rc tags. -release = 'current' - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -#language = None - -# There are two options for replacing |today|: either, you set today to some -# non-false value, then it is used: -#today = '' -# Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -exclude_patterns = ['_build'] - -# The reST default role (used for this markup: `text`) to use for all -# documents. -#default_role = None - -# If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True - -# If true, the current module name will be prepended to all description -# unit titles (such as .. function::). 
-#add_module_names = True - -# If true, sectionauthor and moduleauthor directives will be shown in the -# output. They are ignored by default. -#show_authors = False - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' - -# A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] - -# If true, keep warnings as "system message" paragraphs in the built documents. -#keep_warnings = False - - -# -- Options for HTML output ---------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -html_theme = 'sphinx_rtd_theme' - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -html_theme_options = { 'logo_only' : True } - -# Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] - -# The name for this set of Sphinx documents. If None, it defaults to -# " v documentation". -#html_title = None - -# A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None - -# The name of an image file (relative to this directory) to place at the top -# of the sidebar. -#html_logo = - -# The name of an image file (within the static path) to use as favicon of the -# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 -# pixels large. -#html_favicon = None - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -# html_static_path = ['@CMAKE_CURRENT_SOURCE_DIR@/_static'] - -# Add any extra paths that contain custom files (such as robots.txt or -# .htaccess) here, relative to this directory. These files are copied -# directly to the root of the documentation. -#html_extra_path = [] - -# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, -# using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' - -# If true, SmartyPants will be used to convert quotes and dashes to -# typographically correct entities. -#html_use_smartypants = True - -# Custom sidebar templates, maps document names to template names. -#html_sidebars = {} - -# Additional templates that should be rendered to pages, maps page names to -# template names. -#html_additional_pages = {} - -# If false, no module index is generated. -#html_domain_indices = True - -# If false, no index is generated. -#html_use_index = True - -# If true, the index is split into individual pages for each letter. -#html_split_index = False - -# If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True - -# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -#html_show_sphinx = True - -# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -#html_show_copyright = True - -# If true, an OpenSearch description file will be output, and all pages will -# contain a tag referring to it. The value of this option must be the -# base URL from which the finished HTML is served. -#html_use_opensearch = '' - -# This is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = None - -# Output file base name for HTML help builder. 
-htmlhelp_basename = 'Uberenvdoc' - - -# -- Options for LaTeX output --------------------------------------------- - -latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). -#'papersize': 'letterpaper', - -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', - -# Additional stuff for the LaTeX preamble. -#'preamble': '', -} - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). -latex_documents = [ - ('index', 'Uberenv.tex', u'Uberenv Documentation', - u'LLNS', 'manual'), -] - -# The name of an image file (relative to this directory) to place at the top of -# the title page. -#latex_logo = None - -# For "manual" documents, if this is true, then toplevel headings are parts, -# not chapters. -#latex_use_parts = False - -# If true, show page references after internal links. -#latex_show_pagerefs = False - -# If true, show URL addresses after external links. -#latex_show_urls = False - -# Documents to append as an appendix to all manuals. -#latex_appendices = [] - -# If false, no module index is generated. -#latex_domain_indices = True - - -# -- Options for manual page output --------------------------------------- - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -man_pages = [ - ('index', 'uberenv', u'Uberenv Documentation', - [u'LLNS'], 1) -] - -# If true, show URL addresses after external links. -#man_show_urls = False - - -# -- Options for Texinfo output ------------------------------------------- - -# Grouping the document tree into Texinfo files. List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -texinfo_documents = [ - ('index', 'Uberenv', u'Uberenv Documentation', - u'LLNS', 'Uberenv', 'Automates using spack to build and deploy software.', - 'Miscellaneous'), -] - -# Documents to append as an appendix to all manuals. -#texinfo_appendices = [] - -# If false, no module index is generated. -#texinfo_domain_indices = True - -# How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' - -# If true, do not generate a @detailmenu in the "Top" node's menu. -#texinfo_no_detailmenu = False - - -# try to use the read the docs theme -try: - import sphinx_rtd_theme - html_theme = "sphinx_rtd_theme" - html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] -except: - pass diff --git a/scripts/uberenv/docs/sphinx/index.rst b/scripts/uberenv/docs/sphinx/index.rst deleted file mode 100644 index 457ec596d..000000000 --- a/scripts/uberenv/docs/sphinx/index.rst +++ /dev/null @@ -1,194 +0,0 @@ -.. ############################################################################ -.. # Copyright (c) 2014-2018, Lawrence Livermore National Security, LLC. -.. # -.. # Produced at the Lawrence Livermore National Laboratory -.. # -.. # LLNL-CODE-666778 -.. # -.. # All rights reserved. -.. # -.. # This file is part of Conduit. -.. # -.. # For details, see: http://software.llnl.gov/conduit/. -.. # -.. # Please also read conduit/LICENSE -.. # -.. # Redistribution and use in source and binary forms, with or without -.. # modification, are permitted provided that the following conditions are met: -.. # -.. # * Redistributions of source code must retain the above copyright notice, -.. # this list of conditions and the disclaimer below. -.. # -.. # * Redistributions in binary form must reproduce the above copyright notice, -.. 
# this list of conditions and the disclaimer (as noted below) in the -.. # documentation and/or other materials provided with the distribution. -.. # -.. # * Neither the name of the LLNS/LLNL nor the names of its contributors may -.. # be used to endorse or promote products derived from this software without -.. # specific prior written permission. -.. # -.. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -.. # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -.. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -.. # ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, -.. # LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY -.. # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -.. # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -.. # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -.. # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, -.. # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING -.. # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -.. # POSSIBILITY OF SUCH DAMAGE. -.. # -.. ############################################################################ - -.. _building_with_uberenv: - -Uberenv -~~~~~~~~~~~~~~~ - -**Uberenv** automates using `Spack `_ to build and deploy software. - -Many projects leverage `Spack `_ to help build the software dependencies needed to develop and deploy their projects on HPC systems. Uberenv is a python script that helps automate using Spack to build -third-party dependencies for development and to deploy Spack packages. - -Uberenv was released as part of Conduit (https://github.com/LLNL/conduit/). It is included in-source in several projects. The -https://github.com/llnl/uberenv/ repo is used to hold the latest reference version of Uberenv. - - -uberenv.py -~~~~~~~~~~~~~~~~~~~~~ - -``uberenv.py`` is a single file python script that automates fetching Spack, building and installing third party dependencies, and can optionally install packages as well. To automate the full install process, ``uberenv.py`` uses a target Spack package along with extra settings such as Spack compiler and external third party package details for common HPC platforms. - -``uberenv.py`` is included directly in a project's source code repo in the folder: ``scripts/uberenv/`` -This folder is also used to store extra Spack and Uberenv configuration files unique to the target project. ``uberenv.py`` uses a ``project.json`` file to specify project details, including the target Spack package name and which Spack repo is used. Conduit's source repo serves as an example for Uberenv and Spack configuration files, etc: - -https://github.com/LLNL/conduit/tree/master/scripts/uberenv - - -``uberenv.py`` is developed by LLNL in support of the `Ascent `_, Axom, and `Conduit `_ projects. 
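For illustration, a minimal ``project.json`` might look like the following sketch (the field names follow the project settings documented in the Project configuration section below; the package name, source dir, and commit shown here are hypothetical placeholders, not values from any particular project):

.. code:: json

    {
      "package_name"        : "mypackage",
      "package_version"     : "develop",
      "package_final_phase" : "hostconfig",
      "package_source_dir"  : "../..",
      "spack_url"           : "https://github.com/spack/spack.git",
      "spack_commit"        : "<sha-of-a-known-good-spack-commit>",
      "spack_activate"      : {}
    }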
- - Command Line Options -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Build configuration -------------------- - -``uberenv.py`` has a few options that allow you to control how dependencies are built: - - ======================= ============================================== ================================================ - Option                  Description                                    Default - ======================= ============================================== ================================================ - ``--prefix``            Destination directory                          ``uberenv_libs`` - ``--spec``              Spack spec                                     linux: **%gcc** - osx: **%clang** - ``--spack-config-dir``  Folder with Spack settings files               linux: (empty) - osx: ``scripts/uberenv/spack_configs/darwin/`` - ``-k``                  Ignore SSL Errors                              **False** - ``--install``           Fully install target, not just dependencies   **False** - ``--run_tests``         Invoke tests during build and against install **False** - ``--project-json``      File for project specific settings             ``project.json`` - ======================= ============================================== ================================================ - -The ``-k`` option exists for sites where SSL certificate interception undermines fetching -from github and https hosted source tarballs. When enabled, ``uberenv.py`` clones Spack using: - -.. code:: bash - - git -c http.sslVerify=false clone https://github.com/llnl/spack.git - -It also passes ``-k`` to any Spack commands that may fetch via https. - - -Default invocation on Linux: - -.. code:: bash - - python scripts/uberenv/uberenv.py --prefix uberenv_libs \ - --spec %gcc - -Default invocation on OSX: - -.. code:: bash - - python scripts/uberenv/uberenv.py --prefix uberenv_libs \ - --spec %clang \ - --spack-config-dir scripts/uberenv/spack_configs/darwin/ - - -Use the ``--install`` option to install the target package (not just its development dependencies): - -.. code:: bash - - python scripts/uberenv/uberenv.py --install - - -If the target Spack package supports Spack's testing hooks, you can run tests during the build process to validate the build and install, using the ``--run_tests`` option: - -.. code:: bash - - python scripts/uberenv/uberenv.py --install \ - --run_tests - -For details on Spack's spec syntax, see the `Spack Specs & dependencies `_ documentation. - - -Uberenv looks for configuration yaml files under ``scripts/uberenv/spack_configs/{platform}`` or you can use the **--spack-config-dir** option to specify a directory with compiler and packages yaml files to use with Spack. See the `Spack Compiler Configuration `_ -and `Spack System Packages -`_ -documentation for details. - -.. note:: - The bootstrapping process ignores ``~/.spack/compilers.yaml`` to avoid conflicts - and surprises from a user's specific Spack settings on HPC platforms. - -When run, ``uberenv.py`` checks out a specific version of Spack from github as ``spack`` in the -destination directory. It then uses Spack to build and install the target package's dependencies into -``spack/opt/spack/``. Finally, the target package generates a host-config file ``{hostname}.cmake``, which is -copied to the destination directory. This file specifies the compiler settings and paths to all of the dependencies. - - -Project configuration ---------------------- - -Part of the configuration can also be addressed using a json file. 
By default, it is named ``project.json`` and some settings can be overridden on the command line: - - ==================== ========================== ================================================ ======================================= - Setting              Option                     Description                                      Default - ==================== ========================== ================================================ ======================================= - package_name         ``--package-name``         Spack package name                               **None** - package_version      **None**                   Spack package version                            **None** - package_final_phase  ``--package-final-phase``  Controls after which phase Spack should stop    **None** - package_source_dir   ``--package-source-dir``   Controls the source directory Spack should use  **None** - spack_url            **None**                   URL from which to download Spack                ``https://github.com/spack/spack.git`` - spack_commit         **None**                   Spack commit to check out                        **None** - spack_activate       **None**                   Spack packages to activate                       **None** - ==================== ========================== ================================================ ======================================= - - -Optimization ------------- - -``uberenv.py`` also features options to optimize the installation: - - ==================== ============================================== ================================================ - Option               Description                                    Default - ==================== ============================================== ================================================ - ``--mirror``         Location of a Spack mirror                     **None** - ``--create-mirror``  Creates a Spack mirror at specified location   **None** - ``--upstream``       Location of a Spack upstream                   **None** - ==================== ============================================== ================================================ - - -Project Settings -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -A few notes on using ``uberenv.py`` in a new project: - -* For an example of how to craft a ``project.json`` file for a target project, see: `Conduit's project.json file `_ - -* ``uberenv.py`` hot-copies ``packages`` into the cloned Spack install; this allows you to easily version control any necessary Spack package overrides - - diff --git a/scripts/uberenv/gen_spack_env_script.py b/scripts/uberenv/gen_spack_env_script.py deleted file mode 100644 index a1e6ba5d0..000000000 --- a/scripts/uberenv/gen_spack_env_script.py +++ /dev/null @@ -1,128 +0,0 @@ -############################################################################### -# Copyright (c) 2015-2019, Lawrence Livermore National Security, LLC. -# -# Produced at the Lawrence Livermore National Laboratory -# -# LLNL-CODE-716457 -# -# All rights reserved. -# -# This file is part of Ascent. -# -# For details, see: http://ascent.readthedocs.io/. -# -# Please also read ascent/LICENSE -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the disclaimer below. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the disclaimer (as noted below) in the -# documentation and/or other materials provided with the distribution. -# -# * Neither the name of the LLNS/LLNL nor the names of its contributors may -# be used to endorse or promote products derived from this software without -# specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, -# LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY -# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, -# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING -# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. -# -############################################################################### -import os -import sys -import subprocess - -from os.path import join as pjoin - -# if you have bad luck with spack load, this -# script is for you! -# -# Looks for subdir: spack or uberenv_libs/spack -# queries spack for given package names and -# creates a bash script that adds those to your path -# -# -# usage: -# python gen_spack_env_script.py [spack_pkg_1 spack_pkg_2 ...] -# - -def sexe(cmd,ret_output=False,echo = True): - """ Helper for executing shell commands. """ - if echo: - print("[exe: {}]".format(cmd)) - if ret_output: - p = subprocess.Popen(cmd, - shell=True, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) - res = p.communicate()[0] - res = res.decode('utf8') - return p.returncode,res - else: - return subprocess.call(cmd,shell=True) - - -def spack_exe(spath=None): - if spath is None: - to_try = [pjoin("uberenv_libs","spack"), "spack"] - for p in to_try: - abs_p = os.path.abspath(p) - print("[looking for spack directory at: {}]".format(abs_p)) - if os.path.isdir(abs_p): - print("[FOUND spack directory at: {}]".format(abs_p)) - return os.path.abspath(pjoin(abs_p,"bin","spack")) - print("[ERROR: failed to find spack directory!]") - sys.exit(-1) - else: - spack_exe = os.path.abspath(spath,"bin","spack") - if not os.path.isfile(spack_exec): - print("[ERROR: failed to find spack directory at spath={}]").format(spath) - sys.exit(-1) - return spack_exe - -def find_pkg(pkg_name): - r,rout = sexe(spack_exe() + " find -p " + pkg_name,ret_output = True) - print(rout) - for l in rout.split("\n"): - print(l) - lstrip = l.strip() - if not lstrip == "" and \ - not lstrip.startswith("==>") and \ - not lstrip.startswith("--"): - return {"name": pkg_name, "path": l.split()[-1]} - print("[ERROR: failed to find package named '{}']".format(pkg_name)) - sys.exit(-1) - -def path_cmd(pkg): - return('export PATH={}:$PATH\n'.format((pjoin(pkg["path"],"bin")))) - -def write_env_script(pkgs): - ofile = open("s_env.sh","w") - for p in pkgs: - print("[found {} at {}]".format(p["name"],p["path"])) - ofile.write("# {}\n".format(p["name"])) - ofile.write(path_cmd(p)) - print("[created {}]".format(os.path.abspath("s_env.sh"))) - -def main(): - pkgs = [find_pkg(pkg) for pkg in sys.argv[1:]] - if len(pkgs) > 0: - write_env_script(pkgs) - else: - print("usage: python gen_spack_env_script.py spack_pkg_1 spack_pkg_2 ...") - -if __name__ == "__main__": - main() diff --git a/scripts/uberenv/packages/chai/package.py b/scripts/uberenv/packages/chai/package.py deleted file mode 100644 index b20b1ac9f..000000000 --- a/scripts/uberenv/packages/chai/package.py +++ /dev/null @@ -1,243 +0,0 @@ -# Copyright 
2013-2020 Lawrence Livermore National Security, LLC and other -# Spack Project Developers. See the top-level COPYRIGHT file for details. -# -# SPDX-License-Identifier: (Apache-2.0 OR MIT) - - -from spack import * - -import socket -import os - -from os import environ as env -from os.path import join as pjoin - -def cmake_cache_entry(name, value, comment=""): - """Generate a string for a cmake cache variable""" - - return 'set(%s "%s" CACHE PATH "%s")\n\n' % (name,value,comment) - - -def cmake_cache_string(name, string, comment=""): - """Generate a string for a cmake cache variable""" - - return 'set(%s "%s" CACHE STRING "%s")\n\n' % (name,string,comment) - - -def cmake_cache_option(name, boolean_value, comment=""): - """Generate a string for a cmake configuration option""" - - value = "ON" if boolean_value else "OFF" - return 'set(%s %s CACHE BOOL "%s")\n\n' % (name,value,comment) - - -def get_spec_path(spec, package_name, path_replacements = {}, use_bin = False) : - """Extracts the prefix path for the given spack package - path_replacements is a dictionary with string replacements for the path. - """ - - if not use_bin: - path = spec[package_name].prefix - else: - path = spec[package_name].prefix.bin - - path = os.path.realpath(path) - - for key in path_replacements: - path = path.replace(key, path_replacements[key]) - - return path - - -class Chai(CMakePackage, CudaPackage): - """ - Copy-hiding array interface for data migration between memory spaces - """ - - homepage = "https://github.com/LLNL/CHAI" - git = "https://github.com/LLNL/CHAI.git" - - version('develop', branch='develop', submodules='True') - version('main', branch='main', submodules='True') - version('2.1.1', tag='v2.1.1', submodules='True') - version('2.1.0', tag='v2.1.0', submodules='True') - version('2.0.0', tag='v2.0.0', submodules='True') - version('1.2.0', tag='v1.2.0', submodules='True') - version('1.1.0', tag='v1.1.0', submodules='True') - version('1.0', tag='v1.0', submodules='True') - - variant('shared', default=True, description='Build Shared Libs') - - depends_on('cmake@3.8:', type='build') - depends_on('umpire') - - depends_on('cmake@3.9:', type='build', when="+cuda") - depends_on('umpire+cuda', when="+cuda") - - phases = ['hostconfig', 'cmake', 'build','install'] - - def _get_sys_type(self, spec): - sys_type = str(spec.architecture) - # if on llnl systems, we can use the SYS_TYPE - if "SYS_TYPE" in env: - sys_type = env["SYS_TYPE"] - return sys_type - - def _get_host_config_path(self, spec): - var='' - if '+cuda' in spec: - var= '-'.join([var,'cuda']) - - host_config_path = "hc-%s-%s-%s%s-%s.cmake" % (socket.gethostname().rstrip('1234567890'), - self._get_sys_type(spec), - spec.compiler, - var, - spec.dag_hash()) - dest_dir = self.stage.source_path - host_config_path = os.path.abspath(pjoin(dest_dir, host_config_path)) - return host_config_path - - def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): - """ - This method creates a 'host-config' file that specifies - all of the options used to configure and build CHAI. - - For more details about 'host-config' files see: - http://software.llnl.gov/conduit/building.html - - Note: - The `py_site_pkgs_dir` arg exists to allow a package that - subclasses this package provide a specific site packages - dir when calling this function. `py_site_pkgs_dir` should - be an absolute path or `None`. - - This is necessary because the spack `site_packages_dir` - var will not exist in the base class. 
For more details - on this issue see: https://github.com/spack/spack/issues/6261 - """ - - ####################### - # Compiler Info - ####################### - c_compiler = env["SPACK_CC"] - cpp_compiler = env["SPACK_CXX"] - - # Even though we don't have fortran code in our project we sometimes - # use the Fortran compiler to determine which libstdc++ to use - f_compiler = "" - if "SPACK_FC" in env.keys(): - # even if this is set, it may not exist - # do one more sanity check - if os.path.isfile(env["SPACK_FC"]): - f_compiler = env["SPACK_FC"] - - ####################################################################### - # By directly fetching the names of the actual compilers we appear - # to doing something evil here, but this is necessary to create a - # 'host config' file that works outside of the spack install env. - ####################################################################### - - sys_type = self._get_sys_type(spec) - - ############################################## - # Find and record what CMake is used - ############################################## - - cmake_exe = spec['cmake'].command.path - cmake_exe = os.path.realpath(cmake_exe) - - host_config_path = self._get_host_config_path(spec) - cfg = open(host_config_path, "w") - cfg.write("###################\n".format("#" * 60)) - cfg.write("# Generated host-config - Edit at own risk!\n") - cfg.write("###################\n".format("#" * 60)) - cfg.write("# Copyright (c) 2020, Lawrence Livermore National Security, LLC and\n") - cfg.write("# other CHAI Project Developers. See the top-level LICENSE file for\n") - cfg.write("# details.\n") - cfg.write("#\n") - cfg.write("# SPDX-License-Identifier: (BSD-3-Clause) \n") - cfg.write("###################\n\n".format("#" * 60)) - - cfg.write("#------------------\n".format("-" * 60)) - cfg.write("# SYS_TYPE: {0}\n".format(sys_type)) - cfg.write("# Compiler Spec: {0}\n".format(spec.compiler)) - cfg.write("# CMake executable path: %s\n" % cmake_exe) - cfg.write("#------------------\n\n".format("-" * 60)) - - ####################### - # Compiler Settings - ####################### - - cfg.write("#------------------\n".format("-" * 60)) - cfg.write("# Compilers\n") - cfg.write("#------------------\n\n".format("-" * 60)) - cfg.write(cmake_cache_entry("CMAKE_C_COMPILER", c_compiler)) - cfg.write(cmake_cache_entry("CMAKE_CXX_COMPILER", cpp_compiler)) - - # use global spack compiler flags - cflags = ' '.join(spec.compiler_flags['cflags']) - if cflags: - cfg.write(cmake_cache_entry("CMAKE_C_FLAGS", cflags)) - - cxxflags = ' '.join(spec.compiler_flags['cxxflags']) - if cxxflags: - cfg.write(cmake_cache_entry("CMAKE_CXX_FLAGS", cxxflags)) - - if ("gfortran" in f_compiler) and ("clang" in cpp_compiler): - libdir = pjoin(os.path.dirname( - os.path.dirname(f_compiler)), "lib") - flags = "" - for _libpath in [libdir, libdir + "64"]: - if os.path.exists(_libpath): - flags += " -Wl,-rpath,{0}".format(_libpath) - description = ("Adds a missing libstdc++ rpath") - if flags: - cfg.write(cmake_cache_entry("BLT_EXE_LINKER_FLAGS", flags, - description)) - - if "+cuda" in spec: - cfg.write("#------------------{0}\n".format("-" * 60)) - cfg.write("# Cuda\n") - cfg.write("#------------------{0}\n\n".format("-" * 60)) - - cfg.write(cmake_cache_option("ENABLE_CUDA", True)) - - cudatoolkitdir = spec['cuda'].prefix - cfg.write(cmake_cache_entry("CUDA_TOOLKIT_ROOT_DIR", - cudatoolkitdir)) - cudacompiler = "${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc" - cfg.write(cmake_cache_entry("CMAKE_CUDA_COMPILER", - cudacompiler)) - - if 
not spec.satisfies('cuda_arch=none'): - cuda_arch = spec.variants['cuda_arch'].value - cuda_arch = "sm_{0}".format(cuda_arch[0]) - flag = '-arch {0}'.format(cuda_arch) - cfg.write(cmake_cache_string("CUDA_ARCH",cuda_arch)) - cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS", flag)) - - else: - cfg.write(cmake_cache_option("ENABLE_CUDA", False)) - - # shared vs static libs - cfg.write(cmake_cache_option("BUILD_SHARED_LIBS","+shared" in spec)) - - umpire_conf_path = spec['umpire'].prefix + "/share/umpire/cmake" - cfg.write(cmake_cache_entry("umpire_DIR",umpire_conf_path)) - - ####################### - # Close and save - ####################### - cfg.write("\n") - cfg.close() - - print("OUT: host-config file {0}".format(host_config_path)) - - def cmake_args(self): - spec = self.spec - host_config_path = self._get_host_config_path(spec) - - options = [] - options.extend(['-C', host_config_path]) - - return options diff --git a/scripts/uberenv/packages/hip/package.py b/scripts/uberenv/packages/hip/package.py deleted file mode 100644 index e63317ec5..000000000 --- a/scripts/uberenv/packages/hip/package.py +++ /dev/null @@ -1,54 +0,0 @@ -from spack import * - - -class Hip(CMakePackage): - """HIP is a C++ Runtime API and Kernel Language that allows developers to - create portable applications for AMD and NVIDIA GPUs from - single source code.""" - - homepage = "https://github.com/ROCm-Developer-Tools/HIP" - url = "https://github.com/ROCm-Developer-Tools/HIP/archive/rocm-3.10.0.tar.gz" - - maintainers = ['srekolam', 'arjun-raj-kuppala'] - - version('3.10.0', sha256='0082c402f890391023acdfd546760f41cb276dffc0ffeddc325999fd2331d4e8') - version('3.9.0', sha256='25ad58691456de7fd9e985629d0ed775ba36a2a0e0b21c086bd96ba2fb0f7ed1') - - depends_on('cmake@3:', type='build') - depends_on('perl@5.10:', type=('build', 'run')) - depends_on('mesa~llvm@18.3:') - - for ver in ['3.9.0', '3.10.0']: - depends_on('rocclr@' + ver, type='build', when='@' + ver) - depends_on('hsakmt-roct@' + ver, type='build', when='@' + ver) - depends_on('hsa-rocr-dev@' + ver, type='link', when='@' + ver) - depends_on('comgr@' + ver, type='build', when='@' + ver) - depends_on('llvm-amdgpu@' + ver, type='build', when='@' + ver) - depends_on('rocm-device-libs@' + ver, type='build', when='@' + ver) - depends_on('rocminfo@' + ver, type='build', when='@' + ver) - - def setup_dependent_package(self, module, dependent_spec): - self.spec.hipcc = join_path(self.prefix.bin, 'hipcc') - - @run_before('install') - def filter_sbang(self): - perl = self.spec['perl'].command - kwargs = {'ignore_absent': False, 'backup': False, 'string': False} - - with working_dir('bin'): - match = '^#!/usr/bin/perl' - substitute = "#!{perl}".format(perl=perl) - files = [ - 'hipify-perl', 'hipcc', 'extractkernel', - 'hipconfig', 'hipify-cmakefile' - ] - filter_file(match, substitute, *files, **kwargs) - - def cmake_args(self): - args = [ - '-DHIP_COMPILER=clang', - '-DHIP_PLATFORM=rocclr', - '-DHSA_PATH={0}'.format(self.spec['hsa-rocr-dev'].prefix), - '-DLIBROCclr_STATIC_DIR={0}/lib'.format(self.spec['rocclr'].prefix) - ] - return args diff --git a/scripts/uberenv/project.json b/scripts/uberenv/project.json deleted file mode 100644 index 1ea0482ce..000000000 --- a/scripts/uberenv/project.json +++ /dev/null @@ -1,10 +0,0 @@ -{ -"package_name" : "raja", -"package_version" : "develop", -"package_final_phase" : "hostconfig", -"package_source_dir" : "../..", -"spack_url": "https://github.com/davidbeckingsale/spack", -"spack_branch": "feature/allow-untested-cuda-versions", 
-"spack_commit": "f96e256bee1948aa030916aae0c1b2645230fb9f", -"spack_activate" : {} -} diff --git a/scripts/uberenv/spack_configs b/scripts/uberenv/spack_configs deleted file mode 120000 index 17d3bf7a4..000000000 --- a/scripts/uberenv/spack_configs +++ /dev/null @@ -1 +0,0 @@ -../radiuss-spack-configs \ No newline at end of file diff --git a/scripts/uberenv/uberenv.py b/scripts/uberenv/uberenv.py deleted file mode 100755 index 7761be2e9..000000000 --- a/scripts/uberenv/uberenv.py +++ /dev/null @@ -1,800 +0,0 @@ -#!/bin/sh -"exec" "python" "-u" "-B" "$0" "$@" -############################################################################### -# Copyright (c) 2014-2020, Lawrence Livermore National Security, LLC. -# -# Produced at the Lawrence Livermore National Laboratory -# -# LLNL-CODE-666778 -# -# All rights reserved. -# -# This file is part of Conduit. -# -# For details, see https://lc.llnl.gov/conduit/. -# -# Please also read conduit/LICENSE -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the disclaimer below. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the disclaimer (as noted below) in the -# documentation and/or other materials provided with the distribution. -# -# * Neither the name of the LLNS/LLNL nor the names of its contributors may -# be used to endorse or promote products derived from this software without -# specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, -# LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY -# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, -# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING -# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. -# -############################################################################### - -""" - file: uberenv.py - - description: automates using spack to install a project. - -""" - -import os -import sys -import subprocess -import shutil -import socket -import platform -import json -import datetime -import glob -import re - -from optparse import OptionParser - -from os import environ as env -from os.path import join as pjoin - - -def sexe(cmd,ret_output=False,echo=False): - """ Helper for executing shell commands. 
""" - if echo: - print("[exe: {}]".format(cmd)) - if ret_output: - p = subprocess.Popen(cmd, - shell=True, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) - out = p.communicate()[0] - out = out.decode('utf8') - return p.returncode,out - else: - return subprocess.call(cmd,shell=True) - - -def parse_args(): - "Parses args from command line" - parser = OptionParser() - parser.add_option("--install", - action="store_true", - dest="install", - default=False, - help="Install `package_name`, not just its dependencies.") - - # where to install - parser.add_option("--prefix", - dest="prefix", - default="uberenv_libs", - help="destination directory") - - # what compiler to use - parser.add_option("--spec", - dest="spec", - default=None, - help="spack compiler spec") - - # optional location of spack mirror - parser.add_option("--mirror", - dest="mirror", - default=None, - help="spack mirror directory") - - # flag to create mirror - parser.add_option("--create-mirror", - action="store_true", - dest="create_mirror", - default=False, - help="Create spack mirror") - - # optional location of spack upstream - parser.add_option("--upstream", - dest="upstream", - default=None, - help="add an external spack instance as upstream") - - # this option allows a user to explicitly to select a - # group of spack settings files (compilers.yaml , packages.yaml) - parser.add_option("--spack-config-dir", - dest="spack_config_dir", - default=None, - help="dir with spack settings files (compilers.yaml, packages.yaml, etc)") - - # overrides package_name - parser.add_option("--package-name", - dest="package_name", - default=None, - help="override the default package name") - - # controls after which package phase spack should stop - parser.add_option("--package-final-phase", - dest="package_final_phase", - default=None, - help="override the default phase after which spack should stop") - - # controls source_dir spack should use to build the package - parser.add_option("--package-source-dir", - dest="package_source_dir", - default=None, - help="override the default source dir spack should use") - - # a file that holds settings for a specific project - # using uberenv.py - parser.add_option("--project-json", - dest="project_json", - default=pjoin(uberenv_script_dir(),"project.json"), - help="uberenv project settings json file") - - # flag to use insecure curl + git - parser.add_option("-k", - action="store_true", - dest="ignore_ssl_errors", - default=False, - help="Ignore SSL Errors") - - # option to force a spack pull - parser.add_option("--pull", - action="store_true", - dest="spack_pull", - default=False, - help="Pull if spack repo already exists") - - # option to force for clean of packages specified to - # be cleaned in the project.json - parser.add_option("--clean", - action="store_true", - dest="spack_clean", - default=False, - help="Force uninstall of packages specified in project.json") - - # option to tell spack to run tests - parser.add_option("--run_tests", - action="store_true", - dest="run_tests", - default=False, - help="Invoke build tests during spack install") - - # option to init osx sdk env flags - parser.add_option("--macos-sdk-env-setup", - action="store_true", - dest="macos_sdk_env_setup", - default=False, - help="Set several env vars to select OSX SDK settings." - "This was necessary for older versions of macOS " - " but can cause issues with macOS versions >= 10.13. 
" - " so it is disabled by default.") - - - ############### - # parse args - ############### - opts, extras = parser.parse_args() - # we want a dict b/c the values could - # be passed without using optparse - opts = vars(opts) - if not opts["spack_config_dir"] is None: - opts["spack_config_dir"] = os.path.abspath(opts["spack_config_dir"]) - if not os.path.isdir(opts["spack_config_dir"]): - print("[ERROR: invalid spack config dir: {} ]".format(opts["spack_config_dir"])) - sys.exit(-1) - # if rel path is given for the mirror, we need to evaluate here -- before any - # chdirs to avoid confusion related to what it is relative to. - # (it should be relative to where uberenv is run from, so it matches what you expect - # from shell completion, etc) - if not opts["mirror"] is None: - if not opts["mirror"].startswith("http") and not os.path.isabs(opts["mirror"]): - opts["mirror"] = os.path.abspath(opts["mirror"]) - return opts, extras - - -def uberenv_script_dir(): - # returns the directory of the uberenv.py script - return os.path.dirname(os.path.abspath(__file__)) - -def load_json_file(json_file): - # reads json file - return json.load(open(json_file)) - -def is_darwin(): - return "darwin" in platform.system().lower() - -def is_windows(): - return "windows" in platform.system().lower() - -class UberEnv(): - """ Base class for package manager """ - - def __init__(self, opts, extra_opts): - self.opts = opts - self.extra_opts = extra_opts - - # load project settings - self.project_opts = load_json_file(opts["project_json"]) - print("[uberenv project settings: {}]".format(str(self.project_opts))) - print("[uberenv options: {}]".format(str(self.opts))) - - def setup_paths_and_dirs(self): - self.uberenv_path = os.path.dirname(os.path.realpath(__file__)) - - def set_from_args_or_json(self,setting): - try: - setting_value = self.project_opts[setting] - except (KeyError): - print("ERROR: {} must at least be defined in project.json".format(setting)) - raise - else: - if self.opts[setting]: - setting_value = self.opts[setting] - return setting_value - - def set_from_json(self,setting): - try: - setting_value = self.project_opts[setting] - except (KeyError): - print("ERROR: {} must at least be defined in project.json".format(setting)) - raise - return setting_value - - def detect_platform(self): - # find supported sets of compilers.yaml, packages,yaml - res = None - if is_darwin(): - res = "darwin" - elif "SYS_TYPE" in os.environ.keys(): - sys_type = os.environ["SYS_TYPE"].lower() - res = sys_type - return res - - -class SpackEnv(UberEnv): - """ Helper to clone spack and install libraries on MacOS an Linux """ - - def __init__(self, opts, extra_opts): - UberEnv.__init__(self,opts,extra_opts) - - self.pkg_name = self.set_from_args_or_json("package_name") - self.pkg_version = self.set_from_json("package_version") - self.pkg_final_phase = self.set_from_args_or_json("package_final_phase") - self.pkg_src_dir = self.set_from_args_or_json("package_source_dir") - - self.spec_hash = "" - self.use_install = False - - # Some additional setup for macos - if is_darwin(): - if opts["macos_sdk_env_setup"]: - # setup osx deployment target and sdk settings - setup_osx_sdk_env_vars() - else: - print("[skipping MACOSX env var setup]") - - # setup default spec - if opts["spec"] is None: - if is_darwin(): - opts["spec"] = "%clang" - else: - opts["spec"] = "%gcc" - self.opts["spec"] = "@{}{}".format(self.pkg_version,opts["spec"]) - elif not opts["spec"].startswith("@"): - self.opts["spec"] = 
"@{}{}".format(self.pkg_version,opts["spec"]) - else: - self.opts["spec"] = "{}".format(opts["spec"]) - - print("[spack spec: {}]".format(self.opts["spec"])) - - def setup_paths_and_dirs(self): - # get the current working path, and the glob used to identify the - # package files we want to hot-copy to spack - - UberEnv.setup_paths_and_dirs(self) - - self.pkgs = pjoin(self.uberenv_path, "packages","*") - - # setup destination paths - self.dest_dir = os.path.abspath(self.opts["prefix"]) - self.dest_spack = pjoin(self.dest_dir,"spack") - print("[installing to: {0}]".format(self.dest_dir)) - - # print a warning if the dest path already exists - if not os.path.isdir(self.dest_dir): - os.mkdir(self.dest_dir) - else: - print("[info: destination '{}' already exists]".format(self.dest_dir)) - - if os.path.isdir(self.dest_spack): - print("[info: destination '{}' already exists]".format(self.dest_spack)) - - self.pkg_src_dir = os.path.join(self.uberenv_path,self.pkg_src_dir) - if not os.path.isdir(self.pkg_src_dir): - print("[ERROR: package_source_dir '{}' does not exist]".format(self.pkg_src_dir)) - sys.exit(-1) - - - def find_spack_pkg_path_from_hash(self, pkg_name, pkg_hash): - res, out = sexe("spack/bin/spack find -p /{}".format(pkg_hash), ret_output = True) - for l in out.split("\n"): - if l.startswith(pkg_name): - return {"name": pkg_name, "path": l.split()[-1]} - print("[ERROR: failed to find package named '{}']".format(pkg_name)) - sys.exit(-1) - - def find_spack_pkg_path(self, pkg_name, spec = ""): - res, out = sexe("spack/bin/spack find -p " + pkg_name + spec,ret_output = True) - for l in out.split("\n"): - # TODO: at least print a warning when several choices exist. This will - # pick the first in the list. - if l.startswith(pkg_name): - return {"name": pkg_name, "path": l.split()[-1]} - print("[ERROR: failed to find package named '{}']".format(pkg_name)) - sys.exit(-1) - - # Extract the first line of the full spec - def read_spack_full_spec(self,pkg_name,spec): - res, out = sexe("spack/bin/spack spec " + pkg_name + " " + spec, ret_output=True) - for l in out.split("\n"): - if l.startswith(pkg_name) and l.count("@") > 0 and l.count("arch=") > 0: - return l.strip() - - def clone_repo(self): - if not os.path.isdir(self.dest_spack): - - # compose clone command for the dest path, spack url and branch - print("[info: cloning spack develop branch from github]") - - os.chdir(self.dest_dir) - - clone_opts = ("-c http.sslVerify=false " - if self.opts["ignore_ssl_errors"] else "") - - spack_url = self.project_opts.get("spack_url", "https://github.com/spack/spack.git") - spack_branch = self.project_opts.get("spack_branch", "develop") - - clone_cmd = "git {0} clone --single-branch --depth=1 -b {1} {2}".format(clone_opts, spack_branch,spack_url) - sexe(clone_cmd, echo=True) - - if "spack_commit" in self.project_opts: - # optionally, check out a specific commit - os.chdir(pjoin(self.dest_dir,"spack")) - sha1 = self.project_opts["spack_commit"] - res, current_sha1 = sexe("git log -1 --pretty=%H", ret_output=True) - if sha1 != current_sha1: - print("[info: using spack commit {}]".format(sha1)) - sexe("git stash", echo=True) - sexe("git fetch --depth=1 origin {0}".format(sha1),echo=True) - sexe("git checkout {0}".format(sha1),echo=True) - - if self.opts["spack_pull"]: - # do a pull to make sure we have the latest - os.chdir(pjoin(self.dest_dir,"spack")) - sexe("git stash", echo=True) - sexe("git pull", echo=True) - - def config_dir(self): - """ path to compilers.yaml, which we will use for spack's compiler 
setup""" - spack_config_dir = self.opts["spack_config_dir"] - if spack_config_dir is None: - uberenv_plat = self.detect_platform() - if not uberenv_plat is None: - spack_config_dir = os.path.abspath(pjoin(self.uberenv_path,"spack_configs",uberenv_plat)) - return spack_config_dir - - - def disable_spack_config_scopes(self,spack_dir): - # disables all config scopes except "defaults", which we will - # force our settings into - spack_lib_config = pjoin(spack_dir,"lib","spack","spack","config.py") - print("[disabling config scope (except defaults) in: {}]".format(spack_lib_config)) - cfg_script = open(spack_lib_config).read() - for cfg_scope_stmt in ["('system', os.path.join(spack.paths.system_etc_path, 'spack')),", - "('site', os.path.join(spack.paths.etc_path, 'spack')),", - "('user', spack.paths.user_config_path)"]: - cfg_script = cfg_script.replace(cfg_scope_stmt, - "#DISABLED BY UBERENV: " + cfg_scope_stmt) - open(spack_lib_config,"w").write(cfg_script) - - - def patch(self): - - cfg_dir = self.config_dir() - spack_dir = self.dest_spack - - # force spack to use only "defaults" config scope - self.disable_spack_config_scopes(spack_dir) - spack_etc_defaults_dir = pjoin(spack_dir,"etc","spack","defaults") - - # copy in "defaults" config.yaml - config_yaml = os.path.abspath(pjoin(self.uberenv_path,"spack_configs","config.yaml")) - sexe("cp {} {}/".format(config_yaml, spack_etc_defaults_dir ), echo=True) - - # copy in other settings per platform - if not cfg_dir is None: - print("[copying uberenv compiler and packages settings from {0}]".format(cfg_dir)) - - config_yaml = pjoin(cfg_dir,"config.yaml") - compilers_yaml = pjoin(cfg_dir,"compilers.yaml") - packages_yaml = pjoin(cfg_dir,"packages.yaml") - - if os.path.isfile(config_yaml): - sexe("cp {} {}/".format(config_yaml , spack_etc_defaults_dir ), echo=True) - - if os.path.isfile(compilers_yaml): - sexe("cp {} {}/".format(compilers_yaml, spack_etc_defaults_dir ), echo=True) - - if os.path.isfile(packages_yaml): - sexe("cp {} {}/".format(packages_yaml, spack_etc_defaults_dir ), echo=True) - else: - # let spack try to auto find compilers - sexe("spack/bin/spack compiler find", echo=True) - - # hot-copy our packages into spack - if self.pkgs: - dest_spack_pkgs = pjoin(spack_dir,"var","spack","repos","builtin","packages") - print("[copying patched packages from {0}]".format(self.pkgs)) - sexe("cp -Rf {} {}".format(self.pkgs,dest_spack_pkgs)) - - - def clean_build(self): - # clean out any temporary spack build stages - cln_cmd = "spack/bin/spack clean " - res = sexe(cln_cmd, echo=True) - - # clean out any spack cached stuff - cln_cmd = "spack/bin/spack clean --all" - res = sexe(cln_cmd, echo=True) - - # check if we need to force uninstall of selected packages - if self.opts["spack_clean"]: - if self.project_opts.has_key("spack_clean_packages"): - for cln_pkg in self.project_opts["spack_clean_packages"]: - if not self.find_spack_pkg_path(cln_pkg) is None: - unist_cmd = "spack/bin/spack uninstall -f -y --all --dependents " + cln_pkg - res = sexe(unist_cmd, echo=True) - - def show_info(self): - # prints install status and 32 characters hash - options="--install-status --very-long" - spec_cmd = "spack/bin/spack spec {0} {1}{2}".format(options,self.pkg_name,self.opts["spec"]) - - res, out = sexe(spec_cmd, ret_output=True, echo=True) - print(out) - - #Check if spec is already installed - for line in out.split("\n"): - # Example of matching line: ("status" "hash" "package"...) 
- # [+] hf3cubkgl74ryc3qwen73kl4yfh2ijgd serac@develop%clang@10.0.0-apple~debug~devtools~glvis arch=darwin-mojave-x86_64 - if re.match(r"^(\[\+\]| - ) [a-z0-9]{32} " + re.escape(self.pkg_name), line): - self.spec_hash = line.split(" ")[1] - # if spec already installed - if line.startswith("[+]"): - pkg_path = self.find_spack_pkg_path_from_hash(self.pkg_name,self.spec_hash) - install_path = pkg_path["path"] - # testing that the path exists is mandatory until Spack team fixes - # https://github.com/spack/spack/issues/16329 - if os.path.isdir(install_path): - print("[Warning: {} {} has already been installed in {}]".format(self.pkg_name, self.opts["spec"],install_path)) - print("[Warning: Uberenv will proceed using this directory]".format(self.pkg_name)) - self.use_install = True - - return res - - def install(self): - # use the uberenv package to trigger the right builds - # and build an host-config.cmake file - - if not self.use_install: - install_cmd = "spack/bin/spack " - if self.opts["ignore_ssl_errors"]: - install_cmd += "-k " - if not self.opts["install"]: - install_cmd += "dev-build --quiet -d {} -u {} ".format(self.pkg_src_dir,self.pkg_final_phase) - else: - install_cmd += "install " - if self.opts["run_tests"]: - install_cmd += "--test=root " - install_cmd += self.pkg_name + self.opts["spec"] - res = sexe(install_cmd, echo=True) - - if res != 0: - print("[ERROR: failure of spack install/dev-build]") - return res - - full_spec = self.read_spack_full_spec(self.pkg_name,self.opts["spec"]) - if "spack_activate" in self.project_opts: - print("[activating dependent packages]") - # get the full spack spec for our project - pkg_names = self.project_opts["spack_activate"].keys() - for pkg_name in pkg_names: - pkg_spec_requirements = self.project_opts["spack_activate"][pkg_name] - activate=True - for req in pkg_spec_requirements: - if req not in full_spec: - activate=False - break - if activate: - activate_cmd = "spack/bin/spack activate " + pkg_name - sexe(activate_cmd, echo=True) - # note: this assumes package extends python when +python - # this may fail general cases - if self.opts["install"] and "+python" in full_spec: - activate_cmd = "spack/bin/spack activate /" + self.spec_hash - sexe(activate_cmd, echo=True) - # if user opt'd for an install, we want to symlink the final - # install to an easy place: - if self.opts["install"] or self.use_install: - pkg_path = self.find_spack_pkg_path_from_hash(self.pkg_name, self.spec_hash) - if self.pkg_name != pkg_path["name"]: - print("[ERROR: Could not find install of {}]".format(self.pkg_name)) - return -1 - else: - # Symlink host-config file - hc_glob = glob.glob(pjoin(pkg_path["path"],"*.cmake")) - if len(hc_glob) > 0: - hc_path = hc_glob[0] - hc_fname = os.path.split(hc_path)[1] - if os.path.islink(hc_fname): - os.unlink(hc_fname) - elif os.path.isfile(hc_fname): - sexe("rm -f {}".format(hc_fname)) - print("[symlinking host config file to {}]".format(pjoin(self.dest_dir,hc_fname))) - os.symlink(hc_path,hc_fname) - - # Symlink install directory - if self.opts["install"]: - pkg_lnk_dir = "{}-install".format(self.pkg_name) - if os.path.islink(pkg_lnk_dir): - os.unlink(pkg_lnk_dir) - print("") - print("[symlinking install to {}]".format(pjoin(self.dest_dir,pkg_lnk_dir))) - os.symlink(pkg_path["path"],os.path.abspath(pkg_lnk_dir)) - print("") - print("[install complete!]") - # otherwise we are in the "only dependencies" case and the host-config - # file has to be copied from the do-be-deleted spack-build dir. 
- else: - pattern = "*{}.cmake".format(self.pkg_name) - build_dir = pjoin(self.pkg_src_dir,"spack-build") - hc_glob = glob.glob(pjoin(build_dir,pattern)) - if len(hc_glob) > 0: - hc_path = hc_glob[0] - hc_fname = os.path.split(hc_path)[1] - if os.path.islink(hc_fname): - os.unlink(hc_fname) - print("[copying host config file to {}]".format(pjoin(self.dest_dir,hc_fname))) - sexe("cp {} {}".format(hc_path,hc_fname)) - print("[removing project build directory {}]".format(pjoin(build_dir))) - sexe("rm -rf {}".format(build_dir)) - - def get_mirror_path(self): - mirror_path = self.opts["mirror"] - if not mirror_path: - print("[--create-mirror requires a mirror directory]") - sys.exit(-1) - return mirror_path - - def create_mirror(self): - """ - Creates a spack mirror for pkg_name at mirror_path. - """ - - mirror_path = self.get_mirror_path() - - mirror_cmd = "spack/bin/spack " - if self.opts["ignore_ssl_errors"]: - mirror_cmd += "-k " - mirror_cmd += "mirror create -d {} --dependencies {}{}".format(mirror_path, - self.pkg_name, - self.opts["spec"]) - return sexe(mirror_cmd, echo=True) - - def find_spack_mirror(self, mirror_name): - """ - Returns the path of a defaults scoped spack mirror with the - given name, or None if no mirror exists. - """ - res, out = sexe("spack/bin/spack mirror list", ret_output=True) - mirror_path = None - for mirror in out.split('\n'): - if mirror: - parts = mirror.split() - if parts[0] == mirror_name: - mirror_path = parts[1] - return mirror_path - - def use_mirror(self): - """ - Configures spack to use mirror at a given path. - """ - mirror_name = self.pkg_name - mirror_path = self.get_mirror_path() - existing_mirror_path = self.find_spack_mirror(mirror_name) - - if existing_mirror_path and mirror_path != existing_mirror_path: - # Existing mirror has different URL, error out - print("[removing existing spack mirror `{}` @ {}]".format(mirror_name, - existing_mirror_path)) - # - # Note: In this case, spack says it removes the mirror, but we still - # get errors when we try to add a new one, sounds like a bug - # - sexe("spack/bin/spack mirror remove --scope=defaults {} ".format(mirror_name), - echo=True) - existing_mirror_path = None - if not existing_mirror_path: - # Add if not already there - sexe("spack/bin/spack mirror add --scope=defaults {} {}".format( - mirror_name, mirror_path), echo=True) - print("[using mirror {}]".format(mirror_path)) - - def find_spack_upstream(self, upstream_name): - """ - Returns the path of a defaults scoped spack upstream with the - given name, or None if no upstream exists. - """ - upstream_path = None - - res, out = sexe('spack/bin/spack config get upstreams', ret_output=True) - if (not out) and ("upstreams:" in out): - out = out.replace(' ', '') - out = out.replace('install_tree:', '') - out = out.replace(':', '') - out = out.splitlines() - out = out[1:] - upstreams = dict(zip(out[::2], out[1::2])) - - for name in upstreams.keys(): - if name == upstream_name: - upstream_path = upstreams[name] - - return upstream_path - - def use_spack_upstream(self): - """ - Configures spack to use upstream at a given path. 
- """ - upstream_path = self.opts["upstream"] - if not upstream_path: - print("[--create-upstream requires a upstream directory]") - sys.exit(-1) - upstream_path = os.path.abspath(upstream_path) - upstream_name = self.pkg_name - existing_upstream_path = self.find_spack_upstream(upstream_name) - if (not existing_upstream_path) or (upstream_path != os.path.abspath(existing_upstream_path)): - # Existing upstream has different URL, error out - print("[removing existing spack upstream configuration file]") - sexe("rm spack/etc/spack/defaults/upstreams.yaml") - with open('spack/etc/spack/defaults/upstreams.yaml','w+') as upstreams_cfg_file: - upstreams_cfg_file.write("upstreams:\n") - upstreams_cfg_file.write(" {}:\n".format(upstream_name)) - upstreams_cfg_file.write(" install_tree: {}\n".format(upstream_path)) - - -def find_osx_sdks(): - """ - Finds installed osx sdks, returns dict mapping version to file system path - """ - res = {} - sdks = glob.glob("/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX*.sdk") - for sdk in sdks: - sdk_base = os.path.split(sdk)[1] - ver = sdk_base[len("MacOSX"):sdk_base.rfind(".")] - res[ver] = sdk - return res - -def setup_osx_sdk_env_vars(): - """ - Finds installed osx sdks, returns dict mapping version to file system path - """ - # find current osx version (10.11.6) - dep_tgt = platform.mac_ver()[0] - # sdk file names use short version (ex: 10.11) - dep_tgt_short = dep_tgt[:dep_tgt.rfind(".")] - # find installed sdks, ideally we want the sdk that matches the current os - sdk_root = None - sdks = find_osx_sdks() - if dep_tgt_short in sdks.keys(): - # matches our osx, use this one - sdk_root = sdks[dep_tgt_short] - elif len(sdks) > 0: - # for now, choose first one: - dep_tgt = sdks.keys()[0] - sdk_root = sdks[dep_tgt] - else: - # no valid sdks, error out - print("[ERROR: Could not find OSX SDK @ /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/]") - sys.exit(-1) - - env["MACOSX_DEPLOYMENT_TARGET"] = dep_tgt - env["SDKROOT"] = sdk_root - print("[setting MACOSX_DEPLOYMENT_TARGET to {}]".format(env["MACOSX_DEPLOYMENT_TARGET"])) - print("[setting SDKROOT to {}]".format(env[ "SDKROOT"])) - - - -def main(): - """ - Clones and runs a package manager to setup third_party libs. - Also creates a host-config.cmake file that can be used by our project. 
- """ - - # parse args from command line - opts, extra_opts = parse_args() - - # Initialize the environment - env = SpackEnv(opts, extra_opts) - - # Setup the necessary paths and directories - env.setup_paths_and_dirs() - - # Clone the package manager - env.clone_repo() - - os.chdir(env.dest_dir) - - # Patch the package manager, as necessary - env.patch() - - # Clean the build - env.clean_build() - - # Show the spec for what will be built - env.show_info() - - - ########################################################## - # we now have an instance of spack configured how we - # need it to build our tpls at this point there are two - # possible next steps: - # - # *) create a mirror of the packages - # OR - # *) build - # - ########################################################## - if opts["create_mirror"]: - return env.create_mirror() - else: - if not opts["mirror"] is None: - env.use_mirror() - - if not opts["upstream"] is None: - env.use_spack_upstream() - - res = env.install() - - return res - -if __name__ == "__main__": - sys.exit(main()) - - From bd799656cc235870bf0dec86ff150b07b5a6f4eb Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Mon, 19 Jul 2021 15:47:25 -0700 Subject: [PATCH 028/392] changing name in package.py to RajaPerf --- scripts/spack_packages/raja_perf/package.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/spack_packages/raja_perf/package.py b/scripts/spack_packages/raja_perf/package.py index 00ac78959..046eb73de 100644 --- a/scripts/spack_packages/raja_perf/package.py +++ b/scripts/spack_packages/raja_perf/package.py @@ -51,11 +51,11 @@ def get_spec_path(spec, package_name, path_replacements = {}, use_bin = False) : return path -class Raja(CMakePackage, CudaPackage): - """RAJA Parallel Framework.""" +class RajaPerf(CMakePackage, CudaPackage): + """RAJAPerf Suite Framework.""" - homepage = "http://software.llnl.gov/RAJA/" - git = "https://github.com/LLNL/RAJA.git" + homepage = "http://software.llnl.gov/RAJAPerf/" + git = "https://github.com/LLNL/RAJAPerf.git" version('develop', branch='develop', submodules='True') version('main', branch='main', submodules='True') From daa9c27afea6e261a22eb6d756128aad24c3d838 Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Tue, 20 Jul 2021 12:54:12 -0700 Subject: [PATCH 029/392] adding a uberenv config file --- .uberenv_config.json | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 .uberenv_config.json diff --git a/.uberenv_config.json b/.uberenv_config.json new file mode 100644 index 000000000..701dbb794 --- /dev/null +++ b/.uberenv_config.json @@ -0,0 +1,12 @@ +{ +"package_name" : "raja-perf", +"package_version" : "develop", +"package_final_phase" : "hostconfig", +"package_source_dir" : "../..", +"spack_url": "https://github.com/davidbeckingsale/spack", +"spack_branch": "feature/allow-untested-cuda-versions", +"spack_commit": "f96e256bee1948aa030916aae0c1b2645230fb9f", +"spack_activate" : {}, +"spack_configs_path": "scripts/radiuss-spack-configs", +"spack_packages_path": "scripts/spack_packages" +} From 49ae6ebb404d4423b0ad109e6c9ba886c53f0a04 Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Tue, 20 Jul 2021 15:01:36 -0700 Subject: [PATCH 030/392] fixed the typo in the package name --- .uberenv_config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.uberenv_config.json b/.uberenv_config.json index 701dbb794..5596d5a8b 100644 --- a/.uberenv_config.json +++ b/.uberenv_config.json @@ -1,5 +1,5 @@ { -"package_name" : "raja-perf", +"package_name" : 
"raja_perf", "package_version" : "develop", "package_final_phase" : "hostconfig", "package_source_dir" : "../..", From 36436f60c728766176a0864605065ccb2203b5e2 Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Wed, 21 Jul 2021 08:17:34 -0700 Subject: [PATCH 031/392] adding the hip package --- scripts/spack_packages/hip/package.py | 55 +++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 scripts/spack_packages/hip/package.py diff --git a/scripts/spack_packages/hip/package.py b/scripts/spack_packages/hip/package.py new file mode 100644 index 000000000..2849fb761 --- /dev/null +++ b/scripts/spack_packages/hip/package.py @@ -0,0 +1,55 @@ +from spack import * + + +class Hip(CMakePackage): + """HIP is a C++ Runtime API and Kernel Language that allows developers to + create portable applications for AMD and NVIDIA GPUs from + single source code.""" + + homepage = "https://github.com/ROCm-Developer-Tools/HIP" + url = "https://github.com/ROCm-Developer-Tools/HIP/archive/rocm-3.10.0.tar.gz" + + maintainers = ['srekolam', 'arjun-raj-kuppala'] + + version('3.10.0', sha256='0082c402f890391023acdfd546760f41cb276dffc0ffeddc325999fd2331d4e8') + version('3.9.0', sha256='25ad58691456de7fd9e985629d0ed775ba36a2a0e0b21c086bd96ba2fb0f7ed1') + + depends_on('cmake@3:', type='build') + depends_on('perl@5.10:', type=('build', 'run')) + depends_on('mesa~llvm@18.3:') + + for ver in ['3.9.0', '3.10.0']: + depends_on('rocclr@' + ver, type='build', when='@' + ver) + depends_on('hsakmt-roct@' + ver, type='build', when='@' + ver) + depends_on('hsa-rocr-dev@' + ver, type='link', when='@' + ver) + depends_on('comgr@' + ver, type='build', when='@' + ver) + depends_on('llvm-amdgpu@' + ver, type='build', when='@' + ver) + depends_on('rocm-device-libs@' + ver, type='build', when='@' + ver) + depends_on('rocminfo@' + ver, type='build', when='@' + ver) + + def setup_dependent_package(self, module, dependent_spec): + self.spec.hipcc = join_path(self.prefix.bin, 'hipcc') + + @run_before('install') + def filter_sbang(self): + perl = self.spec['perl'].command + kwargs = {'ignore_absent': False, 'backup': False, 'string': False} + + with working_dir('bin'): + match = '^#!/usr/bin/perl' + substitute = "#!{perl}".format(perl=perl) + files = [ + 'hipify-perl', 'hipcc', 'extractkernel', + 'hipconfig', 'hipify-cmakefile' + ] + filter_file(match, substitute, *files, **kwargs) + + def cmake_args(self): + args = [ + '-DHIP_COMPILER=clang', + '-DHIP_PLATFORM=rocclr', + '-DHSA_PATH={0}'.format(self.spec['hsa-rocr-dev'].prefix), + '-DLIBROCclr_STATIC_DIR={0}/lib'.format(self.spec['rocclr'].prefix) + ] + return args + From d08dfb234b7b7fb84936e6c135addcb1669c9727 Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Mon, 26 Jul 2021 13:26:54 -0700 Subject: [PATCH 032/392] trying to use quartz instead of ruby for now --- .gitlab-ci.yml | 14 ++++---- .gitlab/{ruby-jobs.yml => quartz-jobs.yml} | 14 ++++---- ...uby-templates.yml => quartz-templates.yml} | 32 +++++++++---------- scripts/gitlab/build_and_test.sh | 2 +- 4 files changed, 31 insertions(+), 31 deletions(-) rename .gitlab/{ruby-jobs.yml => quartz-jobs.yml} (76%) rename .gitlab/{ruby-templates.yml => quartz-templates.yml} (64%) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 4794d752e..a0142380a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -18,7 +18,7 @@ # Tells Gitlab to recursively update the submodules when cloning umpire # ALLOC_NAME: -# On LLNL's ruby, this pipeline creates only one allocation shared among jobs +# On LLNL's quartz, this pipeline 
creates only one allocation shared among jobs # in order to save time and resources. This allocation has to be uniquely named # so that we are sure to retrieve it. @@ -42,12 +42,12 @@ variables: # Normally, stages are blocking in Gitlab. However, using the keyword "needs" we # can express dependencies between job that break the ordering of stages, in # favor of a DAG. -# In practice r_*, l_* and b_* stages are independently run and start immediately. +# In practice q_*, l_* and b_* stages are independently run and start immediately. stages: - - r_allocate_resources - - r_build_and_test - - r_release_resources + - q_allocate_resources + - q_build_and_test + - q_release_resources - l_build_and_test - b_build_and_test - c_build_and_test @@ -105,8 +105,8 @@ trigger-chai: # This is where jobs are included. include: - - local: .gitlab/ruby-templates.yml - - local: .gitlab/ruby-jobs.yml + - local: .gitlab/quartz-templates.yml + - local: .gitlab/quartz-jobs.yml - local: .gitlab/lassen-templates.yml - local: .gitlab/lassen-jobs.yml - local: .gitlab/corona-templates.yml diff --git a/.gitlab/ruby-jobs.yml b/.gitlab/quartz-jobs.yml similarity index 76% rename from .gitlab/ruby-jobs.yml rename to .gitlab/quartz-jobs.yml index 2109043d4..ed6fb10cb 100644 --- a/.gitlab/ruby-jobs.yml +++ b/.gitlab/quartz-jobs.yml @@ -8,36 +8,36 @@ clang_10: variables: SPEC: "%clang@10.0.1" - extends: .build_and_test_on_ruby + extends: .build_and_test_on_quartz clang_9: variables: SPEC: "%clang@9.0.0" - extends: .build_and_test_on_ruby + extends: .build_and_test_on_quartz gcc_8_1_0: variables: SPEC: "%gcc@8.1.0" DEFAULT_TIME: 60 - extends: .build_and_test_on_ruby + extends: .build_and_test_on_quartz icpc_17_0_2: variables: SPEC: "%intel@17.0.2" DEFAULT_TIME: 40 - extends: .build_and_test_on_ruby + extends: .build_and_test_on_quartz icpc_18_0_2: variables: SPEC: " tests=none %intel@18.0.2" DEFAULT_TIME: 40 - extends: .build_and_test_on_ruby + extends: .build_and_test_on_quartz icpc_19_1_0: variables: SPEC: "%intel@19.1.0" DEFAULT_TIME: 40 - extends: .build_and_test_on_ruby + extends: .build_and_test_on_quartz # EXTRAS @@ -45,4 +45,4 @@ gcc_4_9_3: variables: SPEC: "%gcc@4.9.3" DEFAULT_TIME: 60 - extends: .build_and_test_on_ruby + extends: .build_and_test_on_quartz diff --git a/.gitlab/ruby-templates.yml b/.gitlab/quartz-templates.yml similarity index 64% rename from .gitlab/ruby-templates.yml rename to .gitlab/quartz-templates.yml index ae4079c67..676d972b7 100644 --- a/.gitlab/ruby-templates.yml +++ b/.gitlab/quartz-templates.yml @@ -6,16 +6,16 @@ ############################################################################### #### -# This is the shared configuration of jobs for ruby +# This is the shared configuration of jobs for quartz #### # In pre-build phase, allocate a node for builds -.on_ruby: +.on_quartz: tags: - shell - - ruby + - quartz rules: - - if: '$CI_COMMIT_BRANCH =~ /_qnone/ || $ON_RUBY == "OFF"' #run except if ... + - if: '$CI_COMMIT_BRANCH =~ /_qnone/ || $ON_QUARTZ == "OFF"' #run except if ... 
when: never - if: '$CI_JOB_NAME =~ /release_resources/' when: always @@ -24,31 +24,31 @@ ### # In pre-build phase, allocate a node for builds # NOTE: Not specifying 'salloc -c 56' should allocate the max number of CPU cores -allocate_resources (on ruby): +allocate_resources (on quartz): variables: GIT_STRATEGY: none - extends: .on_ruby - stage: r_allocate_resources + extends: .on_quartz + stage: q_allocate_resources script: - salloc -N 1 -p pdebug -t 45 --no-shell --job-name=${ALLOC_NAME} ### # In post-build phase, deallocate resources # Note : make sure this is run even on build phase failure -release_resources (on ruby): +release_resources (on quartz): variables: GIT_STRATEGY: none - extends: .on_ruby - stage: r_release_resources + extends: .on_quartz + stage: q_release_resources script: - export JOBID=$(squeue -h --name=${ALLOC_NAME} --format=%A) - ([[ -n "${JOBID}" ]] && scancel ${JOBID}) ### -# Generic ruby build job, extending build script -.build_and_test_on_ruby: - extends: [.build_toss_3_x86_64_ib_script, .on_ruby] - stage: r_build_and_test +# Generic quartz build job, extending build script +.build_and_test_on_quartz: + extends: [.build_toss_3_x86_64_ib_script, .on_quartz] + stage: q_build_and_test -.build_and_test_on_ruby_advanced: - extends: [.build_and_test_on_ruby, .advanced_pipeline] +.build_and_test_on_quartz_advanced: + extends: [.build_and_test_on_quartz, .advanced_pipeline] diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index b880ffa97..ba99e3b39 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -113,7 +113,7 @@ then echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" # Map CPU core allocations - declare -A core_counts=(["lassen"]=40 ["ruby"]=28 ["corona"]=32) + declare -A core_counts=(["lassen"]=40 ["quartz"]=28 ["corona"]=32) # If building, then delete everything first # NOTE: 'cmake --build . -j core_counts' attempts to reduce individual build resources. From d47f9ae8b60f9861268e83c30dc063c9a3be29ce Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Mon, 26 Jul 2021 13:48:09 -0700 Subject: [PATCH 033/392] configuring the rajaperf suite runs --- scripts/gitlab/build_and_test.sh | 33 ++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index ba99e3b39..963f0bcac 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -129,3 +129,36 @@ then cmake --build . -j ${core_counts[$truehostname]} date fi + +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +echo "~~~~~ RUNNING RAJAPERF SUITE" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + +if [[ ! 
-d ${build_dir} ]] +then + echo "ERROR: Build directory not found : ${build_dir}" && exit 1 +fi + +cd ${build_dir} +if [[grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} ]] +then + ./bin/raja-perf.exe -sp +else + ./bin/raja-perf.exe --checkrun -sp +fi + +echo "Copying Testing xml reports for export" +tree Testing +xsltproc -o junit.xml ${project_dir}/blt/tests/ctest-to-junit.xsl Testing/*/Test.xml +mv junit.xml ${project_dir}/junit.xml + +if grep -q "Errors while running CTest" ./tests_output.txt +then + echo "ERROR: failure(s) while running CTest" && exit 1 +fi + +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +echo "~~~~~ CLEAN UP" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +make clean + From d7e32b92a00492a73c33e17ebff426e33e224dcd Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Mon, 26 Jul 2021 14:17:00 -0700 Subject: [PATCH 034/392] fixing gitlab build and run script --- scripts/gitlab/build_and_test.sh | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index 963f0bcac..f6cbe1ad6 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -147,16 +147,6 @@ else ./bin/raja-perf.exe --checkrun -sp fi -echo "Copying Testing xml reports for export" -tree Testing -xsltproc -o junit.xml ${project_dir}/blt/tests/ctest-to-junit.xsl Testing/*/Test.xml -mv junit.xml ${project_dir}/junit.xml - -if grep -q "Errors while running CTest" ./tests_output.txt -then - echo "ERROR: failure(s) while running CTest" && exit 1 -fi - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" echo "~~~~~ CLEAN UP" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" From 6b1fcec21595856e622835a1c13c10bb6ab882b0 Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Mon, 26 Jul 2021 15:06:46 -0700 Subject: [PATCH 035/392] fixing grep typo --- scripts/gitlab/build_and_test.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index f6cbe1ad6..0c631696c 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -140,7 +140,8 @@ then fi cd ${build_dir} -if [[grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} ]] + +if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} then ./bin/raja-perf.exe -sp else From 1a9f8e4431a81490b5bba930885672d7da599b10 Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Tue, 27 Jul 2021 08:25:15 -0700 Subject: [PATCH 036/392] updating hip package file --- scripts/spack_packages/hip/package.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/spack_packages/hip/package.py b/scripts/spack_packages/hip/package.py index 2849fb761..f99d26dc6 100644 --- a/scripts/spack_packages/hip/package.py +++ b/scripts/spack_packages/hip/package.py @@ -7,18 +7,18 @@ class Hip(CMakePackage): single source code.""" homepage = "https://github.com/ROCm-Developer-Tools/HIP" - url = "https://github.com/ROCm-Developer-Tools/HIP/archive/rocm-3.10.0.tar.gz" + url = "https://github.com/ROCm-Developer-Tools/HIP/archive/refs/tags/rocm-4.0.0.tar.gz" maintainers = ['srekolam', 'arjun-raj-kuppala'] - version('3.10.0', sha256='0082c402f890391023acdfd546760f41cb276dffc0ffeddc325999fd2331d4e8') - version('3.9.0', sha256='25ad58691456de7fd9e985629d0ed775ba36a2a0e0b21c086bd96ba2fb0f7ed1') + version('4.1.0', sha256='25ad58691456de7fd9e985629d0ed775ba36a2a0e0b21c086bd96ba2fb0f7ed1') + version('4.0.0', 
sha256='0082c402f890391023acdfd546760f41cb276dffc0ffeddc325999fd2331d4e8') depends_on('cmake@3:', type='build') depends_on('perl@5.10:', type=('build', 'run')) depends_on('mesa~llvm@18.3:') - for ver in ['3.9.0', '3.10.0']: + for ver in ['4.0.0', '4.1.0']: depends_on('rocclr@' + ver, type='build', when='@' + ver) depends_on('hsakmt-roct@' + ver, type='build', when='@' + ver) depends_on('hsa-rocr-dev@' + ver, type='link', when='@' + ver) From 081f04306c50aa1268aaff683f7a3a248120808a Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Tue, 3 Aug 2021 16:10:12 -0700 Subject: [PATCH 037/392] skipping the tests for xl+cuda jobs --- .gitlab/lassen-jobs.yml | 4 ++-- scripts/gitlab/build_and_test.sh | 12 ++++++++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/.gitlab/lassen-jobs.yml b/.gitlab/lassen-jobs.yml index 95e20a891..673c69f24 100644 --- a/.gitlab/lassen-jobs.yml +++ b/.gitlab/lassen-jobs.yml @@ -57,14 +57,14 @@ gcc_8_3_1_cuda: xl_16_1_1_7_cuda: variables: - SPEC: "+cuda %xl@16.1.1.7 cuda_arch=70 ^cuda@10.1.168 ^cmake@3.14.5" + SPEC: "+cuda %xl@16.1.1.7 tests=none cuda_arch=70 ^cuda@10.1.168 ^cmake@3.14.5" DEFAULT_TIME: 60 allow_failure: true extends: .build_and_test_on_lassen xl_16_1_1_7_gcc_8_3_1_cuda_11: variables: - SPEC: "+cuda %xl@16.1.1.7 cuda_arch=70 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 ^cuda@11.0.2 ^cmake@3.14.5" + SPEC: "+cuda %xl@16.1.1.7 tests=none cuda_arch=70 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 ^cuda@11.0.2 ^cmake@3.14.5" DEFAULT_TIME: 60 allow_failure: true extends: .build_and_test_on_lassen diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index 0c631696c..8f07e38b1 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -141,11 +141,15 @@ fi cd ${build_dir} -if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} +if grep -q -i "ENABLE_TESTS.*ON" ${hostconfig_path} then - ./bin/raja-perf.exe -sp -else - ./bin/raja-perf.exe --checkrun -sp + + if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} + then + ./bin/raja-perf.exe -sp + else + ./bin/raja-perf.exe --checkrun -sp + fi fi echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" From dac4a999da84e86a14ce389b462e2e188404bba6 Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Wed, 4 Aug 2021 13:53:22 -0700 Subject: [PATCH 038/392] updating ci jobs to clang 11 and removing icpc 18 job --- .gitlab/lassen-jobs.yml | 24 ++++++++++++------------ .gitlab/quartz-jobs.yml | 6 ------ scripts/radiuss-spack-configs | 2 +- 3 files changed, 13 insertions(+), 19 deletions(-) diff --git a/.gitlab/lassen-jobs.yml b/.gitlab/lassen-jobs.yml index 673c69f24..7332baebd 100644 --- a/.gitlab/lassen-jobs.yml +++ b/.gitlab/lassen-jobs.yml @@ -9,14 +9,14 @@ ## CPU ONLY ########### -ibm_clang_9: +ibm_clang_11: variables: - SPEC: "%clang@9.0.0ibm" + SPEC: "%clang@11.0.0ibm" extends: .build_and_test_on_lassen -ibm_clang_9_gcc_8: +ibm_clang_11_gcc_8: variables: - SPEC: "%clang@9.0.0ibm cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1" + SPEC: "%clang@11.0.0ibm cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1" extends: .build_and_test_on_lassen gcc_8_3_1: @@ -40,14 +40,14 @@ xl_16_1_1_7_gcc_8_3_1: ## CUDA ########### -ibm_clang_9_cuda: +ibm_clang_11_cuda: variables: - SPEC: 
"+cuda+allow-untested-versions cuda_arch=70 %clang@9.0.0ibm ^cuda@10.1.168" + SPEC: "+cuda+allow-untested-versions cuda_arch=70 %clang@11.0.0ibm ^cuda@10.1.168" extends: .build_and_test_on_lassen -ibm_clang_9_gcc_8_cuda: +ibm_clang_11_gcc_8_cuda: variables: - SPEC: "+cuda %clang@9.0.0ibm cuda_arch=70 +allow-untested-versions cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 ^cuda@10.1.168" + SPEC: "+cuda %clang@11.0.0ibm cuda_arch=70 +allow-untested-versions cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 ^cuda@10.1.168" extends: .build_and_test_on_lassen gcc_8_3_1_cuda: @@ -73,13 +73,13 @@ xl_16_1_1_7_gcc_8_3_1_cuda_11: ## EXTRAS ########### -clang_9_0_0_libcpp (build and test on lassen): +clang_11_0_0_libcpp (build and test on lassen): variables: - SPEC: "%clang@9.0.0+libcpp" + SPEC: "%clang@11.0.0+libcpp" extends: .build_and_test_on_lassen -clang_9_0_0_memleak (build and test on lassen): +clang_11_0_0_memleak (build and test on lassen): variables: - SPEC: "%clang@9.0.0 cxxflags=-fsanitize=address" + SPEC: "%clang@11.0.0 cxxflags=-fsanitize=address" ASAN_OPTIONS: "detect_leaks=1" extends: .build_and_test_on_lassen diff --git a/.gitlab/quartz-jobs.yml b/.gitlab/quartz-jobs.yml index ed6fb10cb..a1159c5b3 100644 --- a/.gitlab/quartz-jobs.yml +++ b/.gitlab/quartz-jobs.yml @@ -27,12 +27,6 @@ icpc_17_0_2: DEFAULT_TIME: 40 extends: .build_and_test_on_quartz -icpc_18_0_2: - variables: - SPEC: " tests=none %intel@18.0.2" - DEFAULT_TIME: 40 - extends: .build_and_test_on_quartz - icpc_19_1_0: variables: SPEC: "%intel@19.1.0" diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index 292b30f98..b6aa61044 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit 292b30f981d325bbbba069d552bf4febdfdce938 +Subproject commit b6aa61044597ec2bd81f2eb70895ef277587b3af From 54f415a319643a02c5cef7d71d514a0c9ff6392e Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Wed, 4 Aug 2021 14:05:38 -0700 Subject: [PATCH 039/392] attempting to change the cmake build command for icpc jobs --- scripts/gitlab/build_and_test.sh | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index 8f07e38b1..00edaab47 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -126,7 +126,18 @@ then cmake \ -C ${hostconfig_path} \ ${project_dir} - cmake --build . -j ${core_counts[$truehostname]} + if grep -q -i "icpc" ${spec} + then + cmake --build . -j 16 + echo "~~~~~~~~~ Build Command: ~~~~~~~~~~~~~~~~~~~~~" + echo "cmake --build . -j 16" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + else + cmake --build . -j ${core_counts[$truehostname]} + echo "~~~~~~~~~ Build Command: ~~~~~~~~~~~~~~~~~~~~~" + echo "cmake --build . 
-j ${core_counts[$truehostname]}" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + fi date fi From e36be4b477ca77e9fb17fde3f538c55d6c54ecf0 Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Thu, 5 Aug 2021 09:52:47 -0700 Subject: [PATCH 040/392] updating radiuss config for clang11 and uberenv spack version --- .gitlab/lassen-jobs.yml | 4 ++-- .uberenv_config.json | 2 +- scripts/radiuss-spack-configs | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.gitlab/lassen-jobs.yml b/.gitlab/lassen-jobs.yml index 7332baebd..cfee9404d 100644 --- a/.gitlab/lassen-jobs.yml +++ b/.gitlab/lassen-jobs.yml @@ -42,12 +42,12 @@ xl_16_1_1_7_gcc_8_3_1: ibm_clang_11_cuda: variables: - SPEC: "+cuda+allow-untested-versions cuda_arch=70 %clang@11.0.0ibm ^cuda@10.1.168" + SPEC: "+cuda cuda_arch=70 %clang@11.0.0ibm ^cuda@10.1.168" extends: .build_and_test_on_lassen ibm_clang_11_gcc_8_cuda: variables: - SPEC: "+cuda %clang@11.0.0ibm cuda_arch=70 +allow-untested-versions cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 ^cuda@10.1.168" + SPEC: "+cuda %clang@11.0.0ibm cuda_arch=70 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 ^cuda@10.1.168" extends: .build_and_test_on_lassen gcc_8_3_1_cuda: diff --git a/.uberenv_config.json b/.uberenv_config.json index 5596d5a8b..c60de2b4b 100644 --- a/.uberenv_config.json +++ b/.uberenv_config.json @@ -5,7 +5,7 @@ "package_source_dir" : "../..", "spack_url": "https://github.com/davidbeckingsale/spack", "spack_branch": "feature/allow-untested-cuda-versions", -"spack_commit": "f96e256bee1948aa030916aae0c1b2645230fb9f", +"spack_commit": "46b22d0f6227f6b12bab712bda5b916a53cfc67d", "spack_activate" : {}, "spack_configs_path": "scripts/radiuss-spack-configs", "spack_packages_path": "scripts/spack_packages" diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index b6aa61044..a35701f59 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit b6aa61044597ec2bd81f2eb70895ef277587b3af +Subproject commit a35701f59b31976b673b312b0f7959a441087135 From 4a105531a538993a657994e5d40503ac3c86f92b Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Fri, 6 Aug 2021 13:15:23 -0700 Subject: [PATCH 041/392] updating intel jobs, trying to add flag to lassen jobs --- scripts/gitlab/build_and_test.sh | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index 00edaab47..c6cf0bca8 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -126,7 +126,7 @@ then cmake \ -C ${hostconfig_path} \ ${project_dir} - if grep -q -i "icpc" ${spec} + if grep -q -i "intel" ${spec} then cmake --build . 
-j 16 echo "~~~~~~~~~ Build Command: ~~~~~~~~~~~~~~~~~~~~~" @@ -154,12 +154,20 @@ cd ${build_dir} if grep -q -i "ENABLE_TESTS.*ON" ${hostconfig_path} then - - if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} - then - ./bin/raja-perf.exe -sp + if [[ ${truehostname} == "lassen" ]] + if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} + then + ./bin/raja-perf.exe --smpiargs="-disable_gpu_hooks" -sp + else + ./bin/raja-perf.exe --smpiargs="-disable_gpu_hooks" --checkrun -sp + fi else - ./bin/raja-perf.exe --checkrun -sp + if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} + then + ./bin/raja-perf.exe -sp + else + ./bin/raja-perf.exe --checkrun -sp + fi fi fi From 5e726405b93114bf2b42b8b7f711620f22cb24bb Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Fri, 6 Aug 2021 13:17:13 -0700 Subject: [PATCH 042/392] taking out junit stuff in ci script --- .gitlab-ci.yml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a0142380a..1b194bbc7 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -68,9 +68,6 @@ stages: - export JOBID=$(squeue -h --name=${ALLOC_NAME} --format=%A) - echo ${JOBID} - srun $( [[ -n "${JOBID}" ]] && echo "--jobid=${JOBID}" ) -t ${DEFAULT_TIME} -N 1 scripts/gitlab/build_and_test.sh - artifacts: - reports: - junit: junit.xml .build_toss_3_x86_64_ib_corona_script: script: @@ -81,9 +78,6 @@ stages: .build_blueos_3_ppc64le_ib_script: script: - lalloc 1 -W ${DEFAULT_TIME} scripts/gitlab/build_and_test.sh - artifacts: - reports: - junit: junit.xml .build_blueos_3_ppc64le_ib_p9_script: extends: .build_blueos_3_ppc64le_ib_script From 6240a62c4f00d4084e24f9e3680183deb71903ee Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Fri, 6 Aug 2021 13:45:23 -0700 Subject: [PATCH 043/392] fixing typo error --- .gitlab-ci.yml | 2 +- scripts/gitlab/build_and_test.sh | 17 ++++------------- 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 1b194bbc7..0b84a392a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -77,7 +77,7 @@ stages: # allow pre-allocation the same way slurm does. 
.build_blueos_3_ppc64le_ib_script: script: - - lalloc 1 -W ${DEFAULT_TIME} scripts/gitlab/build_and_test.sh + - lalloc 1 -W ${DEFAULT_TIME} $(echo "--smpiargs='-disable_gpu_hooks'") scripts/gitlab/build_and_test.sh .build_blueos_3_ppc64le_ib_p9_script: extends: .build_blueos_3_ppc64le_ib_script diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index c6cf0bca8..ceea8a172 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -154,20 +154,11 @@ cd ${build_dir} if grep -q -i "ENABLE_TESTS.*ON" ${hostconfig_path} then - if [[ ${truehostname} == "lassen" ]] - if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} - then - ./bin/raja-perf.exe --smpiargs="-disable_gpu_hooks" -sp - else - ./bin/raja-perf.exe --smpiargs="-disable_gpu_hooks" --checkrun -sp - fi + if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} + then + ./bin/raja-perf.exe -sp else - if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} - then - ./bin/raja-perf.exe -sp - else - ./bin/raja-perf.exe --checkrun -sp - fi + ./bin/raja-perf.exe --checkrun -sp fi fi From b416e15b86c7a52e6dc9e75b7b983e3cbce329ea Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Fri, 6 Aug 2021 15:20:06 -0700 Subject: [PATCH 044/392] fixing the command for lassen, cuda jobs --- .gitlab-ci.yml | 2 +- scripts/gitlab/build_and_test.sh | 16 +++++++++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 0b84a392a..1b194bbc7 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -77,7 +77,7 @@ stages: # allow pre-allocation the same way slurm does. .build_blueos_3_ppc64le_ib_script: script: - - lalloc 1 -W ${DEFAULT_TIME} $(echo "--smpiargs='-disable_gpu_hooks'") scripts/gitlab/build_and_test.sh + - lalloc 1 -W ${DEFAULT_TIME} scripts/gitlab/build_and_test.sh .build_blueos_3_ppc64le_ib_p9_script: extends: .build_blueos_3_ppc64le_ib_script diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index ceea8a172..ad9faccad 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -154,11 +154,21 @@ cd ${build_dir} if grep -q -i "ENABLE_TESTS.*ON" ${hostconfig_path} then - if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} + if grep -q -i "blueos" ${spec} && grep -q -i "ENABLE_CUDA.*ON" ${hostconfig_path} then - ./bin/raja-perf.exe -sp + if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} + then + ./bin/raja-perf.exe --smpiargs="-disable_gpu_hooks" -sp + else + ./bin/raja-perf.exe --smpiargs="-disable_gpu_hooks" --checkrun -sp + fi else - ./bin/raja-perf.exe --checkrun -sp + if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} + then + ./bin/raja-perf.exe -sp + else + ./bin/raja-perf.exe --checkrun -sp + fi fi fi From df9adec2f368abc99c51251395662eb7cd5e4934 Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Fri, 6 Aug 2021 15:23:16 -0700 Subject: [PATCH 045/392] removing unnecessary butte command --- .gitlab-ci.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 1b194bbc7..8be8a8ada 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -49,7 +49,6 @@ stages: - q_build_and_test - q_release_resources - l_build_and_test - - b_build_and_test - c_build_and_test - multi_project From 6e33f9e187e3ca435e78dbd93099d235d50a46de Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Fri, 6 Aug 2021 15:23:44 -0700 Subject: [PATCH 046/392] job titles renamed --- .gitlab/corona-jobs.yml | 6 +++--- 1 file changed, 3 
insertions(+), 3 deletions(-) diff --git a/.gitlab/corona-jobs.yml b/.gitlab/corona-jobs.yml index 2b6167389..4d93d3a5a 100644 --- a/.gitlab/corona-jobs.yml +++ b/.gitlab/corona-jobs.yml @@ -5,17 +5,17 @@ ## SPDX-License-Identifier: (BSD-3-Clause) ############################################################################## -hip_4_0_gcc_8_1_0 (build and test on corona): +hip_4_0_gcc_8_1_0: variables: SPEC: "+hip~openmp %gcc@8.1.0 cxxflags='-finline-functions -finline-limit=20000' cflags='-finline-functions -finline-limit=20000' ^hip@4.0.0" extends: .build_and_test_on_corona -hip_4_1_gcc_8_1_0 (build and test on corona): +hip_4_1_gcc_8_1_0: variables: SPEC: "+hip~openmp %gcc@8.1.0 cxxflags='-finline-functions -finline-limit=20000' cflags='-finline-functions -finline-limit=20000' ^hip@4.1.0" extends: .build_and_test_on_corona -hip_4_1_clang_9_0_0 (build and test on corona): +hip_4_1_clang_9_0_0: variables: SPEC: "+hip~openmp %clang@9.0.0 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 ^hip@4.1.0" extends: .build_and_test_on_corona From e53c7c5c6ca1ff7a98e90f7a3cda6302fb8f07a6 Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Mon, 9 Aug 2021 11:33:10 -0700 Subject: [PATCH 047/392] changing back to ruby, making PR edits --- .gitlab-ci.yml | 32 +++++------------- .gitlab/corona-templates.yml | 22 +++++++++++++ .gitlab/{quartz-jobs.yml => ruby-jobs.yml} | 12 +++---- ...uartz-templates.yml => ruby-templates.yml} | 33 ++++++++++--------- scripts/gitlab/build_and_test.sh | 2 +- 5 files changed, 55 insertions(+), 46 deletions(-) rename .gitlab/{quartz-jobs.yml => ruby-jobs.yml} (76%) rename .gitlab/{quartz-templates.yml => ruby-templates.yml} (64%) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8be8a8ada..a03a04416 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -18,7 +18,7 @@ # Tells Gitlab to recursively update the submodules when cloning umpire # ALLOC_NAME: -# On LLNL's quartz, this pipeline creates only one allocation shared among jobs +# On LLNL's ruby, this pipeline creates only one allocation shared among jobs # in order to save time and resources. This allocation has to be uniquely named # so that we are sure to retrieve it. @@ -42,15 +42,16 @@ variables: # Normally, stages are blocking in Gitlab. However, using the keyword "needs" we # can express dependencies between job that break the ordering of stages, in # favor of a DAG. -# In practice q_*, l_* and b_* stages are independently run and start immediately. +# In practice r_*, l_* and b_* stages are independently run and start immediately. stages: - - q_allocate_resources - - q_build_and_test - - q_release_resources + - r_allocate_resources + - r_build_and_test + - r_release_resources - l_build_and_test + - c_allocate_resources - c_build_and_test - - multi_project + - c_release_resources # This is the rules that drives the activation of "advanced" jobs. All advanced # jobs will share this through a template mechanism. @@ -81,25 +82,10 @@ stages: .build_blueos_3_ppc64le_ib_p9_script: extends: .build_blueos_3_ppc64le_ib_script -# If testing develop branch, trigger CHAI pipeline with this version of RAJA. -# TODO: Once spack allows to clone a specific commit on demand, then point to the exact commit. -# This will prevent from sticking to a branch (here develop). -# To turn back on chai trigger, add '$CI_COMMIT_BRANCH == "develop" to rule. -trigger-chai: - stage: multi_project - rules: - - if: '$MULTI_PROJECT == "ON"' #run only if ... 
- variables: - UPDATE_RAJA: develop - trigger: - project: radiuss/chai - branch: develop - strategy: depend - # This is where jobs are included. include: - - local: .gitlab/quartz-templates.yml - - local: .gitlab/quartz-jobs.yml + - local: .gitlab/ruby-templates.yml + - local: .gitlab/ruby-jobs.yml - local: .gitlab/lassen-templates.yml - local: .gitlab/lassen-jobs.yml - local: .gitlab/corona-templates.yml diff --git a/.gitlab/corona-templates.yml b/.gitlab/corona-templates.yml index 9a6405724..7c2eebdc4 100644 --- a/.gitlab/corona-templates.yml +++ b/.gitlab/corona-templates.yml @@ -20,8 +20,29 @@ - if: '$CI_JOB_NAME =~ /release_resources/' when: always - when: on_success +### +## In pre-build phase, allocate a node for builds +## NOTE: Not specifying 'salloc -c 56' should allocate the max number of CPU cores +allocate_resources (on corona): + variables: + GIT_STRATEGY: none + extends: .on_corona + stage: c_allocate_resources + script: + - salloc -N 1 -p pdebug -t 45 --no-shell --job-name=${ALLOC_NAME} ### +# In post-build phase, deallocate resources +# Note : make sure this is run even on build phase failure +release_resources (on corona): + variables: + GIT_STRATEGY: none + extends: .on_corona + stage: c_release_resources + script: + - export JOBID=$(squeue -h --name=${ALLOC_NAME} --format=%A) + - ([[ -n "${JOBID}" ]] && scancel ${JOBID}) + # Generic corona build job, extending build script .build_and_test_on_corona: stage: c_build_and_test @@ -30,3 +51,4 @@ .build_and_test_on_corona_advanced: extends: [.build_and_test_on_corona, .advanced_pipeline] + stage: c_build_and_test diff --git a/.gitlab/quartz-jobs.yml b/.gitlab/ruby-jobs.yml similarity index 76% rename from .gitlab/quartz-jobs.yml rename to .gitlab/ruby-jobs.yml index a1159c5b3..d8e8e95c8 100644 --- a/.gitlab/quartz-jobs.yml +++ b/.gitlab/ruby-jobs.yml @@ -8,30 +8,30 @@ clang_10: variables: SPEC: "%clang@10.0.1" - extends: .build_and_test_on_quartz + extends: .build_and_test_on_ruby clang_9: variables: SPEC: "%clang@9.0.0" - extends: .build_and_test_on_quartz + extends: .build_and_test_on_ruby gcc_8_1_0: variables: SPEC: "%gcc@8.1.0" DEFAULT_TIME: 60 - extends: .build_and_test_on_quartz + extends: .build_and_test_on_ruby icpc_17_0_2: variables: SPEC: "%intel@17.0.2" DEFAULT_TIME: 40 - extends: .build_and_test_on_quartz + extends: .build_and_test_on_ruby icpc_19_1_0: variables: SPEC: "%intel@19.1.0" DEFAULT_TIME: 40 - extends: .build_and_test_on_quartz + extends: .build_and_test_on_ruby # EXTRAS @@ -39,4 +39,4 @@ gcc_4_9_3: variables: SPEC: "%gcc@4.9.3" DEFAULT_TIME: 60 - extends: .build_and_test_on_quartz + extends: .build_and_test_on_ruby diff --git a/.gitlab/quartz-templates.yml b/.gitlab/ruby-templates.yml similarity index 64% rename from .gitlab/quartz-templates.yml rename to .gitlab/ruby-templates.yml index 676d972b7..ecba90dc6 100644 --- a/.gitlab/quartz-templates.yml +++ b/.gitlab/ruby-templates.yml @@ -6,16 +6,16 @@ ############################################################################### #### -# This is the shared configuration of jobs for quartz +# This is the shared configuration of jobs for ruby #### # In pre-build phase, allocate a node for builds -.on_quartz: +.on_ruby: tags: - shell - - quartz + - ruby rules: - - if: '$CI_COMMIT_BRANCH =~ /_qnone/ || $ON_QUARTZ == "OFF"' #run except if ... + - if: '$CI_COMMIT_BRANCH =~ /_qnone/ || $ON_RUBY == "OFF"' #run except if ... 
when: never - if: '$CI_JOB_NAME =~ /release_resources/' when: always @@ -24,31 +24,32 @@ ### # In pre-build phase, allocate a node for builds # NOTE: Not specifying 'salloc -c 56' should allocate the max number of CPU cores -allocate_resources (on quartz): +allocate_resources (on ruby): variables: GIT_STRATEGY: none - extends: .on_quartz - stage: q_allocate_resources + extends: .on_ruby + stage: r_allocate_resources script: - salloc -N 1 -p pdebug -t 45 --no-shell --job-name=${ALLOC_NAME} ### # In post-build phase, deallocate resources # Note : make sure this is run even on build phase failure -release_resources (on quartz): +release_resources (on ruby): variables: GIT_STRATEGY: none - extends: .on_quartz - stage: q_release_resources + extends: .on_ruby + stage: r_release_resources script: - export JOBID=$(squeue -h --name=${ALLOC_NAME} --format=%A) - ([[ -n "${JOBID}" ]] && scancel ${JOBID}) ### -# Generic quartz build job, extending build script -.build_and_test_on_quartz: - extends: [.build_toss_3_x86_64_ib_script, .on_quartz] - stage: q_build_and_test +# Generic ruby build job, extending build script +.build_and_test_on_ruby: + extends: [.build_toss_3_x86_64_ib_script, .on_ruby] + stage: r_build_and_test -.build_and_test_on_quartz_advanced: - extends: [.build_and_test_on_quartz, .advanced_pipeline] +.build_and_test_on_ruby_advanced: + extends: [.build_and_test_on_ruby, .advanced_pipeline] + stage: r_build_and_test diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index ad9faccad..aadca7f71 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -113,7 +113,7 @@ then echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" # Map CPU core allocations - declare -A core_counts=(["lassen"]=40 ["quartz"]=28 ["corona"]=32) + declare -A core_counts=(["lassen"]=40 ["ruby"]=28 ["corona"]=32) # If building, then delete everything first # NOTE: 'cmake --build . -j core_counts' attempts to reduce individual build resources. 
From 505dcd8887ad7ab9c33fd044409a734238bd29e7 Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Mon, 9 Aug 2021 11:41:14 -0700 Subject: [PATCH 048/392] fixing how i search for blueos --- scripts/gitlab/build_and_test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index aadca7f71..3f18030a3 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -154,7 +154,7 @@ cd ${build_dir} if grep -q -i "ENABLE_TESTS.*ON" ${hostconfig_path} then - if grep -q -i "blueos" ${spec} && grep -q -i "ENABLE_CUDA.*ON" ${hostconfig_path} + if grep -q -i "blueos" ${sys_type} && grep -q -i "ENABLE_CUDA.*ON" ${hostconfig_path} then if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} then From 3020e94b5faf8767bea17120ecd4bea11315a75e Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Mon, 9 Aug 2021 12:11:55 -0700 Subject: [PATCH 049/392] edits to gitlab script --- scripts/gitlab/build_and_test.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index 3f18030a3..be7273a17 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -159,15 +159,27 @@ then if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} then ./bin/raja-perf.exe --smpiargs="-disable_gpu_hooks" -sp + echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" + echo "./bin/raja-perf.exe --smpiargs='-disable_gpu_hooks' -sp" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" else ./bin/raja-perf.exe --smpiargs="-disable_gpu_hooks" --checkrun -sp + echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" + echo "./bin/raja-perf.exe --smpiargs='-disable_gpu_hook' --checkrun -sp" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" fi else if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} then ./bin/raja-perf.exe -sp + echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" + echo "./bin/raja-perf.exe -sp" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" else ./bin/raja-perf.exe --checkrun -sp + echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" + echo "./bin/raja-perf.exe --checkrun -sp" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" fi fi fi From 29e8a6206ef521777a533dd43d9e3f4bf84d07d3 Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Mon, 9 Aug 2021 13:47:22 -0700 Subject: [PATCH 050/392] trying to fix the way build options are found --- scripts/gitlab/build_and_test.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index be7273a17..f8ef12ea8 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -126,7 +126,7 @@ then cmake \ -C ${hostconfig_path} \ ${project_dir} - if grep -q -i "intel" ${spec} + if grep "intel" ${spec} then cmake --build . 
-j 16 echo "~~~~~~~~~ Build Command: ~~~~~~~~~~~~~~~~~~~~~" @@ -152,11 +152,11 @@ fi cd ${build_dir} -if grep -q -i "ENABLE_TESTS.*ON" ${hostconfig_path} +if grep "ENABLE_TESTS.*ON" ${hostconfig_path} then - if grep -q -i "blueos" ${sys_type} && grep -q -i "ENABLE_CUDA.*ON" ${hostconfig_path} + if grep "blueos" ${sys_type} && grep "cuda" ${spec} then - if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} + if grep "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} then ./bin/raja-perf.exe --smpiargs="-disable_gpu_hooks" -sp echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" @@ -169,7 +169,7 @@ then echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" fi else - if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} + if grep "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} then ./bin/raja-perf.exe -sp echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" From 56d986745a86844a0955a452ce3a7ce4901638dc Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Mon, 9 Aug 2021 16:36:44 -0700 Subject: [PATCH 051/392] trying to search for cuda/blueos builds --- scripts/gitlab/build_and_test.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index f8ef12ea8..7c6afac75 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -126,7 +126,7 @@ then cmake \ -C ${hostconfig_path} \ ${project_dir} - if grep "intel" ${spec} + if grep -q "intel" ${spec} then cmake --build . -j 16 echo "~~~~~~~~~ Build Command: ~~~~~~~~~~~~~~~~~~~~~" @@ -152,11 +152,11 @@ fi cd ${build_dir} -if grep "ENABLE_TESTS.*ON" ${hostconfig_path} +if grep -q "ENABLE_TESTS.*ON" ${hostconfig} then - if grep "blueos" ${sys_type} && grep "cuda" ${spec} + if grep -q "blueos" ${sys_type} && grep -q "cuda" ${spec} then - if grep "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} + if grep -q "CMAKE_BUILD_TYPE.*Release" ${hostconfig} then ./bin/raja-perf.exe --smpiargs="-disable_gpu_hooks" -sp echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" @@ -169,7 +169,7 @@ then echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" fi else - if grep "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} + if grep -q "CMAKE_BUILD_TYPE.*Release" ${hostconfig} then ./bin/raja-perf.exe -sp echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" From 6a2def10dd67e9fab874afe8b277b85fc0da5159 Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Tue, 10 Aug 2021 09:09:32 -0700 Subject: [PATCH 052/392] fixes to bash script grep commands --- scripts/gitlab/build_and_test.sh | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index 7c6afac75..18d07c956 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -126,8 +126,7 @@ then cmake \ -C ${hostconfig_path} \ ${project_dir} - if grep -q "intel" ${spec} - then + if echo ${spec} | grep -q "intel" ; then cmake --build . -j 16 echo "~~~~~~~~~ Build Command: ~~~~~~~~~~~~~~~~~~~~~" echo "cmake --build . 
-j 16" @@ -152,11 +151,10 @@ fi cd ${build_dir} -if grep -q "ENABLE_TESTS.*ON" ${hostconfig} +if grep -q -i "ENABLE_TESTS.*ON" ${hostconfig_path} then - if grep -q "blueos" ${sys_type} && grep -q "cuda" ${spec} - then - if grep -q "CMAKE_BUILD_TYPE.*Release" ${hostconfig} + if echo ${sys_type} | grep -q "blueos" && echo ${spec} | grep -q "cuda" ; then + if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} then ./bin/raja-perf.exe --smpiargs="-disable_gpu_hooks" -sp echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" @@ -169,7 +167,7 @@ then echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" fi else - if grep -q "CMAKE_BUILD_TYPE.*Release" ${hostconfig} + if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} then ./bin/raja-perf.exe -sp echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" From 93a87c40f9ddf568027deb2e6448b7d37dddbb98 Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Tue, 10 Aug 2021 11:06:30 -0700 Subject: [PATCH 053/392] Try to launch azure-ci. --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 53e3c9416..d2e1ddd62 100644 --- a/Dockerfile +++ b/Dockerfile @@ -60,7 +60,7 @@ COPY --chown=axom:axom . /home/axom/workspace WORKDIR /home/axom/workspace RUN mkdir build && cd build && cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Debug -DENABLE_OPENMP=On -DCMAKE_CXX_FLAGS=-fsanitize=address .. RUN cd build && make -j 16 -RUN cd build && ./bin/raja-perf.exe --checkrun -sp +RUN cd build && ./bin/raja-perf.exe --checkrun FROM axom/compilers:nvcc-10.2 AS nvcc10 ENV GTEST_COLOR=1 From 8a87a18b03e5e06e668c1739cd057877d3821183 Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Tue, 10 Aug 2021 11:08:57 -0700 Subject: [PATCH 054/392] Try to launch azure-ci. --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index d2e1ddd62..3b40618fd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,3 +1,4 @@ + ############################################################################### # Copyright (c) 2016-21, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJA/COPYRIGHT file for details. From 692e6f89dfcac47725da618ef4ed0c3bd0515922 Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Tue, 10 Aug 2021 11:15:18 -0700 Subject: [PATCH 055/392] Try to launch azure-ci. --- Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 3b40618fd..d2e1ddd62 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,3 @@ - ############################################################################### # Copyright (c) 2016-21, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJA/COPYRIGHT file for details. From a9f5bd607b6a400e713a6d22e10e2c2c9e3134a1 Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Tue, 10 Aug 2021 11:24:01 -0700 Subject: [PATCH 056/392] Add -sp back to clang-debug build. --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index d2e1ddd62..53e3c9416 100644 --- a/Dockerfile +++ b/Dockerfile @@ -60,7 +60,7 @@ COPY --chown=axom:axom . /home/axom/workspace WORKDIR /home/axom/workspace RUN mkdir build && cd build && cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Debug -DENABLE_OPENMP=On -DCMAKE_CXX_FLAGS=-fsanitize=address .. 
RUN cd build && make -j 16 -RUN cd build && ./bin/raja-perf.exe --checkrun +RUN cd build && ./bin/raja-perf.exe --checkrun -sp FROM axom/compilers:nvcc-10.2 AS nvcc10 ENV GTEST_COLOR=1 From 1c80929edfcfce7ee44cf7be15eac6b03e6b5507 Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Tue, 10 Aug 2021 14:35:20 -0700 Subject: [PATCH 057/392] fixing corona stages and lassen+cuda run command --- .gitlab/corona-templates.yml | 4 ++-- scripts/gitlab/build_and_test.sh | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.gitlab/corona-templates.yml b/.gitlab/corona-templates.yml index 7c2eebdc4..697cdb84c 100644 --- a/.gitlab/corona-templates.yml +++ b/.gitlab/corona-templates.yml @@ -15,7 +15,7 @@ - shell - corona rules: - - if: '$ON_CORONA == "OFF"' #run except if ... + - if: '$CI_COMMIT_BRANCH =~ /_qnone/ || $ON_CORONA == "OFF"' #run except if ... when: never - if: '$CI_JOB_NAME =~ /release_resources/' when: always @@ -29,7 +29,7 @@ allocate_resources (on corona): extends: .on_corona stage: c_allocate_resources script: - - salloc -N 1 -p pdebug -t 45 --no-shell --job-name=${ALLOC_NAME} + - salloc -N 1 -pmi60 -t 45 --no-shell --job-name=${ALLOC_NAME} ### # In post-build phase, deallocate resources diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index 18d07c956..b7c90b0e8 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -156,14 +156,14 @@ then if echo ${sys_type} | grep -q "blueos" && echo ${spec} | grep -q "cuda" ; then if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} then - ./bin/raja-perf.exe --smpiargs="-disable_gpu_hooks" -sp + --smpiargs="-disable_gpu_hooks" ./bin/raja-perf.exe -sp echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" - echo "./bin/raja-perf.exe --smpiargs='-disable_gpu_hooks' -sp" + echo ".--smpiargs='-disable_gpu_hooks' /bin/raja-perf.exe -sp" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" else - ./bin/raja-perf.exe --smpiargs="-disable_gpu_hooks" --checkrun -sp + --smpiargs="-disable_gpu_hooks" ./bin/raja-perf.exe --checkrun -sp echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" - echo "./bin/raja-perf.exe --smpiargs='-disable_gpu_hook' --checkrun -sp" + echo "--smpiargs='-disable_gpu_hook' ./bin/raja-perf.exe --checkrun -sp" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" fi else From 31f7eb230a221968ce4234ba48fe91dd10c8315f Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Wed, 11 Aug 2021 15:41:14 -0700 Subject: [PATCH 058/392] trying a fix for lassen+cuda jobs --- .gitlab-ci.yml | 3 ++- .gitlab/lassen-jobs.yml | 1 + scripts/gitlab/build_and_test.sh | 30 +++++++++++++++--------------- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a03a04416..ba5d53227 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -38,6 +38,7 @@ variables: ALLOC_NAME: ${CI_PROJECT_NAME}_ci_${CI_PIPELINE_ID} BUILD_ROOT: ${CI_PROJECT_DIR} DEFAULT_TIME: 30 + EXTRA_FLAGS: "" # Normally, stages are blocking in Gitlab. However, using the keyword "needs" we # can express dependencies between job that break the ordering of stages, in @@ -77,7 +78,7 @@ stages: # allow pre-allocation the same way slurm does. 
.build_blueos_3_ppc64le_ib_script: script: - - lalloc 1 -W ${DEFAULT_TIME} scripts/gitlab/build_and_test.sh + - lalloc 1 ${EXTRA_FLAGS} -W ${DEFAULT_TIME} scripts/gitlab/build_and_test.sh .build_blueos_3_ppc64le_ib_p9_script: extends: .build_blueos_3_ppc64le_ib_script diff --git a/.gitlab/lassen-jobs.yml b/.gitlab/lassen-jobs.yml index cfee9404d..126a89ebd 100644 --- a/.gitlab/lassen-jobs.yml +++ b/.gitlab/lassen-jobs.yml @@ -53,6 +53,7 @@ ibm_clang_11_gcc_8_cuda: gcc_8_3_1_cuda: variables: SPEC: "+cuda %gcc@8.3.1 cuda_arch=70 ^cuda@10.1.168" + EXTRA_FLAGS: '--smpiargs="-disable_gpu_hooks"' extends: .build_and_test_on_lassen xl_16_1_1_7_cuda: diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index b7c90b0e8..0f7629b8c 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -153,20 +153,20 @@ cd ${build_dir} if grep -q -i "ENABLE_TESTS.*ON" ${hostconfig_path} then - if echo ${sys_type} | grep -q "blueos" && echo ${spec} | grep -q "cuda" ; then - if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} - then - --smpiargs="-disable_gpu_hooks" ./bin/raja-perf.exe -sp - echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" - echo ".--smpiargs='-disable_gpu_hooks' /bin/raja-perf.exe -sp" - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - else - --smpiargs="-disable_gpu_hooks" ./bin/raja-perf.exe --checkrun -sp - echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" - echo "--smpiargs='-disable_gpu_hook' ./bin/raja-perf.exe --checkrun -sp" - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - fi - else + #if echo ${sys_type} | grep -q "blueos" && echo ${spec} | grep -q "cuda" ; then + # if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} + # then + # --smpiargs="-disable_gpu_hooks" ./bin/raja-perf.exe -sp + # echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" + # echo ".--smpiargs='-disable_gpu_hooks' /bin/raja-perf.exe -sp" + # echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + # else + # --smpiargs="-disable_gpu_hooks" ./bin/raja-perf.exe --checkrun -sp + # echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" + # echo "--smpiargs='-disable_gpu_hook' ./bin/raja-perf.exe --checkrun -sp" + # echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + # fi + #else if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} then ./bin/raja-perf.exe -sp @@ -179,7 +179,7 @@ then echo "./bin/raja-perf.exe --checkrun -sp" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" fi - fi + #fi fi echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" From f1ad3cf03b130e17e8fdbb81f498e7ea18094b12 Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Thu, 12 Aug 2021 14:47:40 -0700 Subject: [PATCH 059/392] fixing lassen+cuda job command --- .gitlab-ci.yml | 2 +- .gitlab/lassen-jobs.yml | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ba5d53227..591a2582c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -78,7 +78,7 @@ stages: # allow pre-allocation the same way slurm does. 
.build_blueos_3_ppc64le_ib_script: script: - - lalloc 1 ${EXTRA_FLAGS} -W ${DEFAULT_TIME} scripts/gitlab/build_and_test.sh + - lalloc 1 -W ${DEFAULT_TIME} scripts/gitlab/build_and_test.sh ${EXTRA_FLAGS} .build_blueos_3_ppc64le_ib_p9_script: extends: .build_blueos_3_ppc64le_ib_script diff --git a/.gitlab/lassen-jobs.yml b/.gitlab/lassen-jobs.yml index 126a89ebd..ec26c1e79 100644 --- a/.gitlab/lassen-jobs.yml +++ b/.gitlab/lassen-jobs.yml @@ -43,11 +43,13 @@ xl_16_1_1_7_gcc_8_3_1: ibm_clang_11_cuda: variables: SPEC: "+cuda cuda_arch=70 %clang@11.0.0ibm ^cuda@10.1.168" + EXTRA_FLAGS: '--smpiargs="-disable_gpu_hooks"' extends: .build_and_test_on_lassen ibm_clang_11_gcc_8_cuda: variables: SPEC: "+cuda %clang@11.0.0ibm cuda_arch=70 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 ^cuda@10.1.168" + EXTRA_FLAGS: '--smpiargs="-disable_gpu_hooks"' extends: .build_and_test_on_lassen gcc_8_3_1_cuda: @@ -58,14 +60,16 @@ gcc_8_3_1_cuda: xl_16_1_1_7_cuda: variables: - SPEC: "+cuda %xl@16.1.1.7 tests=none cuda_arch=70 ^cuda@10.1.168 ^cmake@3.14.5" + SPEC: "+cuda %xl@16.1.1.7 cuda_arch=70 ^cuda@10.1.168 ^cmake@3.14.5" DEFAULT_TIME: 60 + EXTRA_FLAGS: '--smpiargs="-disable_gpu_hooks"' allow_failure: true extends: .build_and_test_on_lassen xl_16_1_1_7_gcc_8_3_1_cuda_11: variables: - SPEC: "+cuda %xl@16.1.1.7 tests=none cuda_arch=70 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 ^cuda@11.0.2 ^cmake@3.14.5" + SPEC: "+cuda %xl@16.1.1.7 cuda_arch=70 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 ^cuda@11.0.2 ^cmake@3.14.5" + EXTRA_FLAGS: '--smpiargs="-disable_gpu_hooks"' DEFAULT_TIME: 60 allow_failure: true extends: .build_and_test_on_lassen From ec6c7e160400ce895e35bfaf9e4548c20947bf99 Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Tue, 17 Aug 2021 08:10:35 -0700 Subject: [PATCH 060/392] fixing flag for lassen+cuda jobs, adding back in tests for clang --- .gitlab-ci.yml | 3 +-- .gitlab/lassen-jobs.yml | 5 ---- scripts/gitlab/build_and_test.sh | 28 ++++++++++----------- scripts/spack_packages/raja_perf/package.py | 10 ++++---- 4 files changed, 20 insertions(+), 26 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 591a2582c..a03a04416 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -38,7 +38,6 @@ variables: ALLOC_NAME: ${CI_PROJECT_NAME}_ci_${CI_PIPELINE_ID} BUILD_ROOT: ${CI_PROJECT_DIR} DEFAULT_TIME: 30 - EXTRA_FLAGS: "" # Normally, stages are blocking in Gitlab. However, using the keyword "needs" we # can express dependencies between job that break the ordering of stages, in @@ -78,7 +77,7 @@ stages: # allow pre-allocation the same way slurm does. 
.build_blueos_3_ppc64le_ib_script: script: - - lalloc 1 -W ${DEFAULT_TIME} scripts/gitlab/build_and_test.sh ${EXTRA_FLAGS} + - lalloc 1 -W ${DEFAULT_TIME} scripts/gitlab/build_and_test.sh .build_blueos_3_ppc64le_ib_p9_script: extends: .build_blueos_3_ppc64le_ib_script diff --git a/.gitlab/lassen-jobs.yml b/.gitlab/lassen-jobs.yml index ec26c1e79..14501c6d2 100644 --- a/.gitlab/lassen-jobs.yml +++ b/.gitlab/lassen-jobs.yml @@ -43,33 +43,28 @@ xl_16_1_1_7_gcc_8_3_1: ibm_clang_11_cuda: variables: SPEC: "+cuda cuda_arch=70 %clang@11.0.0ibm ^cuda@10.1.168" - EXTRA_FLAGS: '--smpiargs="-disable_gpu_hooks"' extends: .build_and_test_on_lassen ibm_clang_11_gcc_8_cuda: variables: SPEC: "+cuda %clang@11.0.0ibm cuda_arch=70 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 ^cuda@10.1.168" - EXTRA_FLAGS: '--smpiargs="-disable_gpu_hooks"' extends: .build_and_test_on_lassen gcc_8_3_1_cuda: variables: SPEC: "+cuda %gcc@8.3.1 cuda_arch=70 ^cuda@10.1.168" - EXTRA_FLAGS: '--smpiargs="-disable_gpu_hooks"' extends: .build_and_test_on_lassen xl_16_1_1_7_cuda: variables: SPEC: "+cuda %xl@16.1.1.7 cuda_arch=70 ^cuda@10.1.168 ^cmake@3.14.5" DEFAULT_TIME: 60 - EXTRA_FLAGS: '--smpiargs="-disable_gpu_hooks"' allow_failure: true extends: .build_and_test_on_lassen xl_16_1_1_7_gcc_8_3_1_cuda_11: variables: SPEC: "+cuda %xl@16.1.1.7 cuda_arch=70 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 ^cuda@11.0.2 ^cmake@3.14.5" - EXTRA_FLAGS: '--smpiargs="-disable_gpu_hooks"' DEFAULT_TIME: 60 allow_failure: true extends: .build_and_test_on_lassen diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index 0f7629b8c..022f9c84b 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -153,20 +153,20 @@ cd ${build_dir} if grep -q -i "ENABLE_TESTS.*ON" ${hostconfig_path} then - #if echo ${sys_type} | grep -q "blueos" && echo ${spec} | grep -q "cuda" ; then - # if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} - # then - # --smpiargs="-disable_gpu_hooks" ./bin/raja-perf.exe -sp - # echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" - # echo ".--smpiargs='-disable_gpu_hooks' /bin/raja-perf.exe -sp" - # echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - # else - # --smpiargs="-disable_gpu_hooks" ./bin/raja-perf.exe --checkrun -sp - # echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" - # echo "--smpiargs='-disable_gpu_hook' ./bin/raja-perf.exe --checkrun -sp" - # echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - # fi - #else + if echo ${sys_type} | grep -q "blueos" && echo ${spec} | grep -q "cuda" ; then + if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} + then + lrun --smpiargs="-disable_gpu_hooks" ./bin/raja-perf.exe -sp + echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" + echo "lrun --smpiargs='-disable_gpu_hooks' /bin/raja-perf.exe -sp" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + else + lrun --smpiargs="-disable_gpu_hooks" ./bin/raja-perf.exe --checkrun -sp + echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" + echo "lrun --smpiargs='-disable_gpu_hook' ./bin/raja-perf.exe --checkrun -sp" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + fi + else if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} then ./bin/raja-perf.exe -sp diff --git a/scripts/spack_packages/raja_perf/package.py b/scripts/spack_packages/raja_perf/package.py index 046eb73de..0466528c1 100644 --- 
a/scripts/spack_packages/raja_perf/package.py +++ b/scripts/spack_packages/raja_perf/package.py @@ -327,11 +327,11 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): # BLT removes -Werror from GTest flags # Note 2: Tests are either built if variant is set, or if run-tests # option is passed. - if self.spec.satisfies('%clang target=ppc64le:'): - cfg.write(cmake_cache_option("ENABLE_TESTS",False)) - if 'tests=benchmarks' in spec or not 'tests=none' in spec: - print("MSG: no testing supported on %clang target=ppc64le:") - else: + #if self.spec.satisfies('%clang target=ppc64le:'): + # cfg.write(cmake_cache_option("ENABLE_TESTS",False)) + # if 'tests=benchmarks' in spec or not 'tests=none' in spec: + # print("MSG: no testing supported on %clang target=ppc64le:") + #else: cfg.write(cmake_cache_option("ENABLE_BENCHMARKS", 'tests=benchmarks' in spec)) cfg.write(cmake_cache_option("ENABLE_TESTS", not 'tests=none' in spec or self.run_tests)) From 915e5bfa9355bb8d89a5d5f7832359df65d03064 Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Tue, 17 Aug 2021 08:29:00 -0700 Subject: [PATCH 061/392] fixing syntax error in package file --- scripts/spack_packages/raja_perf/package.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/spack_packages/raja_perf/package.py b/scripts/spack_packages/raja_perf/package.py index 0466528c1..c8664202d 100644 --- a/scripts/spack_packages/raja_perf/package.py +++ b/scripts/spack_packages/raja_perf/package.py @@ -332,8 +332,8 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): # if 'tests=benchmarks' in spec or not 'tests=none' in spec: # print("MSG: no testing supported on %clang target=ppc64le:") #else: - cfg.write(cmake_cache_option("ENABLE_BENCHMARKS", 'tests=benchmarks' in spec)) - cfg.write(cmake_cache_option("ENABLE_TESTS", not 'tests=none' in spec or self.run_tests)) + cfg.write(cmake_cache_option("ENABLE_BENCHMARKS", 'tests=benchmarks' in spec)) + #cfg.write(cmake_cache_option("ENABLE_TESTS", not 'tests=none' in spec or self.run_tests)) ####################### # Close and save From ceb72c3eb83b6fda01a911d0550b5d2ce78244a2 Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Tue, 17 Aug 2021 09:42:40 -0700 Subject: [PATCH 062/392] fixed syntax error in build-and-test script --- scripts/gitlab/build_and_test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index 022f9c84b..8da59fe53 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -179,7 +179,7 @@ then echo "./bin/raja-perf.exe --checkrun -sp" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" fi - #fi + fi fi echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" From d19c4d9b6e705de0e7caa918409d8d75be96af9b Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Tue, 17 Aug 2021 10:17:58 -0700 Subject: [PATCH 063/392] editing when tests are turned on and off --- scripts/spack_packages/raja_perf/package.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/scripts/spack_packages/raja_perf/package.py b/scripts/spack_packages/raja_perf/package.py index c8664202d..27fff2d8b 100644 --- a/scripts/spack_packages/raja_perf/package.py +++ b/scripts/spack_packages/raja_perf/package.py @@ -205,7 +205,6 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): if cxxflags: cfg.write(cmake_cache_entry("CMAKE_CXX_FLAGS", cxxflags)) - # TODO (bernede1@llnl.gov): Is this useful for RAJA? 
if ("gfortran" in f_compiler) and ("clang" in cpp_compiler): libdir = pjoin(os.path.dirname( os.path.dirname(f_compiler)), "lib") @@ -258,10 +257,12 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): cuda_debug_flags = "-O0 -g" cfg.write(cmake_cache_string("BLT_CXX_STD", "c++14")) + cfg.write(cmake_cache_option("ENABLE_TESTS", False)) else: cuda_release_flags = "-O3 -Xcompiler -Ofast -Xcompiler -finline-functions -Xcompiler -finline-limit=20000" cuda_reldebinf_flags = "-O3 -g -Xcompiler -Ofast -Xcompiler -finline-functions -Xcompiler -finline-limit=20000" cuda_debug_flags = "-O0 -g -Xcompiler -O0 -Xcompiler -finline-functions -Xcompiler -finline-limit=20000" + cfg.write(cmake_cache_option("ENABLE_TESTS", True)) cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_RELEASE", cuda_release_flags)) cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_RELWITHDEBINFO", cuda_reldebinf_flags)) @@ -280,6 +281,7 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): cfg.write("#------------------{0}\n\n".format("-" * 60)) cfg.write(cmake_cache_option("ENABLE_HIP", True)) + cfg.write(cmake_cache_option("ENABLE_TESTS", True)) hip_root = spec['hip'].prefix rocm_root = hip_root + "/.." @@ -327,13 +329,13 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): # BLT removes -Werror from GTest flags # Note 2: Tests are either built if variant is set, or if run-tests # option is passed. - #if self.spec.satisfies('%clang target=ppc64le:'): - # cfg.write(cmake_cache_option("ENABLE_TESTS",False)) - # if 'tests=benchmarks' in spec or not 'tests=none' in spec: - # print("MSG: no testing supported on %clang target=ppc64le:") - #else: - cfg.write(cmake_cache_option("ENABLE_BENCHMARKS", 'tests=benchmarks' in spec)) - #cfg.write(cmake_cache_option("ENABLE_TESTS", not 'tests=none' in spec or self.run_tests)) + if ("+cuda" in spec) and (self.spec.satisfies('%clang target=ppc64le:')): + cfg.write(cmake_cache_option("ENABLE_TESTS",False)) + if 'tests=benchmarks' in spec or not 'tests=none' in spec: + print("MSG: no testing supported on %clang target=ppc64le:") + else: + cfg.write(cmake_cache_option("ENABLE_BENCHMARKS", 'tests=benchmarks' in spec)) + cfg.write(cmake_cache_option("ENABLE_TESTS", not 'tests=none' in spec or self.run_tests)) ####################### # Close and save From 699f67d9f8ee4a3c930e2496adb7a64572ef6335 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 17 Aug 2021 11:55:17 -0700 Subject: [PATCH 064/392] Add NODAL_ACCUMULATION_3D kernel --- src/CMakeLists.txt | 3 + src/apps/CMakeLists.txt | 6 + src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp | 122 +++++++++++++++ src/apps/NODAL_ACCUMULATION_3D-Hip.cpp | 122 +++++++++++++++ src/apps/NODAL_ACCUMULATION_3D-OMP.cpp | 147 +++++++++++++++++++ src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp | 126 ++++++++++++++++ src/apps/NODAL_ACCUMULATION_3D-Seq.cpp | 104 +++++++++++++ src/apps/NODAL_ACCUMULATION_3D.cpp | 97 ++++++++++++ src/apps/NODAL_ACCUMULATION_3D.hpp | 111 ++++++++++++++ src/common/RAJAPerfSuite.cpp | 6 + src/common/RAJAPerfSuite.hpp | 1 + 11 files changed, 845 insertions(+) create mode 100644 src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp create mode 100644 src/apps/NODAL_ACCUMULATION_3D-Hip.cpp create mode 100644 src/apps/NODAL_ACCUMULATION_3D-OMP.cpp create mode 100644 src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp create mode 100644 src/apps/NODAL_ACCUMULATION_3D-Seq.cpp create mode 100644 src/apps/NODAL_ACCUMULATION_3D.cpp create mode 100644 src/apps/NODAL_ACCUMULATION_3D.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 
877bf5306..0d38a29b9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -60,6 +60,9 @@ blt_add_executable( apps/MASS3DPA.cpp apps/MASS3DPA-Seq.cpp apps/MASS3DPA-OMPTarget.cpp + apps/NODAL_ACCUMULATION_3D.cpp + apps/NODAL_ACCUMULATION_3D-Seq.cpp + apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp apps/VOL3D.cpp apps/VOL3D-Seq.cpp apps/VOL3D-OMPTarget.cpp diff --git a/src/apps/CMakeLists.txt b/src/apps/CMakeLists.txt index 16b822cbb..8b439997d 100644 --- a/src/apps/CMakeLists.txt +++ b/src/apps/CMakeLists.txt @@ -57,6 +57,12 @@ blt_add_library( MASS3DPA-Seq.cpp MASS3DPA-OMP.cpp MASS3DPA-OMPTarget.cpp + NODAL_ACCUMULATION_3D.cpp + NODAL_ACCUMULATION_3D-Seq.cpp + NODAL_ACCUMULATION_3D-Hip.cpp + NODAL_ACCUMULATION_3D-Cuda.cpp + NODAL_ACCUMULATION_3D-OMP.cpp + NODAL_ACCUMULATION_3D-OMPTarget.cpp PRESSURE.cpp PRESSURE-Seq.cpp PRESSURE-Hip.cpp diff --git a/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp b/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp new file mode 100644 index 000000000..3d29a5257 --- /dev/null +++ b/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp @@ -0,0 +1,122 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "NODAL_ACCUMULATION_3D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include "AppsData.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#define NODAL_ACCUMULATION_3D_DATA_SETUP_CUDA \ + allocAndInitCudaDeviceData(x, m_x, m_nodal_array_length); \ + allocAndInitCudaDeviceData(vol, m_vol, m_zonal_array_length); \ + allocAndInitCudaDeviceData(real_zones, m_domain->real_zones, iend); + +#define NODAL_ACCUMULATION_3D_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_x, x, m_nodal_array_length); \ + deallocCudaDeviceData(x); \ + deallocCudaDeviceData(vol); \ + deallocCudaDeviceData(real_zones); + +__global__ void nodal_accumulation_3d(Real_ptr vol, + Real_ptr x0, Real_ptr x1, + Real_ptr x2, Real_ptr x3, + Real_ptr x4, Real_ptr x5, + Real_ptr x6, Real_ptr x7, + Index_ptr real_zones, + Index_type ibegin, Index_type iend) +{ + Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = ii + ibegin; + if (i < iend) { + NODAL_ACCUMULATION_3D_BODY_INDEX; + NODAL_ACCUMULATION_3D_RAJA_ATOMIC_BODY(RAJA::cuda_atomic); + } +} + + +void NODAL_ACCUMULATION_3D::runCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + NODAL_ACCUMULATION_3D_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + NODAL_ACCUMULATION_3D_DATA_SETUP_CUDA; + + NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + + nodal_accumulation_3d<<>>(vol, + x0, x1, x2, x3, x4, x5, x6, x7, + real_zones, + ibegin, iend); + cudaErrchk( cudaGetLastError() ); + + } + stopTimer(); + + NODAL_ACCUMULATION_3D_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + NODAL_ACCUMULATION_3D_DATA_SETUP_CUDA; + + NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; + + 
camp::resources::Resource working_res{camp::resources::Cuda()}; + RAJA::TypedListSegment zones(m_domain->real_zones, + m_domain->n_real_zones, + working_res); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::cuda_exec >( + zones, [=] __device__ (Index_type i) { + NODAL_ACCUMULATION_3D_RAJA_ATOMIC_BODY(RAJA::cuda_atomic); + }); + + } + stopTimer(); + + NODAL_ACCUMULATION_3D_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n NODAL_ACCUMULATION_3D : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp b/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp new file mode 100644 index 000000000..f9c2f5fe6 --- /dev/null +++ b/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp @@ -0,0 +1,122 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "NODAL_ACCUMULATION_3D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include "AppsData.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + // + // Define thread block size for HIP execution + // + const size_t block_size = 256; + + +#define NODAL_ACCUMULATION_3D_DATA_SETUP_HIP \ + allocAndInitHipDeviceData(x, m_x, m_nodal_array_length); \ + allocAndInitHipDeviceData(vol, m_vol, m_zonal_array_length); \ + allocAndInitHipDeviceData(real_zones, m_domain->real_zones, iend); + +#define NODAL_ACCUMULATION_3D_DATA_TEARDOWN_HIP \ + getHipDeviceData(m_x, x, m_nodal_array_length); \ + deallocHipDeviceData(x); \ + deallocHipDeviceData(vol); \ + deallocHipDeviceData(real_zones); + +__global__ void nodal_accumulation_3d(Real_ptr vol, + Real_ptr x0, Real_ptr x1, + Real_ptr x2, Real_ptr x3, + Real_ptr x4, Real_ptr x5, + Real_ptr x6, Real_ptr x7, + Index_ptr real_zones, + Index_type ibegin, Index_type iend) +{ + Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = ii + ibegin; + if (i < iend) { + NODAL_ACCUMULATION_3D_BODY_INDEX; + NODAL_ACCUMULATION_3D_RAJA_ATOMIC_BODY(RAJA::hip_atomic); + } +} + + +void NODAL_ACCUMULATION_3D::runHipVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + NODAL_ACCUMULATION_3D_DATA_SETUP; + + if ( vid == Base_HIP ) { + + NODAL_ACCUMULATION_3D_DATA_SETUP_HIP; + + NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + + hipLaunchKernelGGL((nodal_accumulation_3d), dim3(grid_size), dim3(block_size), 0, 0, vol, + x0, x1, x2, x3, x4, x5, x6, x7, + real_zones, + ibegin, iend); + hipErrchk( hipGetLastError() ); + + } + stopTimer(); + + NODAL_ACCUMULATION_3D_DATA_TEARDOWN_HIP; + + } else if ( vid == RAJA_HIP ) { + + NODAL_ACCUMULATION_3D_DATA_SETUP_HIP; + + NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; + + camp::resources::Resource working_res{camp::resources::Hip()}; + RAJA::TypedListSegment zones(m_domain->real_zones, + m_domain->n_real_zones, + working_res); + + startTimer(); + for 
(RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::hip_exec >( + zones, [=] __device__ (Index_type i) { + NODAL_ACCUMULATION_3D_RAJA_ATOMIC_BODY(RAJA::hip_atomic); + }); + + } + stopTimer(); + + NODAL_ACCUMULATION_3D_DATA_TEARDOWN_HIP; + + } else { + std::cout << "\n NODAL_ACCUMULATION_3D : Unknown Hip variant id = " << vid << std::endl; + } +} + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp b/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp new file mode 100644 index 000000000..9676e1657 --- /dev/null +++ b/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp @@ -0,0 +1,147 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "NODAL_ACCUMULATION_3D.hpp" + +#include "RAJA/RAJA.hpp" + +#include "AppsData.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + +void NODAL_ACCUMULATION_3D::runOpenMPVariant(VariantID vid) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + NODAL_ACCUMULATION_3D_DATA_SETUP; + + NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; + + + switch ( vid ) { + + case Base_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for + for (Index_type ii = ibegin ; ii < iend ; ++ii ) { + NODAL_ACCUMULATION_3D_BODY_INDEX; + + Real_type val = 0.125 * vol[i]; + + #pragma omp atomic + x0[i] += val; + #pragma omp atomic + x1[i] += val; + #pragma omp atomic + x2[i] += val; + #pragma omp atomic + x3[i] += val; + #pragma omp atomic + x4[i] += val; + #pragma omp atomic + x5[i] += val; + #pragma omp atomic + x6[i] += val; + #pragma omp atomic + x7[i] += val; + } + + } + stopTimer(); + + break; + } + + case Lambda_OpenMP : { + + auto nodal_accumulation_3d_lam = [=](Index_type ii) { + NODAL_ACCUMULATION_3D_BODY_INDEX; + + Real_type val = 0.125 * vol[i]; + + #pragma omp atomic + x0[i] += val; + #pragma omp atomic + x1[i] += val; + #pragma omp atomic + x2[i] += val; + #pragma omp atomic + x3[i] += val; + #pragma omp atomic + x4[i] += val; + #pragma omp atomic + x5[i] += val; + #pragma omp atomic + x6[i] += val; + #pragma omp atomic + x7[i] += val; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for + for (Index_type ii = ibegin ; ii < iend ; ++ii ) { + nodal_accumulation_3d_lam(ii); + } + + } + stopTimer(); + + break; + } + + case RAJA_OpenMP : { + + camp::resources::Resource working_res{camp::resources::Host()}; + RAJA::TypedListSegment zones(m_domain->real_zones, + m_domain->n_real_zones, + working_res); + + auto nodal_accumulation_3d_lam = [=](Index_type i) { + NODAL_ACCUMULATION_3D_RAJA_ATOMIC_BODY(RAJA::omp_atomic); + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + zones, nodal_accumulation_3d_lam); + + } + stopTimer(); + + break; + } + + default : { + std::cout << "\n NODAL_ACCUMULATION_3D : Unknown variant id = " << vid << std::endl; + } + + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace apps +} // end namespace 
rajaperf diff --git a/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp b/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp new file mode 100644 index 000000000..b53dad400 --- /dev/null +++ b/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp @@ -0,0 +1,126 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "NODAL_ACCUMULATION_3D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include "AppsData.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + +#define NODAL_ACCUMULATION_3D_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + allocAndInitOpenMPDeviceData(x, m_x, m_nodal_array_length, did, hid); \ + allocAndInitOpenMPDeviceData(vol, m_vol, m_zonal_array_length, did, hid); \ + allocAndInitOpenMPDeviceData(real_zones, m_domain->real_zones, iend, did, hid); + +#define NODAL_ACCUMULATION_3D_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_x, x, m_nodal_array_length, hid, did); \ + deallocOpenMPDeviceData(x, did); \ + deallocOpenMPDeviceData(vol, did); \ + deallocOpenMPDeviceData(real_zones, did); + + +void NODAL_ACCUMULATION_3D::runOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + NODAL_ACCUMULATION_3D_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + NODAL_ACCUMULATION_3D_DATA_SETUP_OMP_TARGET; + + NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(x0,x1,x2,x3,x4,x5,x6,x7, \ + vol, real_zones) device( did ) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + for (Index_type ii = ibegin ; ii < iend ; ++ii ) { + NODAL_ACCUMULATION_3D_BODY_INDEX; + + Real_type val = 0.125 * vol[i]; + + #pragma omp atomic + x0[i] += val; + #pragma omp atomic + x1[i] += val; + #pragma omp atomic + x2[i] += val; + #pragma omp atomic + x3[i] += val; + #pragma omp atomic + x4[i] += val; + #pragma omp atomic + x5[i] += val; + #pragma omp atomic + x6[i] += val; + #pragma omp atomic + x7[i] += val; + } + + } + stopTimer(); + + NODAL_ACCUMULATION_3D_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + NODAL_ACCUMULATION_3D_DATA_SETUP_OMP_TARGET; + + NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; + + camp::resources::Resource working_res{camp::resources::Omp()}; + RAJA::TypedListSegment zones(m_domain->real_zones, + m_domain->n_real_zones, + working_res); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + zones, [=](Index_type i) { + NODAL_ACCUMULATION_3D_RAJA_ATOMIC_BODY(RAJA::omp_atomic); + }); + + } + stopTimer(); + + NODAL_ACCUMULATION_3D_DATA_TEARDOWN_OMP_TARGET; + + } else { + std::cout << "\n NODAL_ACCUMULATION_3D : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff 
--git a/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp b/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp new file mode 100644 index 000000000..da26a1d68 --- /dev/null +++ b/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp @@ -0,0 +1,104 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "NODAL_ACCUMULATION_3D.hpp" + +#include "RAJA/RAJA.hpp" + +#include "AppsData.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + +void NODAL_ACCUMULATION_3D::runSeqVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + NODAL_ACCUMULATION_3D_DATA_SETUP; + + NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type ii = ibegin ; ii < iend ; ++ii ) { + NODAL_ACCUMULATION_3D_BODY_INDEX; + NODAL_ACCUMULATION_3D_BODY; + } + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + auto nodal_accumulation_3d_lam = [=](Index_type ii) { + NODAL_ACCUMULATION_3D_BODY_INDEX; + NODAL_ACCUMULATION_3D_BODY; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type ii = ibegin ; ii < iend ; ++ii ) { + nodal_accumulation_3d_lam(ii); + } + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + camp::resources::Resource working_res{camp::resources::Host()}; + RAJA::TypedListSegment zones(m_domain->real_zones, + m_domain->n_real_zones, + working_res); + + auto nodal_accumulation_3d_lam = [=](Index_type i) { + NODAL_ACCUMULATION_3D_RAJA_ATOMIC_BODY(RAJA::seq_atomic); + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall(zones, nodal_accumulation_3d_lam); + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + std::cout << "\n NODAL_ACCUMULATION_3D : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps/NODAL_ACCUMULATION_3D.cpp b/src/apps/NODAL_ACCUMULATION_3D.cpp new file mode 100644 index 000000000..d0bf52eeb --- /dev/null +++ b/src/apps/NODAL_ACCUMULATION_3D.cpp @@ -0,0 +1,97 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "NODAL_ACCUMULATION_3D.hpp" + +#include "RAJA/RAJA.hpp" + +#include "AppsData.hpp" +#include "common/DataUtils.hpp" + +#include + + +namespace rajaperf +{ +namespace apps +{ + + +NODAL_ACCUMULATION_3D::NODAL_ACCUMULATION_3D(const RunParams& params) + : KernelBase(rajaperf::Apps_NODAL_ACCUMULATION_3D, params) +{ + setDefaultProblemSize(100*100*100); // See rzmax in ADomain struct + setDefaultReps(100); + + Index_type rzmax = std::cbrt(getTargetProblemSize())+1; + m_domain = new ADomain(rzmax, /* ndims = */ 3); + + m_nodal_array_length = m_domain->nnalls; + m_zonal_array_length = m_domain->lpz+1; + + setActualProblemSize( m_domain->n_real_zones ); + + setItsPerRep( getActualProblemSize() ); + setKernelsPerRep(1); + // touched data size, not actual number of stores and loads + setBytesPerRep( (0*sizeof(Index_type) + 1*sizeof(Index_type)) * getItsPerRep() + + (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() + + (1*sizeof(Real_type) + 1*sizeof(Real_type)) * (m_domain->imax+1 - m_domain->imin)*(m_domain->jmax+1 - m_domain->jmin)*(m_domain->kmax+1 - m_domain->kmin)); + setFLOPsPerRep(9 * getItsPerRep()); + + checksum_scale_factor = 0.001 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() ); + + setUsesFeature(Forall); + setUsesFeature(Atomic); + + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); + + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); + setVariantDefined( RAJA_OpenMP ); + + setVariantDefined( Base_OpenMPTarget ); + setVariantDefined( RAJA_OpenMPTarget ); + + setVariantDefined( Base_CUDA ); + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( Base_HIP ); + setVariantDefined( RAJA_HIP ); +} + +NODAL_ACCUMULATION_3D::~NODAL_ACCUMULATION_3D() +{ + delete m_domain; +} + +void NODAL_ACCUMULATION_3D::setUp(VariantID vid) +{ + allocAndInitDataConst(m_x, m_nodal_array_length, 0.0, vid); + allocAndInitDataConst(m_vol, m_zonal_array_length, 1.0, vid); +} + +void NODAL_ACCUMULATION_3D::updateChecksum(VariantID vid) +{ + checksum[vid] += calcChecksum(m_x, m_nodal_array_length, checksum_scale_factor ); +} + +void NODAL_ACCUMULATION_3D::tearDown(VariantID vid) +{ + (void) vid; + + deallocData(m_x); + deallocData(m_vol); +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps/NODAL_ACCUMULATION_3D.hpp b/src/apps/NODAL_ACCUMULATION_3D.hpp new file mode 100644 index 000000000..43ff25703 --- /dev/null +++ b/src/apps/NODAL_ACCUMULATION_3D.hpp @@ -0,0 +1,111 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// NODAL_ACCUMULATION_3D kernel reference implementation: +/// +/// NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; +/// +/// for (Index_type ii = ibegin; ii < iend; ++ii ) { +/// Index_type i = real_zones[ii]; +/// +/// Real_type val = 0.125 * vol[i] ; +/// +/// x0[i] += val; +/// x1[i] += val; +/// x2[i] += val; +/// x3[i] += val; +/// x4[i] += val; +/// x5[i] += val; +/// x6[i] += val; +/// x7[i] += val; +/// +/// } +/// + +#ifndef RAJAPerf_Apps_NODAL_ACCUMULATION_3D_HPP +#define RAJAPerf_Apps_NODAL_ACCUMULATION_3D_HPP + +#define NODAL_ACCUMULATION_3D_DATA_SETUP \ + Real_ptr x = m_x; \ + Real_ptr vol = m_vol; \ + \ + Real_ptr x0,x1,x2,x3,x4,x5,x6,x7; \ + \ + Index_ptr real_zones = m_domain->real_zones; + +#define NODAL_ACCUMULATION_3D_BODY_INDEX \ + Index_type i = real_zones[ii]; + +#define NODAL_ACCUMULATION_3D_BODY \ + Real_type val = 0.125 * vol[i]; \ + \ + x0[i] += val; \ + x1[i] += val; \ + x2[i] += val; \ + x3[i] += val; \ + x4[i] += val; \ + x5[i] += val; \ + x6[i] += val; \ + x7[i] += val; + +#define NODAL_ACCUMULATION_3D_RAJA_ATOMIC_BODY(policy) \ + Real_type val = 0.125 * vol[i]; \ + \ + RAJA::atomicAdd(&x0[i], val); \ + RAJA::atomicAdd(&x1[i], val); \ + RAJA::atomicAdd(&x2[i], val); \ + RAJA::atomicAdd(&x3[i], val); \ + RAJA::atomicAdd(&x4[i], val); \ + RAJA::atomicAdd(&x5[i], val); \ + RAJA::atomicAdd(&x6[i], val); \ + RAJA::atomicAdd(&x7[i], val); + + + +#include "common/KernelBase.hpp" + +namespace rajaperf +{ +class RunParams; + +namespace apps +{ +class ADomain; + +class NODAL_ACCUMULATION_3D : public KernelBase +{ +public: + + NODAL_ACCUMULATION_3D(const RunParams& params); + + ~NODAL_ACCUMULATION_3D(); + + void setUp(VariantID vid); + void updateChecksum(VariantID vid); + void tearDown(VariantID vid); + + void runSeqVariant(VariantID vid); + void runOpenMPVariant(VariantID vid); + void runCudaVariant(VariantID vid); + void runHipVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); + +private: + Real_ptr m_x; + Real_ptr m_vol; + + ADomain* m_domain; + Index_type m_nodal_array_length; + Index_type m_zonal_array_length; +}; + +} // end namespace apps +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 41fcbd5e9..b642b768b 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -79,6 +79,7 @@ #include "apps/LTIMES.hpp" #include "apps/LTIMES_NOVIEW.hpp" #include "apps/MASS3DPA.hpp" +#include "apps/NODAL_ACCUMULATION_3D.hpp" #include "apps/PRESSURE.hpp" #include "apps/VOL3D.hpp" @@ -204,6 +205,7 @@ static const std::string KernelNames [] = std::string("Apps_LTIMES"), std::string("Apps_LTIMES_NOVIEW"), std::string("Apps_MASS3DPA"), + std::string("Apps_NODAL_ACCUMULATION_3D"), std::string("Apps_PRESSURE"), std::string("Apps_VOL3D"), @@ -644,6 +646,10 @@ KernelBase* getKernelObject(KernelID kid, kernel = new apps::MASS3DPA(run_params); break; } + case Apps_NODAL_ACCUMULATION_3D : { + kernel = new apps::NODAL_ACCUMULATION_3D(run_params); + break; + } case Apps_PRESSURE : { kernel = new apps::PRESSURE(run_params); break; diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 367aeed72..f5bbe4257 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -134,6 +134,7 @@ enum KernelID { Apps_LTIMES, Apps_LTIMES_NOVIEW, 
Apps_MASS3DPA, + Apps_NODAL_ACCUMULATION_3D, Apps_PRESSURE, Apps_VOL3D, From 36801c11d4eb75306d249a98ea8f02ba69118e5e Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 17 Aug 2021 11:56:05 -0700 Subject: [PATCH 065/392] change bash used in sweep_size.bash --- scripts/sweep_size.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sweep_size.sh b/scripts/sweep_size.sh index 20fe6bdfd..770b4539d 100755 --- a/scripts/sweep_size.sh +++ b/scripts/sweep_size.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash EXECUTABLES="" SIZE_MIN=10000 From 5149b3935941e16ad07d788e71a74c1d0662380c Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Wed, 18 Aug 2021 13:55:48 -0700 Subject: [PATCH 066/392] removing unnecessary flag for clang+cuda jobs, fixing lassen+cuda job flag --- scripts/gitlab/build_and_test.sh | 8 ++++---- scripts/spack_packages/raja_perf/package.py | 9 ++++++--- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index 8da59fe53..01cbb649c 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -156,14 +156,14 @@ then if echo ${sys_type} | grep -q "blueos" && echo ${spec} | grep -q "cuda" ; then if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} then - lrun --smpiargs="-disable_gpu_hooks" ./bin/raja-perf.exe -sp + lrun -n1 --smpiargs="-disable_gpu_hooks" ./bin/raja-perf.exe -sp echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" - echo "lrun --smpiargs='-disable_gpu_hooks' /bin/raja-perf.exe -sp" + echo "lrun -n1 --smpiargs='-disable_gpu_hooks' /bin/raja-perf.exe -sp" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" else - lrun --smpiargs="-disable_gpu_hooks" ./bin/raja-perf.exe --checkrun -sp + lrun -n1 --smpiargs="-disable_gpu_hooks" ./bin/raja-perf.exe --checkrun -sp echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" - echo "lrun --smpiargs='-disable_gpu_hook' ./bin/raja-perf.exe --checkrun -sp" + echo "lrun -n1 --smpiargs='-disable_gpu_hook' ./bin/raja-perf.exe --checkrun -sp" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" fi else diff --git a/scripts/spack_packages/raja_perf/package.py b/scripts/spack_packages/raja_perf/package.py index 27fff2d8b..ec2353856 100644 --- a/scripts/spack_packages/raja_perf/package.py +++ b/scripts/spack_packages/raja_perf/package.py @@ -259,10 +259,13 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): cfg.write(cmake_cache_string("BLT_CXX_STD", "c++14")) cfg.write(cmake_cache_option("ENABLE_TESTS", False)) else: - cuda_release_flags = "-O3 -Xcompiler -Ofast -Xcompiler -finline-functions -Xcompiler -finline-limit=20000" - cuda_reldebinf_flags = "-O3 -g -Xcompiler -Ofast -Xcompiler -finline-functions -Xcompiler -finline-limit=20000" - cuda_debug_flags = "-O0 -g -Xcompiler -O0 -Xcompiler -finline-functions -Xcompiler -finline-limit=20000" cfg.write(cmake_cache_option("ENABLE_TESTS", True)) + + if (not "clang" in cpp_compiler): + cuda_release_flags = "-O3 -Xcompiler -Ofast -Xcompiler -finline-functions -Xcompiler -finline-limit=20000" + cuda_reldebinf_flags = "-O3 -g -Xcompiler -Ofast -Xcompiler -finline-functions -Xcompiler -finline-limit=20000" + cuda_debug_flags = "-O0 -g -Xcompiler -O0 -Xcompiler -finline-functions -Xcompiler -finline-limit=20000" + cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_RELEASE", cuda_release_flags)) cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_RELWITHDEBINFO", cuda_reldebinf_flags)) From 6ea730324640f22234bce7708aa13bad427ad379 Mon Sep 17 00:00:00 2001 From: 
Kristi Belcher Date: Wed, 18 Aug 2021 14:32:12 -0700 Subject: [PATCH 067/392] fixing the cuda job flags again --- scripts/spack_packages/raja_perf/package.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/scripts/spack_packages/raja_perf/package.py b/scripts/spack_packages/raja_perf/package.py index ec2353856..e4a582d22 100644 --- a/scripts/spack_packages/raja_perf/package.py +++ b/scripts/spack_packages/raja_perf/package.py @@ -248,24 +248,19 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): cfg.write(cmake_cache_entry("CMAKE_CUDA_COMPILER", cudacompiler)) + cuda_release_flags = "-O3" + cuda_reldebinf_flags = "-O3 -g" + cuda_debug_flags = "-O0 -g" + if ("xl" in cpp_compiler): cfg.write(cmake_cache_entry("CMAKE_CUDA_FLAGS", "-Xcompiler -O3 -Xcompiler -qxlcompatmacros -Xcompiler -qalias=noansi " + "-Xcompiler -qsmp=omp -Xcompiler -qhot -Xcompiler -qnoeh -Xcompiler -qsuppress=1500-029 " + "-Xcompiler -qsuppress=1500-036 -Xcompiler -qsuppress=1500-030")) - cuda_release_flags = "-O3" - cuda_reldebinf_flags = "-O3 -g" - cuda_debug_flags = "-O0 -g" - cfg.write(cmake_cache_string("BLT_CXX_STD", "c++14")) cfg.write(cmake_cache_option("ENABLE_TESTS", False)) else: cfg.write(cmake_cache_option("ENABLE_TESTS", True)) - if (not "clang" in cpp_compiler): - cuda_release_flags = "-O3 -Xcompiler -Ofast -Xcompiler -finline-functions -Xcompiler -finline-limit=20000" - cuda_reldebinf_flags = "-O3 -g -Xcompiler -Ofast -Xcompiler -finline-functions -Xcompiler -finline-limit=20000" - cuda_debug_flags = "-O0 -g -Xcompiler -O0 -Xcompiler -finline-functions -Xcompiler -finline-limit=20000" - cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_RELEASE", cuda_release_flags)) cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_RELWITHDEBINFO", cuda_reldebinf_flags)) From dde951d572743456c743094522a4b9e20c948c4e Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Wed, 18 Aug 2021 15:48:12 -0700 Subject: [PATCH 068/392] editing flags again for cuda+clang jobs --- scripts/spack_packages/raja_perf/package.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/scripts/spack_packages/raja_perf/package.py b/scripts/spack_packages/raja_perf/package.py index e4a582d22..8710e87fd 100644 --- a/scripts/spack_packages/raja_perf/package.py +++ b/scripts/spack_packages/raja_perf/package.py @@ -248,17 +248,21 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): cfg.write(cmake_cache_entry("CMAKE_CUDA_COMPILER", cudacompiler)) - cuda_release_flags = "-O3" - cuda_reldebinf_flags = "-O3 -g" - cuda_debug_flags = "-O0 -g" if ("xl" in cpp_compiler): cfg.write(cmake_cache_entry("CMAKE_CUDA_FLAGS", "-Xcompiler -O3 -Xcompiler -qxlcompatmacros -Xcompiler -qalias=noansi " + "-Xcompiler -qsmp=omp -Xcompiler -qhot -Xcompiler -qnoeh -Xcompiler -qsuppress=1500-029 " + "-Xcompiler -qsuppress=1500-036 -Xcompiler -qsuppress=1500-030")) + cuda_release_flags = "-O3" + cuda_reldebinf_flags = "-O3 -g" + cuda_debug_flags = "-O0 -g" cfg.write(cmake_cache_string("BLT_CXX_STD", "c++14")) cfg.write(cmake_cache_option("ENABLE_TESTS", False)) else: + cuda_release_flags = "-O3 -Xcompiler -Ofast -Xcompiler -finline-functions" + cuda_reldebinf_flags = "-O3 -g -Xcompiler -Ofast -Xcompiler -finline-functions" + cuda_debug_flags = "-O0 -g -Xcompiler -O0 -Xcompiler -finline-functions" + cfg.write(cmake_cache_string("BLT_CXX_STD", "c++11")) cfg.write(cmake_cache_option("ENABLE_TESTS", True)) From b8bc43d0b5f01de514905f4b2513358cd05b5b53 Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Fri, 20 Aug 
2021 11:32:30 -0700 Subject: [PATCH 069/392] getting rid of clang-ibm, adding openmp-target jobs --- .gitlab/lassen-jobs.yml | 26 ++++++++++++++------- scripts/spack_packages/raja_perf/package.py | 11 +++++++++ 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/.gitlab/lassen-jobs.yml b/.gitlab/lassen-jobs.yml index 14501c6d2..7b31e9aef 100644 --- a/.gitlab/lassen-jobs.yml +++ b/.gitlab/lassen-jobs.yml @@ -9,14 +9,14 @@ ## CPU ONLY ########### -ibm_clang_11: +clang_11_0_0: variables: - SPEC: "%clang@11.0.0ibm" + SPEC: "%clang@11.0.0" extends: .build_and_test_on_lassen -ibm_clang_11_gcc_8: +clang_11_gcc_8: variables: - SPEC: "%clang@11.0.0ibm cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1" + SPEC: "%clang@11.0.0 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1" extends: .build_and_test_on_lassen gcc_8_3_1: @@ -40,14 +40,14 @@ xl_16_1_1_7_gcc_8_3_1: ## CUDA ########### -ibm_clang_11_cuda: +clang_11_cuda: variables: - SPEC: "+cuda cuda_arch=70 %clang@11.0.0ibm ^cuda@10.1.168" + SPEC: "+cuda cuda_arch=70 %clang@11.0.0 ^cuda@10.1.168" extends: .build_and_test_on_lassen -ibm_clang_11_gcc_8_cuda: +clang_11_gcc_8_cuda: variables: - SPEC: "+cuda %clang@11.0.0ibm cuda_arch=70 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 ^cuda@10.1.168" + SPEC: "+cuda %clang@11.0.0 cuda_arch=70 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 ^cuda@10.1.168" extends: .build_and_test_on_lassen gcc_8_3_1_cuda: @@ -73,6 +73,16 @@ xl_16_1_1_7_gcc_8_3_1_cuda_11: ## EXTRAS ########### +xl_16_1_1_7_omp_target (build and test on lassen): + variables: + SPEC: "%xl@16.1.1.7+openmp+openmp_target ^cmake@3.14.5" + extends: .build_and_test_on_lassen + +clang_11_0_0_omp_target (build and test on lassen): + variables: + SPEC: "%clang@11.0.0+openmp+openmp_target ^cmake@3.14.5" + extends: .build_and_test_on_lassen + clang_11_0_0_libcpp (build and test on lassen): variables: SPEC: "%clang@11.0.0+libcpp" diff --git a/scripts/spack_packages/raja_perf/package.py b/scripts/spack_packages/raja_perf/package.py index 8710e87fd..b0652d3c5 100644 --- a/scripts/spack_packages/raja_perf/package.py +++ b/scripts/spack_packages/raja_perf/package.py @@ -76,6 +76,7 @@ class RajaPerf(CMakePackage, CudaPackage): version('0.4.0', tag='v0.4.0', submodules="True") variant('openmp', default=True, description='Build OpenMP backend') + variant('openmp_target', default=False, description='Build with OpenMP target support') variant('shared', default=False, description='Build Shared Libs') variant('libcpp', default=False, description='Uses libc++ instead of libstdc++') variant('hip', default=False, description='Build with HIP support') @@ -87,6 +88,7 @@ class RajaPerf(CMakePackage, CudaPackage): depends_on('hip', when='+hip') conflicts('+openmp', when='+hip') + conflicts('~openmp', when='+openmp_target', msg='OpenMP target requires OpenMP') phases = ['hostconfig', 'cmake', 'build', 'install'] @@ -312,6 +314,15 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): else: cfg.write(cmake_cache_option("ENABLE_HIP", False)) + cfg.write(cmake_cache_option("ENABLE_OPENMP_TARGET", "+openmp_target" in spec)) + if "+openmp_target" in spec: + if ('%xl' in spec): + cfg.write(cmake_cache_string("OpenMP_CXX_FLAGS", "-qsmp=omp;-qoffload;-qnoeh;-qalias=noansi")) + if ('%clang' in spec): + 
cfg.write(cmake_cache_string("OpenMP_CXX_FLAGS", "-fopenmp;-fopenmp-targets=nvptx64-nvidia-cuda")) + cfg.write(cmake_cache_option("ENABLE_CUDA", False)) + + cfg.write("#------------------{0}\n".format("-" * 60)) cfg.write("# Other\n") cfg.write("#------------------{0}\n\n".format("-" * 60)) From 60e0cec6af559e20002b20bc660e7f583c0aa825 Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Fri, 20 Aug 2021 14:12:11 -0700 Subject: [PATCH 070/392] update README --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index 323c6fc36..ac6d0d46c 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,7 @@ Table of Contents 2. [Running the Suite](#running-the-suite) 3. [Generated output](#generated-output) 4. [Adding kernels and variants](#adding-kernels-and-variants) +4. [Continuous Integration](#continuous-integration) 5. [Contributions](#contributions) 6. [Authors](#authors) 7. [Copyright and Release](#copyright-and-release) @@ -706,6 +707,14 @@ above. * * * +# Continuous Integration + +RAJAPerf Suite uses continuous integration to ensure that changes added to the repository are well integrated and tested for compatability with the rest of the existing code base. Our CI tests incude a variety of vetted configurations that run on different LC machines. + +RAJAPerf Suite shares its Gitlab CI workflow with other projects. The documentation is therefore [shared](https://radiuss-ci.readthedocs.io/en/latest/uberenv.html#ci). + +* * * + # Contributions The RAJA Performance Suite is a work-in-progress, with new kernels and variants From 6504a0f83f01da8a831263c9603a90602823461e Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Fri, 20 Aug 2021 15:30:59 -0700 Subject: [PATCH 071/392] update radiuss... --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index a35701f59..5b70b2cbc 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit a35701f59b31976b673b312b0f7959a441087135 +Subproject commit 5b70b2cbc0b818c71d30b3afe011ab76e9c0edfa From 514f14f70642cdcb7fe130e9f6fa32ae6f306b81 Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Mon, 23 Aug 2021 09:21:46 -0700 Subject: [PATCH 072/392] triggering the CI --- .gitlab/lassen-jobs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/lassen-jobs.yml b/.gitlab/lassen-jobs.yml index 7b31e9aef..8554ff1ad 100644 --- a/.gitlab/lassen-jobs.yml +++ b/.gitlab/lassen-jobs.yml @@ -71,7 +71,7 @@ xl_16_1_1_7_gcc_8_3_1_cuda_11: ########## ## EXTRAS -########### +########## xl_16_1_1_7_omp_target (build and test on lassen): variables: From 8a2f855a6d4b5d534fa6998e37c3a1e0682e9867 Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Thu, 26 Aug 2021 07:39:19 -0700 Subject: [PATCH 073/392] pulling in radiuss update --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index 5b70b2cbc..56be82d3f 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit 5b70b2cbc0b818c71d30b3afe011ab76e9c0edfa +Subproject commit 56be82d3f644fef0870da3272d03b916f89c53c7 From f005d7d61e50f80b2292a4a023e60459a12e91c8 Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Mon, 30 Aug 2021 14:24:34 -0700 Subject: [PATCH 074/392] updating submodules --- blt | 2 +- tpl/RAJA | 2 +- 2 files changed, 2 insertions(+), 2 
deletions(-) diff --git a/blt b/blt index ddd5a0ca7..7ec2cb805 160000 --- a/blt +++ b/blt @@ -1 +1 @@ -Subproject commit ddd5a0ca7c566d0ae14270b66625c8a363630ddb +Subproject commit 7ec2cb80525b55d06da683a876b382472ef70661 diff --git a/tpl/RAJA b/tpl/RAJA index 357933a42..1ed927902 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit 357933a42842dd91de5c1034204d937fce0a2a44 +Subproject commit 1ed927902240b072550dbe7a1a3e5ae67f125aa5 From 6bd063d8a50b6cfa2111624885edcfa479a9c628 Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Mon, 30 Aug 2021 14:31:32 -0700 Subject: [PATCH 075/392] update blt --- blt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blt b/blt index 7ec2cb805..ddd5a0ca7 160000 --- a/blt +++ b/blt @@ -1 +1 @@ -Subproject commit 7ec2cb80525b55d06da683a876b382472ef70661 +Subproject commit ddd5a0ca7c566d0ae14270b66625c8a363630ddb From 0cb82a279531769a512d71a9f58dbbd4a5be728f Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Mon, 30 Aug 2021 15:45:37 -0700 Subject: [PATCH 076/392] updating some more... --- tpl/RAJA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpl/RAJA b/tpl/RAJA index 1ed927902..357933a42 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit 1ed927902240b072550dbe7a1a3e5ae67f125aa5 +Subproject commit 357933a42842dd91de5c1034204d937fce0a2a44 From a846f8156877ec943c0cb26aa456e1e7f8e0d3f1 Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Wed, 1 Sep 2021 15:02:15 -0700 Subject: [PATCH 077/392] Turn off gcc5-debug. --- Dockerfile | 8 -------- 1 file changed, 8 deletions(-) diff --git a/Dockerfile b/Dockerfile index 53e3c9416..4dbf8e95c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,14 +14,6 @@ RUN mkdir build && cd build && cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_WARNINGS= RUN cd build && make -j 16 RUN cd build && ./bin/raja-perf.exe -sp -FROM axom/compilers:gcc-5 AS gcc5-debug -ENV GTEST_COLOR=1 -COPY --chown=axom:axom . /home/axom/workspace -WORKDIR /home/axom/workspace -RUN mkdir build && cd build && cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_BUILD_TYPE=Debug -DENABLE_WARNINGS=On -DENABLE_COVERAGE=On -DENABLE_OPENMP=On .. -RUN cd build && make -j 16 -RUN cd build && ./bin/raja-perf.exe --checkrun -sp - FROM axom/compilers:gcc-6 AS gcc6 ENV GTEST_COLOR=1 COPY --chown=axom:axom . /home/axom/workspace From 7a48f8fb48f9680e161dbc6aefde31af65fe79db Mon Sep 17 00:00:00 2001 From: mdavis36 Date: Wed, 1 Sep 2021 15:21:28 -0700 Subject: [PATCH 078/392] Turn off gcc5-debug on azure. 
--- azure-pipelines.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 02d749987..9208fafa2 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -34,8 +34,6 @@ jobs: matrix: gcc5: docker_target: gcc5 - gcc5-debug: - docker_target: gcc5-debug gcc6: docker_target: gcc6 gcc7: From f82ca118ae9d59bbb77883f058704b8d5b5b7aa9 Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Thu, 2 Sep 2021 08:55:50 -0700 Subject: [PATCH 079/392] updating hip job, updating raja --- .gitlab/corona-jobs.yml | 10 +++++----- tpl/RAJA | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.gitlab/corona-jobs.yml b/.gitlab/corona-jobs.yml index 4d93d3a5a..17b5b2348 100644 --- a/.gitlab/corona-jobs.yml +++ b/.gitlab/corona-jobs.yml @@ -5,11 +5,6 @@ ## SPDX-License-Identifier: (BSD-3-Clause) ############################################################################## -hip_4_0_gcc_8_1_0: - variables: - SPEC: "+hip~openmp %gcc@8.1.0 cxxflags='-finline-functions -finline-limit=20000' cflags='-finline-functions -finline-limit=20000' ^hip@4.0.0" - extends: .build_and_test_on_corona - hip_4_1_gcc_8_1_0: variables: SPEC: "+hip~openmp %gcc@8.1.0 cxxflags='-finline-functions -finline-limit=20000' cflags='-finline-functions -finline-limit=20000' ^hip@4.1.0" @@ -19,3 +14,8 @@ hip_4_1_clang_9_0_0: variables: SPEC: "+hip~openmp %clang@9.0.0 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 ^hip@4.1.0" extends: .build_and_test_on_corona + +hip_4_2_gcc_8_1_0: + variables: + SPEC: "+hip~openmp %gcc@8.1.0 cxxflags='-finline-functions -finline-limit=20000' cflags='-finline-functions -finline-limit=20000' ^hip@4.2.0" + extends: .build_and_test_on_corona diff --git a/tpl/RAJA b/tpl/RAJA index 357933a42..0506cea3a 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit 357933a42842dd91de5c1034204d937fce0a2a44 +Subproject commit 0506cea3aaad168de79df59a8df9fc6f27799aa3 From 849725e430f701d840df45ffc04c3a61e8b5654b Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Thu, 2 Sep 2021 15:52:45 -0700 Subject: [PATCH 080/392] adding blt flag for clang-cuda job --- scripts/spack_packages/raja_perf/package.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/spack_packages/raja_perf/package.py b/scripts/spack_packages/raja_perf/package.py index b0652d3c5..70db71fa4 100644 --- a/scripts/spack_packages/raja_perf/package.py +++ b/scripts/spack_packages/raja_perf/package.py @@ -267,7 +267,10 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): cfg.write(cmake_cache_string("BLT_CXX_STD", "c++11")) cfg.write(cmake_cache_option("ENABLE_TESTS", True)) - + if ("clang" in cpp_compiler): + cfg.write(cmake_cache_string("BLT_CMAKE_IMPLICIT_LINK_DIRECTORIES_EXCLUDE", + "/usr/tce/packages/gcc/gcc-4.9.3/lib64;/usr/tce/packages/gcc/gcc-4.9.3/lib64/gcc/powerpc64le-unknown-linux-gnu/4.9.3")) + cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_RELEASE", cuda_release_flags)) cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_RELWITHDEBINFO", cuda_reldebinf_flags)) cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_DEBUG", cuda_debug_flags)) From 387227aa65f1faf849ed9eae1e72abd6772a4532 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Thu, 2 Sep 2021 20:20:14 -0500 Subject: [PATCH 081/392] inital HIP implementation of struct reduction test --- src/CMakeLists.txt | 3 + src/basic/CMakeLists.txt | 6 + src/basic/REDUCE_STRUCT-Cuda.cpp | 37 +++++ src/basic/REDUCE_STRUCT-Hip.cpp | 209 ++++++++++++++++++++++++++ src/basic/REDUCE_STRUCT-OMP.cpp | 28 ++++ 
src/basic/REDUCE_STRUCT-OMPTarget.cpp | 37 +++++ src/basic/REDUCE_STRUCT-Seq.cpp | 28 ++++ src/basic/REDUCE_STRUCT.cpp | 84 +++++++++++ src/basic/REDUCE_STRUCT.hpp | 113 ++++++++++++++ src/common/RAJAPerfSuite.cpp | 6 + src/common/RAJAPerfSuite.hpp | 1 + 11 files changed, 552 insertions(+) create mode 100644 src/basic/REDUCE_STRUCT-Cuda.cpp create mode 100644 src/basic/REDUCE_STRUCT-Hip.cpp create mode 100644 src/basic/REDUCE_STRUCT-OMP.cpp create mode 100644 src/basic/REDUCE_STRUCT-OMPTarget.cpp create mode 100644 src/basic/REDUCE_STRUCT-Seq.cpp create mode 100644 src/basic/REDUCE_STRUCT.cpp create mode 100644 src/basic/REDUCE_STRUCT.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 877bf5306..d1e1e8363 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -97,6 +97,9 @@ blt_add_executable( basic/REDUCE3_INT.cpp basic/REDUCE3_INT-Seq.cpp basic/REDUCE3_INT-OMPTarget.cpp + basic/REDUCE_STRUCT.cpp + basic/REDUCE_STRUCT-Seq.cpp + basic/REDUCE_STRUCT-OMPTarget.cpp basic/TRAP_INT.cpp basic/TRAP_INT-Seq.cpp basic/TRAP_INT-OMPTarget.cpp diff --git a/src/basic/CMakeLists.txt b/src/basic/CMakeLists.txt index 250529814..978b236f7 100644 --- a/src/basic/CMakeLists.txt +++ b/src/basic/CMakeLists.txt @@ -74,6 +74,12 @@ blt_add_library( REDUCE3_INT-Cuda.cpp REDUCE3_INT-OMP.cpp REDUCE3_INT-OMPTarget.cpp + REDUCE_STRUCT.cpp + REDUCE_STRUCT-Seq.cpp + REDUCE_STRUCT-Hip.cpp + REDUCE_STRUCT-Cuda.cpp + REDUCE_STRUCT-OMP.cpp + REDUCE_STRUCT-OMPTarget.cpp TRAP_INT.cpp TRAP_INT-Seq.cpp TRAP_INT-Hip.cpp diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp new file mode 100644 index 000000000..8c03fc0f6 --- /dev/null +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -0,0 +1,37 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "REDUCE_STRUCT.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + +void REDUCE_STRUCT::runCudaVariant(VariantID vid) +{ + RAJA_UNUSED_VAR(vid); +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp new file mode 100644 index 000000000..1a7b8d092 --- /dev/null +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -0,0 +1,209 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "REDUCE_STRUCT.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define thread block size for HIP execution + // + const size_t block_size = 256; + + +#define REDUCE_STRUCT_DATA_SETUP_HIP \ + allocAndInitHipDeviceData(particles.x, m_x, particles.N); \ + allocAndInitHipDeviceData(particles.y, m_y, particles.N); \ + +#define REDUCE_STRUCT_DATA_TEARDOWN_HIP \ + deallocHipDeviceData(particles.x); \ + deallocHipDeviceData(particles.y); \ + +__global__ void reduce_struct(Real_ptr x, Real_ptr y, + Real_ptr xcenter, Real_ptr xmin, Real_ptr xmax, + Real_ptr ycenter, Real_ptr ymin, Real_ptr ymax, + Index_type iend) +{ + + //x + HIP_DYNAMIC_SHARED( Real_type, shared) + Real_type* pxsum = (Real_type*)&shared[ 0 * blockDim.x ]; + Real_type* pxmin = (Real_type*)&shared[ 1 * blockDim.x ]; + Real_type* pxmax = (Real_type*)&shared[ 2 * blockDim.x ]; + //y + Real_type* pysum = (Real_type*)&shared[ 3 * blockDim.x ]; + Real_type* pymin = (Real_type*)&shared[ 4 * blockDim.x ]; + Real_type* pymax = (Real_type*)&shared[ 5 * blockDim.x ]; + + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + + //x + pxsum[ threadIdx.x ] = 0.0; + pxmin[ threadIdx.x ] = std::numeric_limits::max(); + pxmax[ threadIdx.x ] = std::numeric_limits::min(); + //y + pysum[ threadIdx.x ] = 0.0; + pymin[ threadIdx.x ] = std::numeric_limits::max(); + pymax[ threadIdx.x ] = std::numeric_limits::min(); + + + for ( ; i < iend ; i += gridDim.x * blockDim.x ) { + //x + pxsum[ threadIdx.x ] += x[ i ]; + pxmin[ threadIdx.x ] = RAJA_MIN( pxmin[ threadIdx.x ], x[ i ] ); + pxmax[ threadIdx.x ] = RAJA_MAX( pxmax[ threadIdx.x ], x[ i ] ); + //y + pysum[ threadIdx.x ] += y[ i ]; + pymin[ threadIdx.x ] = RAJA_MIN( pymin[ threadIdx.x ], y[ i ] ); + pymax[ threadIdx.x ] = RAJA_MAX( pymax[ threadIdx.x ], y[ i ] ); + + } + __syncthreads(); + + for ( i = blockDim.x / 2; i > 0; i /= 2 ) { + if ( threadIdx.x < i ) { + //x + pxsum[ threadIdx.x ] += pxsum[ threadIdx.x + i ]; + pxmin[ threadIdx.x ] = RAJA_MIN( pxmin[ threadIdx.x ], pxmin[ threadIdx.x + i ] ); + pxmax[ threadIdx.x ] = RAJA_MAX( pxmax[ threadIdx.x ], pxmax[ threadIdx.x + i ] ); + //y + pysum[ threadIdx.x ] += pysum[ threadIdx.x + i ]; + pymin[ threadIdx.x ] = RAJA_MIN( pymin[ threadIdx.x ], pymin[ threadIdx.x + i ] ); + pymax[ threadIdx.x ] = RAJA_MAX( pymax[ threadIdx.x ], pymax[ threadIdx.x + i ] ); + + } + __syncthreads(); + } + +// serialized access to shared data; + if ( threadIdx.x == 0 ) { + RAJA::atomicAdd( xcenter, pxsum[ 0 ] / (Real_type)iend); + RAJA::atomicMin( xmin, pxmin[ 0 ] ); + RAJA::atomicMax( xmax, pxmax[ 0 ] ); + + RAJA::atomicAdd( ycenter, pysum[ 0 ] / (Real_type)iend); + RAJA::atomicMin( ymin, pymin[ 0 ] ); + RAJA::atomicMax( ymax, pymax[ 0 ] ); + } +} + + +void REDUCE_STRUCT::runHipVariant(VariantID vid) +{ + const Index_type run_reps = 1; + const Index_type ibegin = 0; + const Index_type iend = 101; + + REDUCE_STRUCT_DATA_SETUP; + + if ( vid == Base_HIP ) { + + REDUCE_STRUCT_DATA_SETUP_HIP; + + for (int i=0;i xsum; + RAJA::ReduceMin xmin; + RAJA::ReduceMax xmax; + + RAJA::forall< RAJA::hip_exec >( + RAJA::RangeSegment(ibegin, particles.N+1), [=] __device__ (Index_type i) { + REDUCE_STRUCT_BODY_RAJA; + }); + + + particles.SetCenter(static_cast(xsum.get()/(particles.N+1)),0.0); + 
particles.SetXMin(static_cast(xmin.get())); + particles.SetXMax(static_cast(xmax.get())); + + //printf("x center = %f\n", particles.GetCenter()[0]); + //printf("x min = %f\n", particles.GetXMin()); + //printf("x max = %f\n", particles.GetXMax()); + + } + stopTimer(); + + REDUCE_STRUCT_DATA_TEARDOWN_HIP; + + } else { + std::cout << "\n REDUCE_STRUCT : Unknown Hip variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/basic/REDUCE_STRUCT-OMP.cpp b/src/basic/REDUCE_STRUCT-OMP.cpp new file mode 100644 index 000000000..dbafb9abd --- /dev/null +++ b/src/basic/REDUCE_STRUCT-OMP.cpp @@ -0,0 +1,28 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "REDUCE_STRUCT.hpp" + +#include "RAJA/RAJA.hpp" + +#include +#include + +namespace rajaperf +{ +namespace basic +{ + + +void REDUCE_STRUCT::runOpenMPVariant(VariantID vid) +{ + RAJA_UNUSED_VAR(vid); +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/REDUCE_STRUCT-OMPTarget.cpp b/src/basic/REDUCE_STRUCT-OMPTarget.cpp new file mode 100644 index 000000000..d01ef879e --- /dev/null +++ b/src/basic/REDUCE_STRUCT-OMPTarget.cpp @@ -0,0 +1,37 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "REDUCE_STRUCT.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + +void REDUCE_STRUCT::runOpenMPTargetVariant(VariantID vid) +{ + RAJA_UNUSED_VAR(vid); +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic/REDUCE_STRUCT-Seq.cpp b/src/basic/REDUCE_STRUCT-Seq.cpp new file mode 100644 index 000000000..2dd18b32c --- /dev/null +++ b/src/basic/REDUCE_STRUCT-Seq.cpp @@ -0,0 +1,28 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "REDUCE_STRUCT.hpp" + +#include "RAJA/RAJA.hpp" + +#include +#include + +namespace rajaperf +{ +namespace basic +{ + + +void REDUCE_STRUCT::runSeqVariant(VariantID vid) +{ + return; +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/REDUCE_STRUCT.cpp b/src/basic/REDUCE_STRUCT.cpp new file mode 100644 index 000000000..274c79b13 --- /dev/null +++ b/src/basic/REDUCE_STRUCT.cpp @@ -0,0 +1,84 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "REDUCE_STRUCT.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + +REDUCE_STRUCT::REDUCE_STRUCT(const RunParams& params) + : KernelBase(rajaperf::Basic_REDUCE_STRUCT, params) +{ + setDefaultProblemSize(100); +//setDefaultReps(5000); +// Set reps to low value until we resolve RAJA omp-target +// reduction performance issues + setDefaultReps(1); + + setActualProblemSize( 100 ); + + setItsPerRep( 1 ); + setKernelsPerRep(1); + setBytesPerRep( (6*sizeof(Real_type) + 6*sizeof(Real_type)) + + (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() ); + setFLOPsPerRep(1 * getActualProblemSize() + 1); + + setUsesFeature(Forall); + setUsesFeature(Reduction); + + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); + + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); + setVariantDefined( RAJA_OpenMP ); + + setVariantDefined( Base_OpenMPTarget ); + setVariantDefined( RAJA_OpenMPTarget ); + + setVariantDefined( Base_CUDA ); + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( Base_HIP ); + setVariantDefined( RAJA_HIP ); +} + +REDUCE_STRUCT::~REDUCE_STRUCT() +{ +} + +void REDUCE_STRUCT::setUp(VariantID vid) +{ + allocAndInitData(m_x, getActualProblemSize(), vid); + allocAndInitData(m_y, getActualProblemSize(), vid); +} + +void REDUCE_STRUCT::updateChecksum(VariantID vid) +{ + return; +} + +void REDUCE_STRUCT::tearDown(VariantID vid) +{ + (void) vid; + deallocData(m_x); + deallocData(m_y); +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/REDUCE_STRUCT.hpp b/src/basic/REDUCE_STRUCT.hpp new file mode 100644 index 000000000..54b34d29a --- /dev/null +++ b/src/basic/REDUCE_STRUCT.hpp @@ -0,0 +1,113 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// REDUCE_STRUCT kernel reference implementation: +/// +/// Real_type xsum = 0.0; +/// Real_type xmin = 0.0; +/// Real_type xmax = 0.0; +/// +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// xsum += x[i] ; +/// xmin = RAJA_MIN(xmin, x[i]) ; +/// xmax = RAJA_MAX(xmax, x[i]) ; +/// } +/// +/// particles.xcenter += xsum; +/// particles.xcenter /= particles.N +/// particles.xmin = RAJA_MIN(m_xmin, xmin); +/// particles.xmax = RAJA_MAX(m_xmax, xmax); +/// +/// RAJA_MIN/MAX are macros that do what you would expect. +/// + +#ifndef RAJAPerf_Basic_REDUCE_STRUCT_HPP +#define RAJAPerf_Basic_REDUCE_STRUCT_HPP + + +#define REDUCE_STRUCT_DATA_SETUP \ + particles_t particles; \ + particles.N = 100; \ + Real_type X_MIN = 0.0, X_MAX = 100.0; \ + Real_type Y_MIN = 0.0, Y_MAX = 50.0; \ + Real_type Lx = (X_MAX) - (X_MIN); \ + Real_type Ly = (Y_MAX) - (Y_MIN); \ + Real_type dx = Lx/(Real_type)(particles.N); \ + Real_type dy = Ly/(Real_type)(particles.N); \ + Real_type DX = dx*(particles.N-1); \ + Real_type DY = dy*(particles.N-1); + +#define REDUCE_STRUCT_BODY \ + particles.xcenter += particles.x[i] ; \ + particles.xcenter /= particles.N \ + particles.xmin = RAJA_MIN(particles.xmin, particles.x[i]) ; \ + particles.xmax = RAJA_MAX(particles.xmax, particles.x[i]) ; + +#define REDUCE_STRUCT_BODY_RAJA \ + xsum += particles.x[i] ; \ + xmin.min(particles.x[i]) ; \ + xmax.max(particles.x[i]) ; + + +#include "common/KernelBase.hpp" + +namespace rajaperf +{ +class RunParams; + +namespace basic +{ + +class REDUCE_STRUCT : public KernelBase +{ +public: + + REDUCE_STRUCT(const RunParams& params); + + ~REDUCE_STRUCT(); + + void setUp(VariantID vid); + void updateChecksum(VariantID vid); + void tearDown(VariantID vid); + + void runSeqVariant(VariantID vid); + void runOpenMPVariant(VariantID vid); + void runCudaVariant(VariantID vid); + void runHipVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); + +private: + Real_ptr m_x; Real_ptr m_y; + + struct particles_t{ + Int_type N; + Real_ptr x, y; + + Real_ptr GetCenter(){return ¢er[0];}; + Real_type GetXMax(){return xmax;}; + Real_type GetXMin(){return xmin;}; + Real_type GetYMax(){return ymax;}; + Real_type GetYMin(){return ymin;}; + void SetCenter(Real_type xval, Real_type yval){this->center[0]=xval, this->center[1]=yval;}; + void SetXMin(Real_type val){this->xmin=val;}; + void SetXMax(Real_type val){this->xmax=val;}; + void SetYMin(Real_type val){this->ymin=val;}; + void SetYMax(Real_type val){this->ymax=val;}; + //results + private: + Real_type center[2]; + Real_type xmin, xmax; + Real_type ymin, ymax; + }; +}; + +} // end namespace basic +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 41fcbd5e9..8ae39c65c 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -24,6 +24,7 @@ #include "basic/PI_ATOMIC.hpp" #include "basic/PI_REDUCE.hpp" #include "basic/REDUCE3_INT.hpp" +#include "basic/REDUCE_STRUCT.hpp" #include "basic/TRAP_INT.hpp" // @@ -149,6 +150,7 @@ static const std::string KernelNames [] = std::string("Basic_PI_ATOMIC"), std::string("Basic_PI_REDUCE"), std::string("Basic_REDUCE3_INT"), + std::string("Basic_REDUCE_STRUCT"), std::string("Basic_TRAP_INT"), // @@ -472,6 +474,10 @@ KernelBase* getKernelObject(KernelID kid, kernel = new basic::REDUCE3_INT(run_params); 
break; } + case Basic_REDUCE_STRUCT : { + kernel = new basic::REDUCE_STRUCT(run_params); + break; + } case Basic_TRAP_INT : { kernel = new basic::TRAP_INT(run_params); break; diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 367aeed72..9d3a87c7a 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -79,6 +79,7 @@ enum KernelID { Basic_PI_ATOMIC, Basic_PI_REDUCE, Basic_REDUCE3_INT, + Basic_REDUCE_STRUCT, Basic_TRAP_INT, // From 1d764783a229297b750b3c4229651c714a950538 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 3 Sep 2021 11:13:47 -0500 Subject: [PATCH 082/392] adding struct reduction test cuda variant --- src/basic/REDUCE_STRUCT-Cuda.cpp | 164 ++++++++++++++++++++++++++++++- src/basic/REDUCE_STRUCT-Hip.cpp | 20 +--- src/basic/REDUCE_STRUCT.hpp | 19 ++-- 3 files changed, 177 insertions(+), 26 deletions(-) diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index 8c03fc0f6..ccc064934 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -22,13 +22,173 @@ namespace basic { // - // Define thread block size for CUDA execution + // Define thread block size for Cuda execution // const size_t block_size = 256; + +#define REDUCE_STRUCT_DATA_SETUP_CUDA \ + allocAndInitCudaDeviceData(particles.x, m_x, particles.N); \ + allocAndInitCudaDeviceData(particles.y, m_y, particles.N); \ + +#define REDUCE_STRUCT_DATA_TEARDOWN_CUDA \ + deallocCudaDeviceData(particles.x); \ + deallocCudaDeviceData(particles.y); \ + +__global__ void reduce_struct(Real_ptr x, Real_ptr y, + Real_ptr xcenter, Real_ptr xmin, Real_ptr xmax, + Real_ptr ycenter, Real_ptr ymin, Real_ptr ymax, + Index_type iend) +{ + + //x + extern __shared__ Real_type shared[]; + Real_type* pxsum = (Real_type*)&shared[ 0 * blockDim.x ]; + Real_type* pxmin = (Real_type*)&shared[ 1 * blockDim.x ]; + Real_type* pxmax = (Real_type*)&shared[ 2 * blockDim.x ]; + //y + Real_type* pysum = (Real_type*)&shared[ 3 * blockDim.x ]; + Real_type* pymin = (Real_type*)&shared[ 4 * blockDim.x ]; + Real_type* pymax = (Real_type*)&shared[ 5 * blockDim.x ]; + + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + + //x + pxsum[ threadIdx.x ] = 0.0; + pxmin[ threadIdx.x ] = std::numeric_limits::max(); + pxmax[ threadIdx.x ] = std::numeric_limits::min(); + //y + pysum[ threadIdx.x ] = 0.0; + pymin[ threadIdx.x ] = std::numeric_limits::max(); + pymax[ threadIdx.x ] = std::numeric_limits::min(); + + + for ( ; i < iend ; i += gridDim.x * blockDim.x ) { + //x + pxsum[ threadIdx.x ] += x[ i ]; + pxmin[ threadIdx.x ] = RAJA_MIN( pxmin[ threadIdx.x ], x[ i ] ); + pxmax[ threadIdx.x ] = RAJA_MAX( pxmax[ threadIdx.x ], x[ i ] ); + //y + pysum[ threadIdx.x ] += y[ i ]; + pymin[ threadIdx.x ] = RAJA_MIN( pymin[ threadIdx.x ], y[ i ] ); + pymax[ threadIdx.x ] = RAJA_MAX( pymax[ threadIdx.x ], y[ i ] ); + + } + __syncthreads(); + + for ( i = blockDim.x / 2; i > 0; i /= 2 ) { + if ( threadIdx.x < i ) { + //x + pxsum[ threadIdx.x ] += pxsum[ threadIdx.x + i ]; + pxmin[ threadIdx.x ] = RAJA_MIN( pxmin[ threadIdx.x ], pxmin[ threadIdx.x + i ] ); + pxmax[ threadIdx.x ] = RAJA_MAX( pxmax[ threadIdx.x ], pxmax[ threadIdx.x + i ] ); + //y + pysum[ threadIdx.x ] += pysum[ threadIdx.x + i ]; + pymin[ threadIdx.x ] = RAJA_MIN( pymin[ threadIdx.x ], pymin[ threadIdx.x + i ] ); + pymax[ threadIdx.x ] = RAJA_MAX( pymax[ threadIdx.x ], pymax[ threadIdx.x + i ] ); + + } + __syncthreads(); + } + +// serialized access to shared data; + if ( threadIdx.x == 0 ) { + 
RAJA::atomicAdd( xcenter, pxsum[ 0 ] / (Real_type)iend); + RAJA::atomicMin( xmin, pxmin[ 0 ] ); + RAJA::atomicMax( xmax, pxmax[ 0 ] ); + + RAJA::atomicAdd( ycenter, pysum[ 0 ] / (Real_type)iend); + RAJA::atomicMin( ymin, pymin[ 0 ] ); + RAJA::atomicMax( ymax, pymax[ 0 ] ); + } +} + + void REDUCE_STRUCT::runCudaVariant(VariantID vid) { - RAJA_UNUSED_VAR(vid); + const Index_type run_reps = 1; + const Index_type ibegin = 0; + const Index_type iend = 101; + + REDUCE_STRUCT_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + REDUCE_STRUCT_DATA_SETUP_CUDA; + + //for (int i=0;i>> + (particles.x, particles.y, + mem, mem+1,mem+2, //xcenter,xmin,xmax + mem+3,mem+4,mem+5, //ycenter,ymin,ymax + particles.N+1); + cudaErrchk( cudaGetLastError() ); + + Real_type lmem[6]; + Real_ptr plmem = &lmem[0]; + getCudaDeviceData(plmem, mem, 6); + + particles.SetCenter(lmem[0],lmem[3]); + particles.SetXMin(lmem[1]); + particles.SetXMax(lmem[2]); + particles.SetYMin(lmem[4]); + particles.SetYMax(lmem[5]); + + } + stopTimer(); + + REDUCE_STRUCT_DATA_TEARDOWN_CUDA; + + deallocCudaDeviceData(mem); + + } else if ( vid == RAJA_CUDA ) { + + REDUCE_STRUCT_DATA_SETUP_CUDA; + + for (int i=0;i xsum; + RAJA::ReduceMin xmin; + RAJA::ReduceMax xmax; + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, particles.N+1), [=] __device__ (Index_type i) { + REDUCE_STRUCT_BODY_RAJA; + }); + + + particles.SetCenter(static_cast(xsum.get()/(particles.N+1)),0.0); + particles.SetXMin(static_cast(xmin.get())); + particles.SetXMax(static_cast(xmax.get())); + + } + stopTimer(); + + REDUCE_STRUCT_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n REDUCE_STRUCT : Unknown CUDA variant id = " << vid << std::endl; + } } } // end namespace basic diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index 1a7b8d092..4873efb10 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -116,10 +116,10 @@ void REDUCE_STRUCT::runHipVariant(VariantID vid) REDUCE_STRUCT_DATA_SETUP_HIP; - for (int i=0;i(xmin.get())); particles.SetXMax(static_cast(xmax.get())); - //printf("x center = %f\n", particles.GetCenter()[0]); - //printf("x min = %f\n", particles.GetXMin()); - //printf("x max = %f\n", particles.GetXMax()); - } stopTimer(); diff --git a/src/basic/REDUCE_STRUCT.hpp b/src/basic/REDUCE_STRUCT.hpp index 54b34d29a..6858cfda1 100644 --- a/src/basic/REDUCE_STRUCT.hpp +++ b/src/basic/REDUCE_STRUCT.hpp @@ -21,8 +21,8 @@ /// /// particles.xcenter += xsum; /// particles.xcenter /= particles.N -/// particles.xmin = RAJA_MIN(m_xmin, xmin); -/// particles.xmax = RAJA_MAX(m_xmax, xmax); +/// particles.xmin = xmin; +/// particles.xmax = xmax; /// /// RAJA_MIN/MAX are macros that do what you would expect. 
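/// For example, with x = {0.0, 1.0, 2.0, 3.0} and particles.N = 4, the loop
/// above yields xsum = 6.0, xmin = 0.0, xmax = 3.0, giving
/// particles.xcenter = xsum / particles.N = 1.5 (illustrative values only).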
/// @@ -41,13 +41,16 @@ Real_type dx = Lx/(Real_type)(particles.N); \ Real_type dy = Ly/(Real_type)(particles.N); \ Real_type DX = dx*(particles.N-1); \ - Real_type DY = dy*(particles.N-1); + Real_type DY = dy*(particles.N-1); \ + for (int i=0;iymax=val;}; //results private: - Real_type center[2]; + Real_type center[2] = {0.0,0.0}; Real_type xmin, xmax; Real_type ymin, ymax; }; From 83971c39e4c7e40d876bc9a881a29614c0a33689 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 3 Sep 2021 21:28:10 -0500 Subject: [PATCH 083/392] adding x direction sequential --- src/basic/REDUCE_STRUCT-Cuda.cpp | 8 +-- src/basic/REDUCE_STRUCT-Hip.cpp | 8 +-- src/basic/REDUCE_STRUCT-Seq.cpp | 102 ++++++++++++++++++++++++++++++- src/basic/REDUCE_STRUCT.hpp | 7 +-- 4 files changed, 112 insertions(+), 13 deletions(-) diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index ccc064934..47e03a96c 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -116,10 +116,10 @@ void REDUCE_STRUCT::runCudaVariant(VariantID vid) REDUCE_STRUCT_DATA_SETUP_CUDA; - //for (int i=0;i Real_type { + return particles.x[i]; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type xsum = 0.0; + Real_type xmin = 0.0; + Real_type xmax = 0.0; + + for (Index_type i = ibegin; i < iend; ++i ) { + xsum += init_struct_base_lam(i); + xmin = RAJA_MIN(xmin, init_struct_base_lam(i)); + xmax = RAJA_MAX(xmax, init_struct_base_lam(i)); + } + particles.SetCenter(xsum/particles.N,0.0); + particles.SetXMin(xmin); + particles.SetXMax(xmax); + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + //startTimer(); + //for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + // RAJA::ReduceSum vsum(m_vsum_init); + // RAJA::ReduceMin vmin(m_vmin_init); + // RAJA::ReduceMax vmax(m_vmax_init); + + // RAJA::forall( + // RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + // REDUCE3_INT_BODY_RAJA; + // }); + + // m_vsum += static_cast(vsum.get()); + // m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); + // m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); + + //} + //stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + std::cout << "\n REDUCE_STRUCT : Unknown variant id = " << vid << std::endl; + } + + } + } } // end namespace basic diff --git a/src/basic/REDUCE_STRUCT.hpp b/src/basic/REDUCE_STRUCT.hpp index 6858cfda1..775ab282d 100644 --- a/src/basic/REDUCE_STRUCT.hpp +++ b/src/basic/REDUCE_STRUCT.hpp @@ -42,10 +42,8 @@ Real_type dy = Ly/(Real_type)(particles.N); \ Real_type DX = dx*(particles.N-1); \ Real_type DY = dy*(particles.N-1); \ - for (int i=0;i Date: Sat, 4 Sep 2021 13:32:52 -0500 Subject: [PATCH 084/392] adding x direction to reduce struct test omp variants --- src/basic/REDUCE_STRUCT-OMP.cpp | 98 +++++++++++++++++++++++++++ src/basic/REDUCE_STRUCT-OMPTarget.cpp | 78 ++++++++++++++++++++- src/basic/REDUCE_STRUCT-Seq.cpp | 29 ++++---- 3 files changed, 189 insertions(+), 16 deletions(-) diff --git a/src/basic/REDUCE_STRUCT-OMP.cpp b/src/basic/REDUCE_STRUCT-OMP.cpp index dbafb9abd..3c9385724 100644 --- a/src/basic/REDUCE_STRUCT-OMP.cpp +++ b/src/basic/REDUCE_STRUCT-OMP.cpp @@ -21,7 +21,105 @@ namespace basic void REDUCE_STRUCT::runOpenMPVariant(VariantID vid) { +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + REDUCE_STRUCT_DATA_SETUP; + + switch ( vid ) { + + case Base_OpenMP : { + + 
startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type xsum = 0.0; + Real_type xmin = 0.0; + Real_type xmax = 0.0; + + #pragma omp parallel for reduction(+:xsum), \ + reduction(min:xmin), \ + reduction(max:xmax) + for (Index_type i = ibegin; i < iend; ++i ) { + REDUCE_STRUCT_BODY; + } + particles.SetCenter(xsum/particles.N,0.0); + particles.SetXMin(xmin); + particles.SetXMax(xmax); + + } + stopTimer(); + + break; + } + + case Lambda_OpenMP : { + + auto reduce_struct_base_lam = [=](Index_type i) -> Real_type { + return vec[i]; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type xsum = 0.0; + Real_type xmin = 0.0; + Real_type xmax = 0.0; + + #pragma omp parallel for reduction(+:xsum), \ + reduction(min:xmin), \ + reduction(max:xmax) + + for (Index_type i = ibegin; i < iend; ++i ) { + xsum += init_struct_base_lam(i); + xmin = RAJA_MIN(xmin, init_struct_base_lam(i)); + xmax = RAJA_MAX(xmax, init_struct_base_lam(i)); + } + particles.SetCenter(xsum/particles.N,0.0); + particles.SetXMin(xmin); + particles.SetXMax(xmax); + + stopTimer(); + + break; + } + + case RAJA_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum xsum; + RAJA::ReduceMin xmin; + RAJA::ReduceMax xmax; + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + REDUCE_REAL_BODY_RAJA; + }); + + + particles.SetCenter(static_cast(xsum.get()/(particles.N+1)),0.0); + particles.SetXMin(static_cast(xmin.get())); + particles.SetXMax(static_cast(xmax.get())); + } + stopTimer(); + + break; + } + + default : { + std::cout << "\n REDUCE_STRUCT : Unknown variant id = " << vid << std::endl; + } + + } + +#else RAJA_UNUSED_VAR(vid); +#endif } } // end namespace basic diff --git a/src/basic/REDUCE_STRUCT-OMPTarget.cpp b/src/basic/REDUCE_STRUCT-OMPTarget.cpp index d01ef879e..e6031fdcf 100644 --- a/src/basic/REDUCE_STRUCT-OMPTarget.cpp +++ b/src/basic/REDUCE_STRUCT-OMPTarget.cpp @@ -26,9 +26,85 @@ namespace basic // const size_t threads_per_team = 256; +#define REDUCE_STRUCT_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + allocAndInitHipDeviceData(particles.x, m_x, particles.N, did, hid); \ + allocAndInitHipDeviceData(particles.y, m_y, particles.N, did, hid); + +#define REDUCE_STRUCT_DATA_TEARDOWN_OMP_TARGET \ + deallocHipDeviceData(particles.x); \ + deallocHipDeviceData(particles.y); \ + + void REDUCE_STRUCT::runOpenMPTargetVariant(VariantID vid) { - RAJA_UNUSED_VAR(vid); + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + REDUCE_STRUCT_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + REDUCE_STRUCT_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type xsum = 0.0; + Real_type xmin = 0.0; + Real_type xmax = 0.0; + + #pragma omp target is_device_ptr(vec) device( did ) map(tofrom:vsum, vmin, vmax) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static,1) \ + reduction(+:xsum) \ + reduction(min:xmin) \ + reduction(max:xmax) + for (Index_type i = ibegin; i < iend; ++i ) { + REDUCE_STRUCT_BODY; + } + + particles.SetCenter(xsum/particles.N,0.0); + particles.SetXMin(xmin); + particles.SetXMax(xmax); + + + } + stopTimer(); + + REDUCE_STRUCT_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + REDUCE_STRUCT_DATA_SETUP_OMP_TARGET; + + startTimer(); + for 
(RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum xsum; + RAJA::ReduceMin xmin; + RAJA::ReduceMax xmax; + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + REDUCE_STRUCT_BODY_RAJA; + }); + + particles.SetCenter(static_cast(xsum.get()/(particles.N+1)),0.0); + particles.SetXMin(static_cast(xmin.get())); + particles.SetXMax(static_cast(xmax.get())); + + } + stopTimer(); + + REDUCE_STRUCT_DATA_TEARDOWN_OMP_TARGET; + + } else { + std::cout << "\n REDUCE_STRUCT : Unknown OMP Target variant id = " << vid << std::endl; + } } } // end namespace basic diff --git a/src/basic/REDUCE_STRUCT-Seq.cpp b/src/basic/REDUCE_STRUCT-Seq.cpp index ce83675b1..f23e52511 100644 --- a/src/basic/REDUCE_STRUCT-Seq.cpp +++ b/src/basic/REDUCE_STRUCT-Seq.cpp @@ -93,24 +93,23 @@ void REDUCE_STRUCT::runSeqVariant(VariantID vid) case RAJA_Seq : { - //startTimer(); - //for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - // RAJA::ReduceSum vsum(m_vsum_init); - // RAJA::ReduceMin vmin(m_vmin_init); - // RAJA::ReduceMax vmax(m_vmax_init); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - // RAJA::forall( - // RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - // REDUCE3_INT_BODY_RAJA; - // }); + RAJA::ReduceSum xsum; + RAJA::ReduceMin xmin; + RAJA::ReduceMax xmax; - // m_vsum += static_cast(vsum.get()); - // m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); - // m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + REDUCE_STRUCT_BODY_RAJA; + }); - //} - //stopTimer(); + particles.SetCenter(static_cast(xsum.get()/(particles.N+1)),0.0); + particles.SetXMin(static_cast(xmin.get())); + particles.SetXMax(static_cast(xmax.get())); + } + stopTimer(); break; } From 0e91c56ab3858275b3110fcfc15f09d42b210440 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Tue, 7 Sep 2021 12:32:08 -0500 Subject: [PATCH 085/392] cleaning up struct reduction test --- src/basic/REDUCE_STRUCT.cpp | 15 +++++++++++---- src/basic/REDUCE_STRUCT.hpp | 37 +++++++++++++++++++++++++++---------- 2 files changed, 38 insertions(+), 14 deletions(-) diff --git a/src/basic/REDUCE_STRUCT.cpp b/src/basic/REDUCE_STRUCT.cpp index 274c79b13..ff69bdfd5 100644 --- a/src/basic/REDUCE_STRUCT.cpp +++ b/src/basic/REDUCE_STRUCT.cpp @@ -23,18 +23,18 @@ namespace basic REDUCE_STRUCT::REDUCE_STRUCT(const RunParams& params) : KernelBase(rajaperf::Basic_REDUCE_STRUCT, params) { - setDefaultProblemSize(100); + setDefaultProblemSize(101); //setDefaultReps(5000); // Set reps to low value until we resolve RAJA omp-target // reduction performance issues setDefaultReps(1); - setActualProblemSize( 100 ); + setActualProblemSize( 101 ); +// setActualProblemSize( getTargetProblemSize() ); setItsPerRep( 1 ); setKernelsPerRep(1); - setBytesPerRep( (6*sizeof(Real_type) + 6*sizeof(Real_type)) + - (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() ); + setBytesPerRep( 6*sizeof(Real_type) + getActualProblemSize()); setFLOPsPerRep(1 * getActualProblemSize() + 1); setUsesFeature(Forall); @@ -70,6 +70,13 @@ void REDUCE_STRUCT::setUp(VariantID vid) void REDUCE_STRUCT::updateChecksum(VariantID vid) { + checksum[vid] += m_particles.GetCenter()[0]; + checksum[vid] += m_particles.GetXMin(); + checksum[vid] += m_particles.GetXMax(); + checksum[vid] += m_particles.GetCenter()[1]; + checksum[vid] += m_particles.GetYMin(); + checksum[vid] += m_particles.GetYMax(); + return; } diff --git a/src/basic/REDUCE_STRUCT.hpp 
b/src/basic/REDUCE_STRUCT.hpp index 775ab282d..cefe5cfa7 100644 --- a/src/basic/REDUCE_STRUCT.hpp +++ b/src/basic/REDUCE_STRUCT.hpp @@ -14,15 +14,20 @@ /// Real_type xmax = 0.0; /// /// for (Index_type i = ibegin; i < iend; ++i ) { -/// xsum += x[i] ; -/// xmin = RAJA_MIN(xmin, x[i]) ; -/// xmax = RAJA_MAX(xmax, x[i]) ; +/// xsum += x[i] ; ysum += y[i] ; +/// xmin = RAJA_MIN(xmin, x[i]) ; xmax = RAJA_MAX(xmax, x[i]) ; +/// ymin = RAJA_MIN(ymin, y[i]) ; ymax = RAJA_MAX(ymax, y[i]) ; /// } /// /// particles.xcenter += xsum; /// particles.xcenter /= particles.N /// particles.xmin = xmin; /// particles.xmax = xmax; +/// particles.ycenter += ysum; +/// particles.ycenter /= particles.N +/// particles.ymin = ymin; +/// particles.ymax = ymax; + /// /// RAJA_MIN/MAX are macros that do what you would expect. /// @@ -33,7 +38,7 @@ #define REDUCE_STRUCT_DATA_SETUP \ particles_t particles; \ - particles.N = 100; \ + particles.N = getActualProblemSize(); \ Real_type X_MIN = 0.0, X_MAX = 100.0; \ Real_type Y_MIN = 0.0, Y_MAX = 50.0; \ Real_type Lx = (X_MAX) - (X_MIN); \ @@ -43,18 +48,27 @@ Real_type DX = dx*(particles.N-1); \ Real_type DY = dy*(particles.N-1); \ particles.x = m_x; \ - particles.y = m_y; + particles.y = m_y; \ + for (int i=0;ixmin=val;}; void SetXMax(Real_type val){this->xmax=val;}; void SetYMin(Real_type val){this->ymin=val;}; - void SetYMax(Real_type val){this->ymax=val;}; + void SetYMax(Real_type val){this->ymax=val;}; + //results private: Real_type center[2] = {0.0,0.0}; Real_type xmin, xmax; Real_type ymin, ymax; }; + + Real_ptr m_x; Real_ptr m_y; + particles_t m_particles; }; } // end namespace basic From 3c81331743c58fd21762bf790dfc3d36d83dcc2b Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Tue, 7 Sep 2021 12:48:01 -0500 Subject: [PATCH 086/392] fixing bug in cuda reduction struct test --- src/basic/REDUCE_STRUCT-Cuda.cpp | 39 ++++++++++++-------------------- 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index 47e03a96c..453ed8532 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -30,6 +30,10 @@ namespace basic #define REDUCE_STRUCT_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(particles.x, m_x, particles.N); \ allocAndInitCudaDeviceData(particles.y, m_y, particles.N); \ + for (int i=0;i>> (particles.x, particles.y, @@ -146,7 +144,7 @@ void REDUCE_STRUCT::runCudaVariant(VariantID vid) particles.SetXMax(lmem[2]); particles.SetYMin(lmem[4]); particles.SetYMax(lmem[5]); - + m_particles=particles; } stopTimer(); @@ -158,29 +156,22 @@ void REDUCE_STRUCT::runCudaVariant(VariantID vid) REDUCE_STRUCT_DATA_SETUP_CUDA; - for (int i=0;i xsum; - RAJA::ReduceMin xmin; - RAJA::ReduceMax xmax; + RAJA::ReduceSum xsum, ysum; + RAJA::ReduceMin xmin, ymin; + RAJA::ReduceMax xmax, ymax; RAJA::forall< RAJA::cuda_exec >( - RAJA::RangeSegment(ibegin, particles.N+1), [=] __device__ (Index_type i) { + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { REDUCE_STRUCT_BODY_RAJA; }); - - particles.SetCenter(static_cast(xsum.get()/(particles.N+1)),0.0); - particles.SetXMin(static_cast(xmin.get())); - particles.SetXMax(static_cast(xmax.get())); - + particles.SetCenter(static_cast(xsum.get()/(particles.N)),ysum.get()/(particles.N)); + particles.SetXMin(static_cast(xmin.get())); particles.SetXMax(static_cast(xmax.get())); + particles.SetYMin(static_cast(ymin.get())); particles.SetYMax(static_cast(ymax.get())); + m_particles=particles; } stopTimer(); From 
f26c6c4aed378e96052604f5e0c027e46ad963fc Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Tue, 7 Sep 2021 13:06:14 -0700 Subject: [PATCH 087/392] use threadIdx.x rather than legacy HIP thread ids --- src/apps/MASS3DPA-Hip.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/apps/MASS3DPA-Hip.cpp b/src/apps/MASS3DPA-Hip.cpp index a53fe3cf5..015b9ea5e 100644 --- a/src/apps/MASS3DPA-Hip.cpp +++ b/src/apps/MASS3DPA-Hip.cpp @@ -42,7 +42,7 @@ namespace apps { #define RAJA_UNROLL(N) #endif #define FOREACH_THREAD(i, k, N) \ - for(int i=hipThreadIdx_ ##k; i Date: Tue, 7 Sep 2021 15:51:30 -0500 Subject: [PATCH 088/392] updating rep count and fixing bug in reduce struct test --- src/basic/REDUCE_STRUCT-Cuda.cpp | 8 +-- src/basic/REDUCE_STRUCT-Hip.cpp | 41 +++++++--------- src/basic/REDUCE_STRUCT-OMP.cpp | 71 ++++++++++++++++----------- src/basic/REDUCE_STRUCT-OMPTarget.cpp | 36 +++++++------- src/basic/REDUCE_STRUCT-Seq.cpp | 69 ++++++++++++-------------- src/basic/REDUCE_STRUCT.cpp | 4 +- src/basic/REDUCE_STRUCT.hpp | 4 +- 7 files changed, 116 insertions(+), 117 deletions(-) diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index 453ed8532..fea9b09a9 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -132,7 +132,7 @@ void REDUCE_STRUCT::runCudaVariant(VariantID vid) (particles.x, particles.y, mem, mem+1,mem+2, //xcenter,xmin,xmax mem+3,mem+4,mem+5, //ycenter,ymin,ymax - particles.N+1); + particles.N); cudaErrchk( cudaGetLastError() ); Real_type lmem[6]; @@ -159,9 +159,9 @@ void REDUCE_STRUCT::runCudaVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum xsum, ysum; - RAJA::ReduceMin xmin, ymin; - RAJA::ReduceMax xmax, ymax; + RAJA::ReduceSum xsum=0.0, ysum=0.0; + RAJA::ReduceMin xmin=0.0, ymin=0.0; + RAJA::ReduceMax xmax=0.0, ymax=0.0; RAJA::forall< RAJA::cuda_exec >( RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index ecae364c9..9ff7cd22b 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -30,6 +30,10 @@ namespace basic #define REDUCE_STRUCT_DATA_SETUP_HIP \ allocAndInitHipDeviceData(particles.x, m_x, particles.N); \ allocAndInitHipDeviceData(particles.y, m_y, particles.N); \ + for (int i=0;i xsum; - RAJA::ReduceMin xmin; - RAJA::ReduceMax xmax; + RAJA::ReduceSum xsum(0.0), ysum(0.0); + RAJA::ReduceMin xmin(0.0), ymin(0.0); + RAJA::ReduceMax xmax(0.0), ymax(0.0); RAJA::forall< RAJA::hip_exec >( - RAJA::RangeSegment(ibegin, particles.N+1), [=] __device__ (Index_type i) { + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { REDUCE_STRUCT_BODY_RAJA; }); - particles.SetCenter(static_cast(xsum.get()/(particles.N+1)),0.0); - particles.SetXMin(static_cast(xmin.get())); - particles.SetXMax(static_cast(xmax.get())); + particles.SetCenter(static_cast(xsum.get()/(particles.N)),ysum.get()/(particles.N)); + particles.SetXMin(static_cast(xmin.get())); particles.SetXMax(static_cast(xmax.get())); + particles.SetYMin(static_cast(ymin.get())); particles.SetYMax(static_cast(ymax.get())); + m_particles=particles; } stopTimer(); diff --git a/src/basic/REDUCE_STRUCT-OMP.cpp b/src/basic/REDUCE_STRUCT-OMP.cpp index 3c9385724..27b6b52f3 100644 --- a/src/basic/REDUCE_STRUCT-OMP.cpp +++ b/src/basic/REDUCE_STRUCT-OMP.cpp @@ -36,20 +36,23 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; 
irep < run_reps; ++irep) { - Real_type xsum = 0.0; - Real_type xmin = 0.0; - Real_type xmax = 0.0; + Real_type xsum = 0.0; Real_type ysum = 0.0; + Real_type xmin = 0.0; Real_type ymin = 0.0; + Real_type xmax = 0.0; Real_type ymax = 0.0; #pragma omp parallel for reduction(+:xsum), \ reduction(min:xmin), \ - reduction(max:xmax) + reduction(max:xmax), \ + reduction(+:ysum), \ + reduction(min:ymin), \ + reduction(max:ymax) for (Index_type i = ibegin; i < iend; ++i ) { REDUCE_STRUCT_BODY; } - particles.SetCenter(xsum/particles.N,0.0); - particles.SetXMin(xmin); - particles.SetXMax(xmax); - + particles.SetCenter(xsum/particles.N,ysum/particles.N); + particles.SetXMin(xmin); particles.SetXMax(xmax); + particles.SetYMin(ymin); particles.SetYMax(ymax); + m_particles=particles; } stopTimer(); @@ -58,30 +61,38 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid) case Lambda_OpenMP : { - auto reduce_struct_base_lam = [=](Index_type i) -> Real_type { - return vec[i]; + auto reduce_struct_x_base_lam = [=](Index_type i) -> Real_type { + return particles.x[i]; + }; + auto reduce_struct_y_base_lam = [=](Index_type i) -> Real_type { + return particles.y[i]; }; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - Real_type xsum = 0.0; - Real_type xmin = 0.0; - Real_type xmax = 0.0; - + Real_type xsum = 0.0, ysum = 0.0; + Real_type xmin = 0.0, ymin = 0.0; + Real_type xmax = 0.0, ymax = 0.0; #pragma omp parallel for reduction(+:xsum), \ reduction(min:xmin), \ - reduction(max:xmax) + reduction(max:xmax), \ + reduction(+:ysum), \ + reduction(min:ymin), \ + reduction(max:ymax) for (Index_type i = ibegin; i < iend; ++i ) { - xsum += init_struct_base_lam(i); - xmin = RAJA_MIN(xmin, init_struct_base_lam(i)); - xmax = RAJA_MAX(xmax, init_struct_base_lam(i)); - } - particles.SetCenter(xsum/particles.N,0.0); - particles.SetXMin(xmin); - particles.SetXMax(xmax); + xsum += init_struct_x_base_lam(i); + xmin = RAJA_MIN(xmin, init_struct_x_base_lam(i)); + xmax = RAJA_MAX(xmax, init_struct_x_base_lam(i)); + ysum += init_struct_y_base_lam(i); + ymin = RAJA_MIN(ymin, init_struct_y_base_lam(i)); + ymax = RAJA_MAX(ymax, init_struct_y_base_lam(i)); + } + particles.SetCenter(xsum/particles.N,ysum/particles.N); + particles.SetXMin(xmin); particles.SetXMax(xmax); + particles.SetYMin(ymin); particles.SetYMax(ymax); + m_particles=particles; stopTimer(); break; @@ -92,19 +103,19 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum xsum; - RAJA::ReduceMin xmin; - RAJA::ReduceMax xmax; + RAJA::ReduceSum xsum(0.0), ysum(0.0); + RAJA::ReduceMin xmin(0.0), ymin(0.0); + RAJA::ReduceMax xmax(0.0), ymax(0.0); RAJA::forall( RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { REDUCE_REAL_BODY_RAJA; }); - - particles.SetCenter(static_cast(xsum.get()/(particles.N+1)),0.0); - particles.SetXMin(static_cast(xmin.get())); - particles.SetXMax(static_cast(xmax.get())); + particles.SetCenter(static_cast(xsum.get()/(particles.N)),ysum.get()/(particles.N)); + particles.SetXMin(static_cast(xmin.get())); particles.SetYMin(static_cast(xmax.get())); + particles.SetYMax(static_cast(ymax.get())); particles.SetYMax(static_cast(ymax.get())); + m_particles=particles; } stopTimer(); diff --git a/src/basic/REDUCE_STRUCT-OMPTarget.cpp b/src/basic/REDUCE_STRUCT-OMPTarget.cpp index e6031fdcf..fe6d157a9 100644 --- a/src/basic/REDUCE_STRUCT-OMPTarget.cpp +++ b/src/basic/REDUCE_STRUCT-OMPTarget.cpp @@ -53,24 +53,25 @@ void 
REDUCE_STRUCT::runOpenMPTargetVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - Real_type xsum = 0.0; - Real_type xmin = 0.0; - Real_type xmax = 0.0; + Real_type xsum = 0.0, ysum = 0.0; + Real_type xmin = 0.0, ymin = 0.0; + Real_type xmax = 0.0, ymax = 0.0; - #pragma omp target is_device_ptr(vec) device( did ) map(tofrom:vsum, vmin, vmax) + #pragma omp target is_device_ptr(vec) device( did ) map(tofrom:xsum, xmin, xmax, ysum, ymin, ymax) #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static,1) \ reduction(+:xsum) \ reduction(min:xmin) \ - reduction(max:xmax) + reduction(max:xmax), \ + reduction(+:ysum), \ + reduction(min:ymin), \ + reduction(max:ymax) for (Index_type i = ibegin; i < iend; ++i ) { REDUCE_STRUCT_BODY; } - - particles.SetCenter(xsum/particles.N,0.0); - particles.SetXMin(xmin); - particles.SetXMax(xmax); - - + particles.SetCenter(xsum/particles.N,ysum/particles.N); + particles.SetXMin(xmin); particles.SetXMax(xmax); + particles.SetYMin(ymin); particles.SetYMax(ymax); + m_particles=particles; } stopTimer(); @@ -83,9 +84,9 @@ void REDUCE_STRUCT::runOpenMPTargetVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum xsum; - RAJA::ReduceMin xmin; - RAJA::ReduceMax xmax; + RAJA::ReduceSum xsum(0.0), ysum(0.0); + RAJA::ReduceMin xmin(0.0), ymin(0.0); + RAJA::ReduceMax xmax(0.0), ymax(0.0); RAJA::forall>( RAJA::RangeSegment(ibegin, iend), @@ -93,9 +94,10 @@ void REDUCE_STRUCT::runOpenMPTargetVariant(VariantID vid) REDUCE_STRUCT_BODY_RAJA; }); - particles.SetCenter(static_cast(xsum.get()/(particles.N+1)),0.0); - particles.SetXMin(static_cast(xmin.get())); - particles.SetXMax(static_cast(xmax.get())); + particles.SetCenter(static_cast(xsum.get()/(particles.N)),ysum.get()/(particles.N)); + particles.SetXMin(static_cast(xmin.get())); particles.SetYMin(static_cast(xmax.get())); + particles.SetYMax(static_cast(ymax.get())); particles.SetYMax(static_cast(ymax.get())); + m_particles=particles; } stopTimer(); diff --git a/src/basic/REDUCE_STRUCT-Seq.cpp b/src/basic/REDUCE_STRUCT-Seq.cpp index f23e52511..267795d2c 100644 --- a/src/basic/REDUCE_STRUCT-Seq.cpp +++ b/src/basic/REDUCE_STRUCT-Seq.cpp @@ -30,27 +30,21 @@ void REDUCE_STRUCT::runSeqVariant(VariantID vid) switch ( vid ) { case Base_Seq : { - - for (int i=0;i Real_type { + auto init_struct_x_base_lam = [=](Index_type i) -> Real_type { return particles.x[i]; }; - + auto init_struct_y_base_lam = [=](Index_type i) -> Real_type { + return particles.y[i]; + }; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - Real_type xsum = 0.0; - Real_type xmin = 0.0; - Real_type xmax = 0.0; + Real_type xsum = 0.0; Real_type ysum = 0.0; + Real_type xmin = 0.0; Real_type ymin = 0.0; + Real_type xmax = 0.0; Real_type ymax = 0.0; for (Index_type i = ibegin; i < iend; ++i ) { - xsum += init_struct_base_lam(i); - xmin = RAJA_MIN(xmin, init_struct_base_lam(i)); - xmax = RAJA_MAX(xmax, init_struct_base_lam(i)); + xsum += init_struct_x_base_lam(i); + xmin = RAJA_MIN(xmin, init_struct_x_base_lam(i)); + xmax = RAJA_MAX(xmax, init_struct_x_base_lam(i)); + ysum += init_struct_y_base_lam(i); + ymin = RAJA_MIN(ymin, init_struct_y_base_lam(i)); + ymax = RAJA_MAX(ymax, init_struct_y_base_lam(i)); } - particles.SetCenter(xsum/particles.N,0.0); - particles.SetXMin(xmin); - particles.SetXMax(xmax); - + particles.SetCenter(xsum/(particles.N),ysum/(particles.N)); + particles.SetXMin(xmin); particles.SetXMax(xmax); + 
particles.SetYMin(ymin); particles.SetYMax(ymax); + m_particles=particles; } stopTimer(); @@ -92,22 +85,22 @@ void REDUCE_STRUCT::runSeqVariant(VariantID vid) } case RAJA_Seq : { - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum xsum; - RAJA::ReduceMin xmin; - RAJA::ReduceMax xmax; + RAJA::ReduceSum xsum(0.0);, ysum(0.0); + RAJA::ReduceMin xmin(0.0);, ymin(0.0); + RAJA::ReduceMax xmax(0.0);, ymax(0.0); RAJA::forall( RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { REDUCE_STRUCT_BODY_RAJA; }); - particles.SetCenter(static_cast(xsum.get()/(particles.N+1)),0.0); - particles.SetXMin(static_cast(xmin.get())); - particles.SetXMax(static_cast(xmax.get())); + particles.SetCenter(static_cast(xsum.get()/(particles.N)),static_cast(ysum.get()/(particles.N))); + particles.SetXMin(static_cast(xmin.get())); particles.SetXMax(static_cast(xmax.get())); + particles.SetYMin(static_cast(ymin.get())); particles.SetYMax(static_cast(ymax.get())); + m_particles=particles; } stopTimer(); diff --git a/src/basic/REDUCE_STRUCT.cpp b/src/basic/REDUCE_STRUCT.cpp index ff69bdfd5..ed82c64bb 100644 --- a/src/basic/REDUCE_STRUCT.cpp +++ b/src/basic/REDUCE_STRUCT.cpp @@ -27,12 +27,12 @@ REDUCE_STRUCT::REDUCE_STRUCT(const RunParams& params) //setDefaultReps(5000); // Set reps to low value until we resolve RAJA omp-target // reduction performance issues - setDefaultReps(1); + setDefaultReps(50); setActualProblemSize( 101 ); // setActualProblemSize( getTargetProblemSize() ); - setItsPerRep( 1 ); + setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); setBytesPerRep( 6*sizeof(Real_type) + getActualProblemSize()); setFLOPsPerRep(1 * getActualProblemSize() + 1); diff --git a/src/basic/REDUCE_STRUCT.hpp b/src/basic/REDUCE_STRUCT.hpp index cefe5cfa7..49eefa989 100644 --- a/src/basic/REDUCE_STRUCT.hpp +++ b/src/basic/REDUCE_STRUCT.hpp @@ -19,11 +19,11 @@ /// ymin = RAJA_MIN(ymin, y[i]) ; ymax = RAJA_MAX(ymax, y[i]) ; /// } /// -/// particles.xcenter += xsum; +/// particles.xcenter = xsum; /// particles.xcenter /= particles.N /// particles.xmin = xmin; /// particles.xmax = xmax; -/// particles.ycenter += ysum; +/// particles.ycenter = ysum; /// particles.ycenter /= particles.N /// particles.ymin = ymin; /// particles.ymax = ymax; From 0b938ec86cb71a18dd931b5150b6bdccbadd7fd3 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Tue, 7 Sep 2021 16:00:41 -0500 Subject: [PATCH 089/392] removing stray ; from REDUCE_STRUCT-Seq.cpp --- src/basic/REDUCE_STRUCT-Seq.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/basic/REDUCE_STRUCT-Seq.cpp b/src/basic/REDUCE_STRUCT-Seq.cpp index 267795d2c..bfddf1a00 100644 --- a/src/basic/REDUCE_STRUCT-Seq.cpp +++ b/src/basic/REDUCE_STRUCT-Seq.cpp @@ -88,9 +88,9 @@ void REDUCE_STRUCT::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum xsum(0.0);, ysum(0.0); - RAJA::ReduceMin xmin(0.0);, ymin(0.0); - RAJA::ReduceMax xmax(0.0);, ymax(0.0); + RAJA::ReduceSum xsum(0.0), ysum(0.0); + RAJA::ReduceMin xmin(0.0), ymin(0.0); + RAJA::ReduceMax xmax(0.0), ymax(0.0); RAJA::forall( RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { From 150ac7431b066d9c4647c31ccbcbeb506f0481bf Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Tue, 7 Sep 2021 16:18:56 -0500 Subject: [PATCH 090/392] updating rep count in reduce struct test --- src/basic/REDUCE_STRUCT-Cuda.cpp | 2 +- src/basic/REDUCE_STRUCT-Hip.cpp | 2 +- src/basic/REDUCE_STRUCT.cpp | 4 ++-- 3 files changed, 4 insertions(+), 
4 deletions(-) diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index fea9b09a9..43e3c5445 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -110,7 +110,7 @@ __global__ void reduce_struct(Real_ptr x, Real_ptr y, void REDUCE_STRUCT::runCudaVariant(VariantID vid) { - const Index_type run_reps = 1; + const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index 9ff7cd22b..23c0127cd 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -110,7 +110,7 @@ __global__ void reduce_struct(Real_ptr x, Real_ptr y, void REDUCE_STRUCT::runHipVariant(VariantID vid) { - const Index_type run_reps = 1; + const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); diff --git a/src/basic/REDUCE_STRUCT.cpp b/src/basic/REDUCE_STRUCT.cpp index ed82c64bb..f3b7d9b1e 100644 --- a/src/basic/REDUCE_STRUCT.cpp +++ b/src/basic/REDUCE_STRUCT.cpp @@ -23,13 +23,13 @@ namespace basic REDUCE_STRUCT::REDUCE_STRUCT(const RunParams& params) : KernelBase(rajaperf::Basic_REDUCE_STRUCT, params) { - setDefaultProblemSize(101); + setDefaultProblemSize(100); //setDefaultReps(5000); // Set reps to low value until we resolve RAJA omp-target // reduction performance issues setDefaultReps(50); - setActualProblemSize( 101 ); + setActualProblemSize( 100 ); // setActualProblemSize( getTargetProblemSize() ); setItsPerRep( getActualProblemSize() ); From 95851ab2d7ab7ee67cd446c6c3ef3d273c8f5e06 Mon Sep 17 00:00:00 2001 From: Kristi Date: Tue, 7 Sep 2021 15:28:27 -0700 Subject: [PATCH 091/392] Update PULL_REQUEST_TEMPLATE.md --- .github/PULL_REQUEST_TEMPLATE.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index d864de65f..4fb53adaa 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -5,6 +5,3 @@ - Modifies/refactors (class or method) (how?) - Fixes (issue number(s)) - Adds (specific feature) at the request of (project or person) - -*IMPORTANT NOTE! Remember to comment "LGTM" after pushing a commit to trigger the Gitlab CI. 
-Otherwise the CI will not run and the PR will never passed all the required tests!* From 91bd89e7f138a31109bd891cb049b4ad96a69685 Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Tue, 7 Sep 2021 20:24:27 -0500 Subject: [PATCH 092/392] Update REDUCE_STRUCT-Hip.cpp adding "=" to lmem assignment --- src/basic/REDUCE_STRUCT-Hip.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index 23c0127cd..79a18c93e 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -135,7 +135,7 @@ void REDUCE_STRUCT::runHipVariant(VariantID vid) hipErrchk( hipGetLastError() ); - Real_type lmem[6] {0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + Real_type lmem[6]={0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; Real_ptr plmem = &lmem[0]; getHipDeviceData(plmem, mem, 6); From a1a01229ae866d5c08afdf9da29f326749767058 Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Wed, 8 Sep 2021 11:44:44 -0500 Subject: [PATCH 093/392] fixing spacing --- src/basic/REDUCE_STRUCT-Cuda.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index 43e3c5445..f061c2aa7 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -130,20 +130,20 @@ void REDUCE_STRUCT::runCudaVariant(VariantID vid) reduce_struct<<>> (particles.x, particles.y, - mem, mem+1,mem+2, //xcenter,xmin,xmax - mem+3,mem+4,mem+5, //ycenter,ymin,ymax - particles.N); + mem, mem+1,mem+2, //xcenter,xmin,xmax + mem+3,mem+4,mem+5, //ycenter,ymin,ymax + particles.N); cudaErrchk( cudaGetLastError() ); Real_type lmem[6]; Real_ptr plmem = &lmem[0]; getCudaDeviceData(plmem, mem, 6); - particles.SetCenter(lmem[0],lmem[3]); + particles.SetCenter(lmem[0],lmem[3]); particles.SetXMin(lmem[1]); particles.SetXMax(lmem[2]); particles.SetYMin(lmem[4]); - particles.SetYMax(lmem[5]); + particles.SetYMax(lmem[5]); m_particles=particles; } stopTimer(); @@ -164,13 +164,13 @@ void REDUCE_STRUCT::runCudaVariant(VariantID vid) RAJA::ReduceMax xmax=0.0, ymax=0.0; RAJA::forall< RAJA::cuda_exec >( - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - REDUCE_STRUCT_BODY_RAJA; + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + REDUCE_STRUCT_BODY_RAJA; }); particles.SetCenter(static_cast(xsum.get()/(particles.N)),ysum.get()/(particles.N)); - particles.SetXMin(static_cast(xmin.get())); particles.SetXMax(static_cast(xmax.get())); - particles.SetYMin(static_cast(ymin.get())); particles.SetYMax(static_cast(ymax.get())); + particles.SetXMin(static_cast(xmin.get())); particles.SetXMax(static_cast(xmax.get())); + particles.SetYMin(static_cast(ymin.get())); particles.SetYMax(static_cast(ymax.get())); m_particles=particles; } stopTimer(); From bc8db82c6d7c020af4aebe2bd97e3c7c6bebef9e Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Wed, 8 Sep 2021 11:46:17 -0500 Subject: [PATCH 094/392] updating problem size --- src/basic/REDUCE_STRUCT.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/basic/REDUCE_STRUCT.cpp b/src/basic/REDUCE_STRUCT.cpp index f3b7d9b1e..86f9834fb 100644 --- a/src/basic/REDUCE_STRUCT.cpp +++ b/src/basic/REDUCE_STRUCT.cpp @@ -23,14 +23,13 @@ namespace basic REDUCE_STRUCT::REDUCE_STRUCT(const RunParams& params) : KernelBase(rajaperf::Basic_REDUCE_STRUCT, params) { - 
setDefaultProblemSize(100); + setDefaultProblemSize(1000000); //setDefaultReps(5000); // Set reps to low value until we resolve RAJA omp-target // reduction performance issues setDefaultReps(50); - setActualProblemSize( 100 ); -// setActualProblemSize( getTargetProblemSize() ); + setActualProblemSize( getTargetProblemSize() ); setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); From 32a61177dae09a1a93f7223ca91552893eac239b Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Wed, 8 Sep 2021 11:47:32 -0500 Subject: [PATCH 095/392] updating FLOPs calc to be 2 real loads per rep --- src/basic/REDUCE_STRUCT.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic/REDUCE_STRUCT.cpp b/src/basic/REDUCE_STRUCT.cpp index 86f9834fb..04d63a01b 100644 --- a/src/basic/REDUCE_STRUCT.cpp +++ b/src/basic/REDUCE_STRUCT.cpp @@ -34,7 +34,7 @@ REDUCE_STRUCT::REDUCE_STRUCT(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); setBytesPerRep( 6*sizeof(Real_type) + getActualProblemSize()); - setFLOPsPerRep(1 * getActualProblemSize() + 1); + setFLOPsPerRep(2 * getActualProblemSize() + 1); setUsesFeature(Forall); setUsesFeature(Reduction); From 31b39f4df6a595b44272f2b4b0f5e58c2e276b8f Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Wed, 8 Sep 2021 11:51:41 -0500 Subject: [PATCH 096/392] fixing spacing --- src/basic/REDUCE_STRUCT-Hip.cpp | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index 79a18c93e..ffbb37ffe 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -129,9 +129,9 @@ void REDUCE_STRUCT::runHipVariant(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(particles.N, block_size); hipLaunchKernelGGL((reduce_struct), dim3(grid_size), dim3(block_size), 6*sizeof(Real_type)*block_size, 0, particles.x, particles.y, - mem, mem+1,mem+2, //xcenter,xmin,xmax - mem+3,mem+4,mem+5, //ycenter,ymin,ymax - particles.N); + mem, mem+1,mem+2, //xcenter,xmin,xmax + mem+3,mem+4,mem+5, //ycenter,ymin,ymax + particles.N); hipErrchk( hipGetLastError() ); @@ -139,11 +139,11 @@ void REDUCE_STRUCT::runHipVariant(VariantID vid) Real_ptr plmem = &lmem[0]; getHipDeviceData(plmem, mem, 6); - particles.SetCenter(lmem[0],lmem[3]); + particles.SetCenter(lmem[0],lmem[3]); particles.SetXMin(lmem[1]); particles.SetXMax(lmem[2]); particles.SetYMin(lmem[4]); - particles.SetYMax(lmem[5]); + particles.SetYMax(lmem[5]); m_particles=particles; } stopTimer(); @@ -164,16 +164,15 @@ void REDUCE_STRUCT::runHipVariant(VariantID vid) RAJA::ReduceMax xmax(0.0), ymax(0.0); RAJA::forall< RAJA::hip_exec >( - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - REDUCE_STRUCT_BODY_RAJA; + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + REDUCE_STRUCT_BODY_RAJA; }); particles.SetCenter(static_cast(xsum.get()/(particles.N)),ysum.get()/(particles.N)); - particles.SetXMin(static_cast(xmin.get())); particles.SetXMax(static_cast(xmax.get())); - particles.SetYMin(static_cast(ymin.get())); particles.SetYMax(static_cast(ymax.get())); + particles.SetXMin(static_cast(xmin.get())); particles.SetXMax(static_cast(xmax.get())); + particles.SetYMin(static_cast(ymin.get())); particles.SetYMax(static_cast(ymax.get())); m_particles=particles; - } stopTimer(); From 028bb4f5e3f71abcc544e860ca5e5466f25544b6 Mon Sep 17 00:00:00 2001 From: Corbin 
Robeck <13821049+CRobeck@users.noreply.github.com> Date: Wed, 8 Sep 2021 11:52:39 -0500 Subject: [PATCH 097/392] fixing spacing --- src/basic/REDUCE_STRUCT-Seq.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/basic/REDUCE_STRUCT-Seq.cpp b/src/basic/REDUCE_STRUCT-Seq.cpp index bfddf1a00..23c72935f 100644 --- a/src/basic/REDUCE_STRUCT-Seq.cpp +++ b/src/basic/REDUCE_STRUCT-Seq.cpp @@ -93,13 +93,13 @@ void REDUCE_STRUCT::runSeqVariant(VariantID vid) RAJA::ReduceMax xmax(0.0), ymax(0.0); RAJA::forall( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - REDUCE_STRUCT_BODY_RAJA; + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + REDUCE_STRUCT_BODY_RAJA; }); particles.SetCenter(static_cast(xsum.get()/(particles.N)),static_cast(ysum.get()/(particles.N))); - particles.SetXMin(static_cast(xmin.get())); particles.SetXMax(static_cast(xmax.get())); - particles.SetYMin(static_cast(ymin.get())); particles.SetYMax(static_cast(ymax.get())); + particles.SetXMin(static_cast(xmin.get())); particles.SetXMax(static_cast(xmax.get())); + particles.SetYMin(static_cast(ymin.get())); particles.SetYMax(static_cast(ymax.get())); m_particles=particles; } stopTimer(); From d63859cc610a79e48346383d9ffff9609bfe2f57 Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Wed, 8 Sep 2021 13:20:48 -0500 Subject: [PATCH 098/392] updating setBytesPerRep for 2 loads --- src/basic/REDUCE_STRUCT.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic/REDUCE_STRUCT.cpp b/src/basic/REDUCE_STRUCT.cpp index 04d63a01b..7573f103f 100644 --- a/src/basic/REDUCE_STRUCT.cpp +++ b/src/basic/REDUCE_STRUCT.cpp @@ -33,7 +33,7 @@ REDUCE_STRUCT::REDUCE_STRUCT(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( 6*sizeof(Real_type) + getActualProblemSize()); + setBytesPerRep( 2*sizeof(Real_type) + getActualProblemSize()); setFLOPsPerRep(2 * getActualProblemSize() + 1); setUsesFeature(Forall); From 7c5b945098ae6880fa1a6e02220390aa29217dd5 Mon Sep 17 00:00:00 2001 From: Kristi Date: Wed, 8 Sep 2021 11:49:38 -0700 Subject: [PATCH 099/392] Update .gitlab/corona-templates.yml with cnone Co-authored-by: Adrien Bernede <51493078+adrienbernede@users.noreply.github.com> --- .gitlab/corona-templates.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/corona-templates.yml b/.gitlab/corona-templates.yml index 697cdb84c..abcc06fbf 100644 --- a/.gitlab/corona-templates.yml +++ b/.gitlab/corona-templates.yml @@ -15,7 +15,7 @@ - shell - corona rules: - - if: '$CI_COMMIT_BRANCH =~ /_qnone/ || $ON_CORONA == "OFF"' #run except if ... + - if: '$CI_COMMIT_BRANCH =~ /_cnone/ || $ON_CORONA == "OFF"' #run except if ... when: never - if: '$CI_JOB_NAME =~ /release_resources/' when: always From 4e8a9252d8f85e3d659ba4ce8a4b6bbbcd25485c Mon Sep 17 00:00:00 2001 From: Kristi Date: Wed, 8 Sep 2021 11:52:07 -0700 Subject: [PATCH 100/392] Update .gitlab/ruby-templates.yml with rnone Co-authored-by: Adrien Bernede <51493078+adrienbernede@users.noreply.github.com> --- .gitlab/ruby-templates.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/ruby-templates.yml b/.gitlab/ruby-templates.yml index ecba90dc6..a047bfb3c 100644 --- a/.gitlab/ruby-templates.yml +++ b/.gitlab/ruby-templates.yml @@ -15,7 +15,7 @@ - shell - ruby rules: - - if: '$CI_COMMIT_BRANCH =~ /_qnone/ || $ON_RUBY == "OFF"' #run except if ... 
+ - if: '$CI_COMMIT_BRANCH =~ /_rnone/ || $ON_RUBY == "OFF"' #run except if ... when: never - if: '$CI_JOB_NAME =~ /release_resources/' when: always From efc1be037499e9e6793d2ba15141af4265240c87 Mon Sep 17 00:00:00 2001 From: Kristi Date: Wed, 8 Sep 2021 11:52:52 -0700 Subject: [PATCH 101/392] Update .gitlab/ruby-templates.yml Co-authored-by: Adrien Bernede <51493078+adrienbernede@users.noreply.github.com> --- .gitlab/ruby-templates.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/ruby-templates.yml b/.gitlab/ruby-templates.yml index a047bfb3c..cdbcd60e8 100644 --- a/.gitlab/ruby-templates.yml +++ b/.gitlab/ruby-templates.yml @@ -34,7 +34,7 @@ allocate_resources (on ruby): ### # In post-build phase, deallocate resources -# Note : make sure this is run even on build phase failure +# Note : make sure this is run even on build phase failure (see "rules:" in ".on_ruby:"). release_resources (on ruby): variables: GIT_STRATEGY: none From f0abc19c8a317abb9a728d6234dfef2a04c6893e Mon Sep 17 00:00:00 2001 From: Kristi Date: Wed, 8 Sep 2021 11:53:00 -0700 Subject: [PATCH 102/392] Update README.md Co-authored-by: Adrien Bernede <51493078+adrienbernede@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d2107473d..23b051f4e 100644 --- a/README.md +++ b/README.md @@ -714,7 +714,7 @@ above. # Continuous Integration -RAJAPerf Suite uses continuous integration to ensure that changes added to the repository are well integrated and tested for compatability with the rest of the existing code base. Our CI tests incude a variety of vetted configurations that run on different LC machines. +RAJAPerf Suite uses continuous integration to ensure that changes added to the repository are well integrated and tested for compatibility with the rest of the existing code base. Our CI tests include a variety of vetted configurations that run on different LC machines. RAJAPerf Suite shares its Gitlab CI workflow with other projects. The documentation is therefore [shared](https://radiuss-ci.readthedocs.io/en/latest/uberenv.html#ci). From 6d767c071ab0225ae8a8347fba85d7ddfb6757c2 Mon Sep 17 00:00:00 2001 From: Kristi Date: Wed, 8 Sep 2021 11:53:20 -0700 Subject: [PATCH 103/392] Update README.md Co-authored-by: Adrien Bernede <51493078+adrienbernede@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 23b051f4e..4231202d2 100644 --- a/README.md +++ b/README.md @@ -716,7 +716,7 @@ above. RAJAPerf Suite uses continuous integration to ensure that changes added to the repository are well integrated and tested for compatibility with the rest of the existing code base. Our CI tests include a variety of vetted configurations that run on different LC machines. -RAJAPerf Suite shares its Gitlab CI workflow with other projects. The documentation is therefore [shared](https://radiuss-ci.readthedocs.io/en/latest/uberenv.html#ci). +RAJAPerf Suite shares its Gitlab CI workflow with other projects. The documentation is therefore [shared](https://radiuss-ci.readthedocs.io/en/latest). 
* * * From e8c4088425914eaf745038b541b2bdb4ea10c125 Mon Sep 17 00:00:00 2001 From: Kristi Date: Wed, 8 Sep 2021 11:54:13 -0700 Subject: [PATCH 104/392] Update scripts/gitlab/build_and_test.sh Co-authored-by: Adrien Bernede <51493078+adrienbernede@users.noreply.github.com> --- scripts/gitlab/build_and_test.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index 01cbb649c..2d386fb99 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -103,10 +103,6 @@ then echo "~ Build Dir: ${build_dir}" echo "~ Project Dir: ${project_dir}" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - echo "" - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - echo "~~~~ ENV ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" echo "~~~~~ Building RAJA PerfSuite" From add6a75ec1dda49b019965434054cebd35f7f077 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Wed, 8 Sep 2021 15:06:07 -0500 Subject: [PATCH 105/392] make sure to reset results values every rep in REDUCE_STRUCT-Hip.cpp --- src/basic/REDUCE_STRUCT-Hip.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index ffbb37ffe..1cf94f10f 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -126,6 +126,7 @@ void REDUCE_STRUCT::runHipVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + hipErrchk(hipMemset(mem, 0.0, 6*sizeof(Real_type))); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(particles.N, block_size); hipLaunchKernelGGL((reduce_struct), dim3(grid_size), dim3(block_size), 6*sizeof(Real_type)*block_size, 0, particles.x, particles.y, From f41196e08374342151cef4de57f4cbac51b8660c Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Wed, 8 Sep 2021 15:38:16 -0500 Subject: [PATCH 106/392] cleaning up reduce struct test set up --- src/basic/REDUCE_STRUCT-Cuda.cpp | 5 +---- src/basic/REDUCE_STRUCT-Hip.cpp | 6 +----- src/basic/REDUCE_STRUCT.cpp | 6 ++++++ src/basic/REDUCE_STRUCT.hpp | 17 +++++------------ 4 files changed, 13 insertions(+), 21 deletions(-) diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index f061c2aa7..8e33a02f4 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -30,10 +30,7 @@ namespace basic #define REDUCE_STRUCT_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(particles.x, m_x, particles.N); \ allocAndInitCudaDeviceData(particles.y, m_y, particles.N); \ - for (int i=0;i Date: Wed, 8 Sep 2021 15:40:12 -0500 Subject: [PATCH 107/392] make sure to reset results values every rep in REDUCE_STRUCT-Cuda.cpp --- src/basic/REDUCE_STRUCT-Cuda.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index 8e33a02f4..82cdb9b7e 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -122,7 +122,7 @@ void REDUCE_STRUCT::runCudaVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - + cudaErrchk(cudaMemset(mem, 0.0, 6*sizeof(Real_type))); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(particles.N, block_size); reduce_struct<<>> @@ -132,7 +132,7 @@ void REDUCE_STRUCT::runCudaVariant(VariantID vid) particles.N); cudaErrchk( cudaGetLastError() ); - Real_type lmem[6]; + Real_type lmem[6]={0.0, 0.0, 0.0, 0.0, 
0.0, 0.0}; Real_ptr plmem = &lmem[0]; getCudaDeviceData(plmem, mem, 6); From a79144d66e573b74ef046e89306dcc9f53ba0635 Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Wed, 8 Sep 2021 15:42:12 -0500 Subject: [PATCH 108/392] updating flops per set to account for divides --- src/basic/REDUCE_STRUCT.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic/REDUCE_STRUCT.cpp b/src/basic/REDUCE_STRUCT.cpp index 5e0cad055..fd1c966d7 100644 --- a/src/basic/REDUCE_STRUCT.cpp +++ b/src/basic/REDUCE_STRUCT.cpp @@ -34,7 +34,7 @@ REDUCE_STRUCT::REDUCE_STRUCT(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); setBytesPerRep( 2*sizeof(Real_type) + getActualProblemSize()); - setFLOPsPerRep(2 * getActualProblemSize() + 1); + setFLOPsPerRep(2 * getActualProblemSize() + 2); setUsesFeature(Forall); setUsesFeature(Reduction); From 10537728c69d6e9292121326406e34ae9d254757 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Thu, 9 Sep 2021 11:52:28 -0500 Subject: [PATCH 109/392] moving reduce struct division outside kernel in Cuda/HIP to be consistent --- src/basic/REDUCE_STRUCT-Cuda.cpp | 10 +++++----- src/basic/REDUCE_STRUCT-Hip.cpp | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index 82cdb9b7e..d3db1fa2c 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -37,8 +37,8 @@ namespace basic deallocCudaDeviceData(particles.y); \ __global__ void reduce_struct(Real_ptr x, Real_ptr y, - Real_ptr xcenter, Real_ptr xmin, Real_ptr xmax, - Real_ptr ycenter, Real_ptr ymin, Real_ptr ymax, + Real_ptr xsum, Real_ptr xmin, Real_ptr xmax, + Real_ptr ysum, Real_ptr ymin, Real_ptr ymax, Index_type iend) { @@ -94,11 +94,11 @@ __global__ void reduce_struct(Real_ptr x, Real_ptr y, // serialized access to shared data; if ( threadIdx.x == 0 ) { - RAJA::atomicAdd( xcenter, pxsum[ 0 ] / (Real_type)iend); + RAJA::atomicAdd( xsum, pxsum[ 0 ] ); RAJA::atomicMin( xmin, pxmin[ 0 ] ); RAJA::atomicMax( xmax, pxmax[ 0 ] ); - RAJA::atomicAdd( ycenter, pysum[ 0 ] / (Real_type)iend); + RAJA::atomicAdd( xsum, pysum[ 0 ] ); RAJA::atomicMin( ymin, pymin[ 0 ] ); RAJA::atomicMax( ymax, pymax[ 0 ] ); } @@ -136,7 +136,7 @@ void REDUCE_STRUCT::runCudaVariant(VariantID vid) Real_ptr plmem = &lmem[0]; getCudaDeviceData(plmem, mem, 6); - particles.SetCenter(lmem[0],lmem[3]); + particles.SetCenter(lmem[0]/particles.N,lmem[3]/particles.N); particles.SetXMin(lmem[1]); particles.SetXMax(lmem[2]); particles.SetYMin(lmem[4]); diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index 09863a0e1..6884844b2 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -36,8 +36,8 @@ namespace basic deallocHipDeviceData(particles.y); \ __global__ void reduce_struct(Real_ptr x, Real_ptr y, - Real_ptr xcenter, Real_ptr xmin, Real_ptr xmax, - Real_ptr ycenter, Real_ptr ymin, Real_ptr ymax, + Real_ptr xsum, Real_ptr xmin, Real_ptr xmax, + Real_ptr ysum, Real_ptr ymin, Real_ptr ymax, Index_type iend) { @@ -93,11 +93,11 @@ __global__ void reduce_struct(Real_ptr x, Real_ptr y, // serialized access to shared data; if ( threadIdx.x == 0 ) { - RAJA::atomicAdd( xcenter, pxsum[ 0 ] / (Real_type)iend); + RAJA::atomicAdd( xsum, pxsum[ 0 ] ); RAJA::atomicMin( xmin, pxmin[ 0 ] ); RAJA::atomicMax( xmax, pxmax[ 0 ] ); - RAJA::atomicAdd( ycenter, pysum[ 0 ] / (Real_type)iend); + RAJA::atomicAdd( ysum, 
pysum[ 0 ] ); RAJA::atomicMin( ymin, pymin[ 0 ] ); RAJA::atomicMax( ymax, pymax[ 0 ] ); } @@ -136,7 +136,7 @@ void REDUCE_STRUCT::runHipVariant(VariantID vid) Real_ptr plmem = &lmem[0]; getHipDeviceData(plmem, mem, 6); - particles.SetCenter(lmem[0],lmem[3]); + particles.SetCenter(lmem[0]/particles.N,lmem[3]/particles.N); particles.SetXMin(lmem[1]); particles.SetXMax(lmem[2]); particles.SetYMin(lmem[4]); From 07cdf1c71b5eb0203d4fa0b65b79d9432faa720b Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Thu, 9 Sep 2021 11:53:42 -0500 Subject: [PATCH 110/392] fixing setBytesPerRep calc --- src/basic/REDUCE_STRUCT.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/basic/REDUCE_STRUCT.cpp b/src/basic/REDUCE_STRUCT.cpp index fd1c966d7..f38458c92 100644 --- a/src/basic/REDUCE_STRUCT.cpp +++ b/src/basic/REDUCE_STRUCT.cpp @@ -33,8 +33,9 @@ REDUCE_STRUCT::REDUCE_STRUCT(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( 2*sizeof(Real_type) + getActualProblemSize()); + setBytesPerRep( 6*sizeof(Real_type) + 2*sizeof(Real_type)*getActualProblemSize()); setFLOPsPerRep(2 * getActualProblemSize() + 2); + setUsesFeature(Forall); setUsesFeature(Reduction); From 61dc6e641016f5a447471967cd18a6daa2e1bdbb Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Mon, 20 Sep 2021 15:55:16 -0700 Subject: [PATCH 111/392] initial changes to set up a multi-project ci --- .gitlab/lassen-jobs.yml | 1 + scripts/gitlab/build_and_test.sh | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/.gitlab/lassen-jobs.yml b/.gitlab/lassen-jobs.yml index 8554ff1ad..0a3d0ad0a 100644 --- a/.gitlab/lassen-jobs.yml +++ b/.gitlab/lassen-jobs.yml @@ -12,6 +12,7 @@ clang_11_0_0: variables: SPEC: "%clang@11.0.0" + MULTI_PROJECT: "On" extends: .build_and_test_on_lassen clang_11_gcc_8: diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index 2d386fb99..ca6b4f985 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -19,6 +19,7 @@ build_root=${BUILD_ROOT:-""} hostconfig=${HOST_CONFIG:-""} spec=${SPEC:-""} job_unique_id=${CI_JOB_ID:-""} +raja_version=${UPDATE_RAJA:-""} sys_type=${SYS_TYPE:-""} py_env_path=${PYTHON_ENVIRONMENT_PATH:-""} @@ -55,6 +56,11 @@ then prefix_opt="--prefix=${prefix}" fi + if [[ -n ${raja_version} ]] + then + spec="${spec} ^raja@${raja_version}" + fi + python scripts/uberenv/uberenv.py --spec="${spec}" ${prefix_opt} fi @@ -118,6 +124,9 @@ then rm -rf ${build_dir} 2>/dev/null mkdir -p ${build_dir} && cd ${build_dir} + #git checkout "task/kab163/set-up-multi-project-ci" + #git pull + date cmake \ -C ${hostconfig_path} \ From f580b51a99f9396191dd006becd3d8031701b63f Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 22 Sep 2021 16:15:58 -0700 Subject: [PATCH 112/392] Add the ability to exclude kernels Add command line options to exclude kernels, features, and variants --- src/common/Executor.cpp | 233 ++++++++++++++++++++++++++++++++++----- src/common/RunParams.cpp | 166 ++++++++++++++++++++++------ src/common/RunParams.hpp | 43 ++++++-- 3 files changed, 374 insertions(+), 68 deletions(-) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index 17b772e3d..cc32c6cf6 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -63,6 +63,133 @@ void Executor::setupSuite() using KIDset = set; using VIDset = set; + // + // Determine which kernels to exclude from input. 
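  // Illustrative sketch (not part of the patch): the net effect of the exclusion
  // logic added here, combined with the run_kern assembly further below, is
  // roughly the following, where isRequested() is a hypothetical stand-in for
  // the existing --kernels/--features matching:
  //
  //   KIDset exclude_kern;                        // filled from --exclude-kernels / -ek
  //                                               // and --exclude-features / -ef
  //   KIDset run_kern;
  //   for (size_t ik = 0; ik < NumKernels; ++ik) {
  //     KernelID kid = static_cast<KernelID>(ik);
  //     if ( isRequested(kid) &&
  //          exclude_kern.find(kid) == exclude_kern.end() ) {
  //       run_kern.insert(kid);                   // run only kernels that are both
  //     }                                         // requested and not excluded
  //   }
  //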
+ // exclude_kern will be non-duplicated ordered set of IDs of kernel to exclude. + // + const Svector& exclude_kernel_input = run_params.getExcludeKernelInput(); + const Svector& exclude_feature_input = run_params.getExcludeFeatureInput(); + + KIDset exclude_kern; + + if ( !exclude_kernel_input.empty() ) { + + // Make list copy of exclude kernel name input to manipulate for + // processing potential group names and/or kernel names, next + Slist exclude_kern_names(exclude_kernel_input.begin(), exclude_kernel_input.end()); + + // + // Search exclude_kern_names for matching group names. + // groups2exclude will contain names of groups to exclude. + // + Svector groups2exclude; + for (Slist::iterator it = exclude_kern_names.begin(); it != exclude_kern_names.end(); ++it) + { + for (size_t ig = 0; ig < NumGroups; ++ig) { + const string& group_name = getGroupName(static_cast(ig)); + if ( group_name == *it ) { + groups2exclude.push_back(group_name); + } + } + } + + // + // If group name(s) found in exclude_kern_names, assemble kernels in group(s) + // to run and remove those group name(s) from exclude_kern_names list. + // + for (size_t ig = 0; ig < groups2exclude.size(); ++ig) { + const string& gname(groups2exclude[ig]); + + for (size_t ik = 0; ik < NumKernels; ++ik) { + KernelID kid = static_cast(ik); + if ( getFullKernelName(kid).find(gname) != string::npos ) { + exclude_kern.insert(kid); + } + } + + exclude_kern_names.remove(gname); + } + + // + // Look for matching names of individual kernels in remaining exclude_kern_names. + // + // Assemble invalid input for warning message. + // + Svector invalid; + + for (Slist::iterator it = exclude_kern_names.begin(); it != exclude_kern_names.end(); ++it) + { + bool found_it = false; + + for (size_t ik = 0; ik < NumKernels && !found_it; ++ik) { + KernelID kid = static_cast(ik); + if ( getKernelName(kid) == *it || getFullKernelName(kid) == *it ) { + exclude_kern.insert(kid); + found_it = true; + } + } + + if ( !found_it ) invalid.push_back(*it); + } + + run_params.setInvalidExcludeKernelInput(invalid); + + } + + if ( !exclude_feature_input.empty() ) { + + // First, check for invalid exclude_feature input. + // Assemble invalid input for warning message. + // + Svector invalid; + + for (size_t i = 0; i < exclude_feature_input.size(); ++i) { + bool found_it = false; + + for (size_t fid = 0; fid < NumFeatures && !found_it; ++fid) { + FeatureID tfid = static_cast(fid); + if ( getFeatureName(tfid) == exclude_feature_input[i] ) { + found_it = true; + } + } + + if ( !found_it ) invalid.push_back( exclude_feature_input[i] ); + } + run_params.setInvalidExcludeFeatureInput(invalid); + + // + // If feature input is valid, determine which kernels use + // input-specified features and add to set of kernels to run. 
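    // (aside, not part of the patch: in this exclude-features branch the matching
    //  kernel IDs are inserted into exclude_kern below, i.e. they are removed from
    //  the run rather than added to it)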
+ // + if ( run_params.getInvalidExcludeFeatureInput().empty() ) { + + for (size_t i = 0; i < exclude_feature_input.size(); ++i) { + + const string& feature = exclude_feature_input[i]; + + bool found_it = false; + for (size_t fid = 0; fid < NumFeatures && !found_it; ++fid) { + FeatureID tfid = static_cast(fid); + if ( getFeatureName(tfid) == feature ) { + found_it = true; + + for (int kid = 0; kid < NumKernels; ++kid) { + KernelID tkid = static_cast(kid); + KernelBase* kern = getKernelObject(tkid, run_params); + if ( kern->usesFeature(tfid) ) { + exclude_kern.insert( tkid ); + } + delete kern; + } // loop over kernels + + } // if input feature name matches feature id + } // loop over feature ids until name match is found + + } // loop over feature name input + + } // if feature name input is valid + } + // // Determine which kernels to execute from input. // run_kern will be non-duplicated ordered set of IDs of kernel to run. @@ -75,10 +202,13 @@ void Executor::setupSuite() if ( kernel_input.empty() && feature_input.empty() ) { // - // No kernels or fatures specified in input, run them all... + // No kernels or features specified in input, run them all... // - for (size_t ik = 0; ik < NumKernels; ++ik) { - run_kern.insert( static_cast(ik) ); + for (size_t kid = 0; kid < NumKernels; ++kid) { + KernelID tkid = static_cast(kid); + if (exclude_kern.find(tkid) == exclude_kern.end()) { + run_kern.insert( tkid ); + } } } else { @@ -130,7 +260,8 @@ void Executor::setupSuite() for (int kid = 0; kid < NumKernels; ++kid) { KernelID tkid = static_cast(kid); KernelBase* kern = getKernelObject(tkid, run_params); - if ( kern->usesFeature(tfid) ) { + if ( kern->usesFeature(tfid) && + exclude_kern.find(tkid) == exclude_kern.end() ) { run_kern.insert( tkid ); } delete kern; @@ -171,10 +302,11 @@ void Executor::setupSuite() for (size_t ig = 0; ig < groups2run.size(); ++ig) { const string& gname(groups2run[ig]); - for (size_t ik = 0; ik < NumKernels; ++ik) { - KernelID kid = static_cast(ik); - if ( getFullKernelName(kid).find(gname) != string::npos ) { - run_kern.insert(kid); + for (size_t kid = 0; kid < NumKernels; ++kid) { + KernelID tkid = static_cast(kid); + if ( getFullKernelName(tkid).find(gname) != string::npos && + exclude_kern.find(tkid) == exclude_kern.end()) { + run_kern.insert(tkid); } } @@ -192,10 +324,12 @@ void Executor::setupSuite() { bool found_it = false; - for (size_t ik = 0; ik < NumKernels && !found_it; ++ik) { - KernelID kid = static_cast(ik); - if ( getKernelName(kid) == *it || getFullKernelName(kid) == *it ) { - run_kern.insert(kid); + for (size_t kid = 0; kid < NumKernels && !found_it; ++kid) { + KernelID tkid = static_cast(kid); + if ( getKernelName(tkid) == *it || getFullKernelName(tkid) == *it ) { + if (exclude_kern.find(tkid) == exclude_kern.end()) { + run_kern.insert(tkid); + } found_it = true; } } @@ -220,6 +354,44 @@ void Executor::setupSuite() } } + + // + // Determine variants to execute from input. + // run_var will be non-duplicated ordered set of IDs of variants to run. + // + const Svector& exclude_variant_names = run_params.getExcludeVariantInput(); + + VIDset exclude_var; + + if ( !exclude_variant_names.empty() ) { + + // + // Parse input to determine which variants to exclude. + // + // Assemble invalid input for warning message. 
+ // + + Svector invalid; + + for (size_t it = 0; it < exclude_variant_names.size(); ++it) { + bool found_it = false; + + for (VIDset::iterator vid_it = available_var.begin(); + vid_it != available_var.end(); ++vid_it) { + VariantID vid = *vid_it; + if ( getVariantName(vid) == exclude_variant_names[it] ) { + exclude_var.insert(vid); + found_it = true; + } + } + + if ( !found_it ) invalid.push_back(exclude_variant_names[it]); + } + + run_params.setInvalidExcludeVariantInput(invalid); + + } + // // Determine variants to execute from input. // run_var will be non-duplicated ordered set of IDs of variants to run. @@ -237,9 +409,11 @@ void Executor::setupSuite() for (VIDset::iterator vid_it = available_var.begin(); vid_it != available_var.end(); ++vid_it) { VariantID vid = *vid_it; - run_var.insert( vid ); - if ( getVariantName(vid) == run_params.getReferenceVariant() ) { - reference_vid = vid; + if (exclude_var.find(vid) == exclude_var.end()) { + run_var.insert( vid ); + if ( getVariantName(vid) == run_params.getReferenceVariant() ) { + reference_vid = vid; + } } } @@ -271,9 +445,11 @@ void Executor::setupSuite() vid_it != available_var.end(); ++vid_it) { VariantID vid = *vid_it; if ( getVariantName(vid) == variant_names[it] ) { - run_var.insert(vid); - if ( getVariantName(vid) == run_params.getReferenceVariant() ) { - reference_vid = vid; + if (exclude_var.find(vid) == exclude_var.end()) { + run_var.insert(vid); + if ( getVariantName(vid) == run_params.getReferenceVariant() ) { + reference_vid = vid; + } } found_it = true; } @@ -300,11 +476,13 @@ void Executor::setupSuite() // A message will be emitted later so user can sort it out... // - if ( !(run_params.getInvalidKernelInput().empty()) ) { + if ( !(run_params.getInvalidKernelInput().empty()) || + !(run_params.getInvalidExcludeKernelInput().empty()) ) { run_params.setInputState(RunParams::BadInput); - } else if ( !(run_params.getInvalidFeatureInput().empty()) ) { + } else if ( !(run_params.getInvalidFeatureInput().empty()) || + !(run_params.getInvalidExcludeFeatureInput().empty()) ) { run_params.setInputState(RunParams::BadInput); @@ -319,7 +497,8 @@ void Executor::setupSuite() } } - if ( !(run_params.getInvalidVariantInput().empty()) ) { + if ( !(run_params.getInvalidVariantInput().empty()) || + !(run_params.getInvalidExcludeVariantInput().empty()) ) { run_params.setInputState(RunParams::BadInput); @@ -469,7 +648,7 @@ void Executor::writeKernelInfoSummary(ostream& str, bool to_file) const dash_width += itsrep_width + static_cast(sepchr.size()); string kernsrep_head("Kernels/rep"); - Index_type kernsrep_width = + Index_type kernsrep_width = max( static_cast(kernsrep_head.size()), static_cast(4) ); dash_width += kernsrep_width + static_cast(sepchr.size()); @@ -486,7 +665,7 @@ void Executor::writeKernelInfoSummary(ostream& str, bool to_file) const static_cast(frsize) ) + 3; dash_width += flopsrep_width + static_cast(sepchr.size()); - str <getItsPerRep() << sepchr <getKernelsPerRep() << sepchr <getBytesPerRep() - << sepchr <getFLOPsPerRep() + << sepchr <getFLOPsPerRep() << endl; } @@ -529,9 +708,9 @@ void Executor::runSuite() vector warmup_kernels; - warmup_kernels.push_back(new basic::DAXPY(run_params)); - warmup_kernels.push_back(new basic::REDUCE3_INT(run_params)); - warmup_kernels.push_back(new algorithm::SORT(run_params)); + warmup_kernels.push_back(new basic::DAXPY(run_params)); + warmup_kernels.push_back(new basic::REDUCE3_INT(run_params)); + warmup_kernels.push_back(new algorithm::SORT(run_params)); for (size_t ik = 0; ik < 
warmup_kernels.size(); ++ik) { KernelBase* warmup_kernel = warmup_kernels[ik]; diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 115bdea55..e038863c1 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -38,10 +38,16 @@ RunParams::RunParams(int argc, char** argv) reference_variant(), kernel_input(), invalid_kernel_input(), + exclude_kernel_input(), + invalid_exclude_kernel_input(), variant_input(), invalid_variant_input(), + exclude_variant_input(), + invalid_exclude_variant_input(), feature_input(), invalid_feature_input(), + exclude_feature_input(), + invalid_exclude_feature_input(), outdir(), outfile_prefix("RAJAPerf") { @@ -70,19 +76,19 @@ RunParams::~RunParams() */ void RunParams::print(std::ostream& str) const { - str << "\n show_progress = " << show_progress; - str << "\n npasses = " << npasses; + str << "\n show_progress = " << show_progress; + str << "\n npasses = " << npasses; str << "\n rep_fact = " << rep_fact; str << "\n size_meaning = " << SizeMeaningToStr(getSizeMeaning()); str << "\n size = " << size; str << "\n size_factor = " << size_factor; - str << "\n pf_tol = " << pf_tol; - str << "\n checkrun_reps = " << checkrun_reps; - str << "\n reference_variant = " << reference_variant; - str << "\n outdir = " << outdir; - str << "\n outfile_prefix = " << outfile_prefix; + str << "\n pf_tol = " << pf_tol; + str << "\n checkrun_reps = " << checkrun_reps; + str << "\n reference_variant = " << reference_variant; + str << "\n outdir = " << outdir; + str << "\n outfile_prefix = " << outfile_prefix; - str << "\n kernel_input = "; + str << "\n kernel_input = "; for (size_t j = 0; j < kernel_input.size(); ++j) { str << "\n\t" << kernel_input[j]; } @@ -91,15 +97,33 @@ void RunParams::print(std::ostream& str) const str << "\n\t" << invalid_kernel_input[j]; } - str << "\n variant_input = "; + str << "\n exclude_kernel_input = "; + for (size_t j = 0; j < exclude_kernel_input.size(); ++j) { + str << "\n\t" << exclude_kernel_input[j]; + } + str << "\n invalid_exclude_kernel_input = "; + for (size_t j = 0; j < invalid_exclude_kernel_input.size(); ++j) { + str << "\n\t" << invalid_exclude_kernel_input[j]; + } + + str << "\n variant_input = "; for (size_t j = 0; j < variant_input.size(); ++j) { str << "\n\t" << variant_input[j]; } - str << "\n invalid_variant_input = "; + str << "\n invalid_variant_input = "; for (size_t j = 0; j < invalid_variant_input.size(); ++j) { str << "\n\t" << invalid_variant_input[j]; } + str << "\n exclude_variant_input = "; + for (size_t j = 0; j < exclude_variant_input.size(); ++j) { + str << "\n\t" << exclude_variant_input[j]; + } + str << "\n invalid_exclude_variant_input = "; + for (size_t j = 0; j < invalid_exclude_variant_input.size(); ++j) { + str << "\n\t" << invalid_exclude_variant_input[j]; + } + str << "\n feature_input = "; for (size_t j = 0; j < feature_input.size(); ++j) { str << "\n\t" << feature_input[j]; @@ -109,6 +133,15 @@ void RunParams::print(std::ostream& str) const str << "\n\t" << invalid_feature_input[j]; } + str << "\n exclude_feature_input = "; + for (size_t j = 0; j < exclude_feature_input.size(); ++j) { + str << "\n\t" << exclude_feature_input[j]; + } + str << "\n invalid_exclude_feature_input = "; + for (size_t j = 0; j < invalid_exclude_feature_input.size(); ++j) { + str << "\n\t" << invalid_exclude_feature_input[j]; + } + str << std::endl; str.flush(); } @@ -142,14 +175,14 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) } else if ( opt == std::string("--print-kernels") || opt == 
std::string("-pk") ) { - - printFullKernelNames(std::cout); + + printFullKernelNames(std::cout); input_state = InfoRequest; - + } else if ( opt == std::string("--print-variants") || opt == std::string("-pv") ) { - printVariantNames(std::cout); + printVariantNames(std::cout); input_state = InfoRequest; } else if ( opt == std::string("--print-features") || @@ -169,28 +202,28 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) printKernelFeatures(std::cout); input_state = InfoRequest; - + } else if ( opt == std::string("--npasses") ) { i++; - if ( i < argc ) { + if ( i < argc ) { npasses = ::atoi( argv[i] ); } else { std::cout << "\nBad input:" - << " must give --npasses a value for number of passes (int)" - << std::endl; + << " must give --npasses a value for number of passes (int)" + << std::endl; input_state = BadInput; } } else if ( opt == std::string("--repfact") ) { i++; - if ( i < argc ) { + if ( i < argc ) { rep_fact = ::atof( argv[i] ); } else { std::cout << "\nBad input:" - << " must give --rep_fact a value (double)" - << std::endl; + << " must give --rep_fact a value (double)" + << std::endl; input_state = BadInput; } @@ -277,6 +310,22 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) } } + } else if ( opt == std::string("--exclude-kernels") || + opt == std::string("-ek") ) { + + bool done = false; + i++; + while ( i < argc && !done ) { + opt = std::string(argv[i]); + if ( opt.at(0) == '-' ) { + i--; + done = true; + } else { + exclude_kernel_input.push_back(opt); + ++i; + } + } + } else if ( std::string(argv[i]) == std::string("--variants") || std::string(argv[i]) == std::string("-v") ) { @@ -293,6 +342,22 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) } } + } else if ( std::string(argv[i]) == std::string("--exclude-variants") || + std::string(argv[i]) == std::string("-ev") ) { + + bool done = false; + i++; + while ( i < argc && !done ) { + opt = std::string(argv[i]); + if ( opt.at(0) == '-' ) { + i--; + done = true; + } else { + exclude_variant_input.push_back(opt); + ++i; + } + } + } else if ( std::string(argv[i]) == std::string("--features") || std::string(argv[i]) == std::string("-f") ) { @@ -309,6 +374,22 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) } } + } else if ( std::string(argv[i]) == std::string("--exclude-features") || + std::string(argv[i]) == std::string("-ef") ) { + + bool done = false; + i++; + while ( i < argc && !done ) { + opt = std::string(argv[i]); + if ( opt.at(0) == '-' ) { + i--; + done = true; + } else { + exclude_feature_input.push_back(opt); + ++i; + } + } + } else if ( std::string(argv[i]) == std::string("--outdir") || std::string(argv[i]) == std::string("-od") ) { @@ -353,10 +434,10 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) if (input_state != BadInput) { input_state = DryRun; } - + } else if ( std::string(argv[i]) == std::string("--checkrun") ) { - input_state = CheckRun; + input_state = CheckRun; i++; if ( i < argc ) { @@ -370,10 +451,10 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) } } else { - + input_state = BadInput; - std::string huh(argv[i]); + std::string huh(argv[i]); std::cout << "\nUnknown option: " << huh << std::endl; std::cout.flush(); @@ -392,7 +473,7 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) void RunParams::printHelpMessage(std::ostream& str) const { str << "\nUsage: ./raja-perf.exe [options]\n"; - str << "Valid options are:\n"; + str << "Valid options are:\n"; str << "\t --help, -h (print options 
with descriptions)\n\n"; @@ -411,7 +492,7 @@ void RunParams::printHelpMessage(std::ostream& str) const << "\t (print names of features used by each kernel)\n\n"; str << "\t --npasses [default is 1]\n" - << "\t (num passes through Suite)\n"; + << "\t (num passes through Suite)\n"; str << "\t\t Example...\n" << "\t\t --npasses 2 (runs complete Suite twice\n\n"; @@ -438,24 +519,43 @@ void RunParams::printHelpMessage(std::ostream& str) const << "\t\t -pftol 0.2 (RAJA kernel variants that run 20% or more slower than Base variants will be reported as OVER_TOL in FOM report)\n\n"; str << "\t --kernels, -k [Default is run all]\n" - << "\t (names of individual kernels and/or groups of kernels to run)\n"; + << "\t (names of individual kernels and/or groups of kernels to run)\n"; str << "\t\t Examples...\n" << "\t\t --kernels Polybench (run all kernels in Polybench group)\n" << "\t\t -k INIT3 MULADDSUB (run INIT3 and MULADDSUB kernels)\n" - << "\t\t -k INIT3 Apps (run INIT3 kernsl and all kernels in Apps group)\n\n"; + << "\t\t -k INIT3 Apps (run INIT3 kernel and all kernels in Apps group)\n\n"; + + str << "\t --exclude-kernels, -ek [Default is exclude none]\n" + << "\t (names of individual kernels and/or groups of kernels to exclude)\n"; + str << "\t\t Examples...\n" + << "\t\t --exclude-kernels Polybench (exclude all kernels in Polybench group)\n" + << "\t\t -ek INIT3 MULADDSUB (exclude INIT3 and MULADDSUB kernels)\n" + << "\t\t -ek INIT3 Apps (exclude INIT3 kernel and all kernels in Apps group)\n\n"; str << "\t --variants, -v [Default is run all]\n" - << "\t (names of variants to run)\n"; + << "\t (names of variants to run)\n"; str << "\t\t Examples...\n" << "\t\t --variants RAJA_CUDA (run all RAJA_CUDA kernel variants)\n" << "\t\t -v Base_Seq RAJA_CUDA (run Base_Seq and RAJA_CUDA variants)\n\n"; + str << "\t --exclude-variants, -ev [Default is exclude none]\n" + << "\t (names of variants to exclude)\n"; + str << "\t\t Examples...\n" + << "\t\t --exclude-variants RAJA_CUDA (exclude all RAJA_CUDA kernel variants)\n" + << "\t\t -ev Base_Seq RAJA_CUDA (exclude Base_Seq and RAJA_CUDA variants)\n\n"; + str << "\t --features, -f [Default is run all]\n" << "\t (names of features to run)\n"; str << "\t\t Examples...\n" << "\t\t --features Forall (run all kernels that use RAJA forall)\n" << "\t\t -f Forall Reduction (run all kernels that use RAJA forall or RAJA reductions)\n\n"; + str << "\t --exclude-features, -ef [Default is exclude none]\n" + << "\t (names of features to exclude)\n"; + str << "\t\t Examples...\n" + << "\t\t --exclude-features Forall (exclude all kernels that use RAJA forall)\n" + << "\t\t -ef Forall Reduction (exclude all kernels that use RAJA forall or RAJA reductions)\n\n"; + str << "\t --outdir, -od [Default is current directory]\n" << "\t (directory path for output data files)\n"; str << "\t\t Examples...\n" @@ -476,7 +576,7 @@ void RunParams::printHelpMessage(std::ostream& str) const str << "\t --dryrun (print summary of how Suite will run without running it)\n\n"; str << "\t --checkrun [default is 1]\n" -<< "\t (run each kernel a given number of times; usually to check things are working properly or to reduce aggregate execution time)\n"; +<< "\t (run each kernel a given number of times; usually to check things are working properly or to reduce aggregate execution time)\n"; str << "\t\t Example...\n" << "\t\t --checkrun 2 (run each kernel twice)\n\n"; @@ -572,7 +672,7 @@ void RunParams::printKernelFeatures(std::ostream& str) const str << "\nAvailable kernels and features each 
uses:"; str << "\n-----------------------------------------\n"; for (int kid = 0; kid < NumKernels; ++kid) { - KernelID tkid = static_cast(kid); + KernelID tkid = static_cast(kid); /// RDH DISABLE COUPLE KERNEL if (tkid != Apps_COUPLE) { str << getFullKernelName(tkid) << std::endl; @@ -584,7 +684,7 @@ void RunParams::printKernelFeatures(std::ostream& str) const } } // loop over features delete kern; - } + } } // loop over kernels str.flush(); } diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp index d9437bf9e..c25e58342 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -37,11 +37,11 @@ class RunParams { enum InputOpt { InfoRequest, /*!< option requesting information */ DryRun, /*!< report summary of how suite will run w/o running */ - CheckRun, /*!< run suite with small rep count to make sure + CheckRun, /*!< run suite with small rep count to make sure everything works properly */ - PerfRun, /*!< input defines a valid performance run, + PerfRun, /*!< input defines a valid performance run, suite will run as specified */ - BadInput, /*!< erroneous input given */ + BadInput, /*!< erroneous input given */ Undefined /*!< input not defined (yet) */ }; @@ -71,7 +71,7 @@ class RunParams { //@{ //! @name Methods to get/set input state - InputOpt getInputState() const { return input_state; } + InputOpt getInputState() const { return input_state; } /*! * \brief Set whether run parameters (from input) are valid. @@ -103,20 +103,34 @@ class RunParams { const std::string& getReferenceVariant() const { return reference_variant; } - const std::vector& getKernelInput() const + const std::vector& getKernelInput() const { return kernel_input; } void setInvalidKernelInput( std::vector& svec ) { invalid_kernel_input = svec; } const std::vector& getInvalidKernelInput() const { return invalid_kernel_input; } - const std::vector& getVariantInput() const + const std::vector& getExcludeKernelInput() const + { return exclude_kernel_input; } + void setInvalidExcludeKernelInput( std::vector& svec ) + { invalid_exclude_kernel_input = svec; } + const std::vector& getInvalidExcludeKernelInput() const + { return invalid_exclude_kernel_input; } + + const std::vector& getVariantInput() const { return variant_input; } void setInvalidVariantInput( std::vector& svec ) { invalid_variant_input = svec; } const std::vector& getInvalidVariantInput() const { return invalid_variant_input; } + const std::vector& getExcludeVariantInput() const + { return exclude_variant_input; } + void setInvalidExcludeVariantInput( std::vector& svec ) + { invalid_exclude_variant_input = svec; } + const std::vector& getInvalidExcludeVariantInput() const + { return invalid_exclude_variant_input; } + const std::vector& getFeatureInput() const { return feature_input; } void setInvalidFeatureInput( std::vector& svec ) @@ -124,6 +138,13 @@ class RunParams { const std::vector& getInvalidFeatureInput() const { return invalid_feature_input; } + const std::vector& getExcludeFeatureInput() const + { return exclude_feature_input; } + void setInvalidExcludeFeatureInput( std::vector& svec ) + { invalid_exclude_feature_input = svec; } + const std::vector& getInvalidExcludeFeatureInput() const + { return invalid_exclude_feature_input; } + const std::string& getOutputDirName() const { return outdir; } const std::string& getOutputFilePrefix() const { return outfile_prefix; } @@ -169,18 +190,24 @@ class RunParams { int checkrun_reps; /*!< Num reps each kernel is run in check run */ std::string reference_variant; /*!< Name of 
reference variant for speedup - calculations */ + calculations */ // - // Arrays to hold input strings for valid/invalid input. Helpful for + // Arrays to hold input strings for valid/invalid input. Helpful for // debugging command line args. // std::vector kernel_input; std::vector invalid_kernel_input; + std::vector exclude_kernel_input; + std::vector invalid_exclude_kernel_input; std::vector variant_input; std::vector invalid_variant_input; + std::vector exclude_variant_input; + std::vector invalid_exclude_variant_input; std::vector feature_input; std::vector invalid_feature_input; + std::vector exclude_feature_input; + std::vector invalid_exclude_feature_input; std::string outdir; /*!< Output directory name. */ std::string outfile_prefix; /*!< Prefix for output data file names. */ From d22ac6b90965baf25664b995484419fb5d539737 Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Mon, 27 Sep 2021 15:34:10 -0700 Subject: [PATCH 113/392] modifying how to handle multi-project stuff --- scripts/gitlab/build_and_test.sh | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index ca6b4f985..b20bebca4 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -56,11 +56,6 @@ then prefix_opt="--prefix=${prefix}" fi - if [[ -n ${raja_version} ]] - then - spec="${spec} ^raja@${raja_version}" - fi - python scripts/uberenv/uberenv.py --spec="${spec}" ${prefix_opt} fi @@ -117,6 +112,15 @@ then # Map CPU core allocations declare -A core_counts=(["lassen"]=40 ["ruby"]=28 ["corona"]=32) + # If using Multi-project, set up the submodule + if [[ -n ${raja_version} ]] + then + cd tpl/RAJA + git checkout "task/kab163/set-up-multi-project-ci" + git pull + cd - + fi + # If building, then delete everything first # NOTE: 'cmake --build . -j core_counts' attempts to reduce individual build resources. 
# If core_counts does not contain hostname, then will default to '-j ', which should @@ -124,9 +128,6 @@ then rm -rf ${build_dir} 2>/dev/null mkdir -p ${build_dir} && cd ${build_dir} - #git checkout "task/kab163/set-up-multi-project-ci" - #git pull - date cmake \ -C ${hostconfig_path} \ From 510f6300916580a9a6d293b3eec5d40f7503f76b Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Tue, 28 Sep 2021 13:58:35 -0700 Subject: [PATCH 114/392] diffusion3D kernel --- src/CMakeLists.txt | 3 + src/apps/CMakeLists.txt | 6 + src/apps/DIFFUSION3DPA-Cuda.cpp | 296 ++++++++++++++++++++ src/apps/DIFFUSION3DPA-Hip.cpp | 298 ++++++++++++++++++++ src/apps/DIFFUSION3DPA-OMP.cpp | 267 ++++++++++++++++++ src/apps/DIFFUSION3DPA-OMPTarget.cpp | 39 +++ src/apps/DIFFUSION3DPA-Seq.cpp | 260 ++++++++++++++++++ src/apps/DIFFUSION3DPA.cpp | 97 +++++++ src/apps/DIFFUSION3DPA.hpp | 395 +++++++++++++++++++++++++++ src/common/RAJAPerfSuite.cpp | 6 + src/common/RAJAPerfSuite.hpp | 1 + 11 files changed, 1668 insertions(+) create mode 100644 src/apps/DIFFUSION3DPA-Cuda.cpp create mode 100644 src/apps/DIFFUSION3DPA-Hip.cpp create mode 100644 src/apps/DIFFUSION3DPA-OMP.cpp create mode 100644 src/apps/DIFFUSION3DPA-OMPTarget.cpp create mode 100644 src/apps/DIFFUSION3DPA-Seq.cpp create mode 100644 src/apps/DIFFUSION3DPA.cpp create mode 100644 src/apps/DIFFUSION3DPA.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 877bf5306..aff1d7326 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -36,6 +36,9 @@ blt_add_executable( apps/DEL_DOT_VEC_2D.cpp apps/DEL_DOT_VEC_2D-Seq.cpp apps/DEL_DOT_VEC_2D-OMPTarget.cpp + apps/DIFFUSION3DPA.cpp + apps/DIFFUSION3DPA-Seq.cpp + apps/DIFFUSION3DPA-OMPTarget.cpp apps/ENERGY.cpp apps/ENERGY-Seq.cpp apps/ENERGY-OMPTarget.cpp diff --git a/src/apps/CMakeLists.txt b/src/apps/CMakeLists.txt index 16b822cbb..a82bed339 100644 --- a/src/apps/CMakeLists.txt +++ b/src/apps/CMakeLists.txt @@ -15,6 +15,12 @@ blt_add_library( DEL_DOT_VEC_2D-Cuda.cpp DEL_DOT_VEC_2D-OMP.cpp DEL_DOT_VEC_2D-OMPTarget.cpp + DIFFUSION3DPA.cpp + DIFFUSION3DPA-Cuda.cpp + DIFFUSION3DPA-Hip.cpp + DIFFUSION3DPA-Seq.cpp + DIFFUSION3DPA-OMP.cpp + DIFFUSION3DPA-OMPTarget.cpp ENERGY.cpp ENERGY-Seq.cpp ENERGY-Hip.cpp diff --git a/src/apps/DIFFUSION3DPA-Cuda.cpp b/src/apps/DIFFUSION3DPA-Cuda.cpp new file mode 100644 index 000000000..11b97885d --- /dev/null +++ b/src/apps/DIFFUSION3DPA-Cuda.cpp @@ -0,0 +1,296 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DIFFUSION3DPA.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf { +namespace apps { + +#define DIFFUSION3DPA_DATA_SETUP_CUDA \ + allocAndInitCudaDeviceData(B, m_B, Q1D *D1D); \ + allocAndInitCudaDeviceData(Bt, m_Bt, Q1D *D1D); \ + allocAndInitCudaDeviceData(D, m_D, Q1D *Q1D *Q1D *m_NE); \ + allocAndInitCudaDeviceData(X, m_X, D1D *D1D *D1D *m_NE); \ + allocAndInitCudaDeviceData(Y, m_Y, D1D *D1D *D1D *m_NE); + +#define DIFFUSION3DPA_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_Y, Y, D1D *D1D *D1D *m_NE); \ + deallocCudaDeviceData(B); \ + deallocCudaDeviceData(Bt); \ + deallocCudaDeviceData(D); \ + deallocCudaDeviceData(X); \ + deallocCudaDeviceData(Y); + +//#define USE_RAJA_UNROLL +#define RAJA_DIRECT_PRAGMA(X) _Pragma(#X) +#if defined(USE_RAJA_UNROLL) +#define RAJA_UNROLL(N) RAJA_DIRECT_PRAGMA(unroll(N)) +#else +#define RAJA_UNROLL(N) +#endif +#define FOREACH_THREAD(i, k, N) \ + for (int i = threadIdx.k; i < N; i += blockDim.k) + +__global__ void Diffusion3DPA(Index_type NE, const Real_ptr B, const Real_ptr Bt, + const Real_ptr D, const Real_ptr X, Real_ptr Y) { + + const int e = blockIdx.x; + + DIFFUSION3DPA_0_GPU + + FOREACH_THREAD(dy, y, D1D) { + FOREACH_THREAD(dx, x, D1D){ + DIFFUSION3DPA_1 + } + FOREACH_THREAD(dx, x, Q1D) { + DIFFUSION3DPA_2 + } + } + __syncthreads(); + FOREACH_THREAD(dy, y, D1D) { + FOREACH_THREAD(qx, x, Q1D) { + DIFFUSION3DPA_3 + } + } + __syncthreads(); + FOREACH_THREAD(qy, y, Q1D) { + FOREACH_THREAD(qx, x, Q1D) { + DIFFUSION3DPA_4 + } + } + __syncthreads(); + FOREACH_THREAD(qy, y, Q1D) { + FOREACH_THREAD(qx, x, Q1D) { + DIFFUSION3DPA_5 + } + } + + __syncthreads(); + FOREACH_THREAD(d, y, D1D) { + FOREACH_THREAD(q, x, Q1D) { + DIFFUSION3DPA_6 + } + } + + __syncthreads(); + FOREACH_THREAD(qy, y, Q1D) { + FOREACH_THREAD(dx, x, D1D) { + DIFFUSION3DPA_7 + } + } + __syncthreads(); + + FOREACH_THREAD(dy, y, D1D) { + FOREACH_THREAD(dx, x, D1D) { + DIFFUSION3DPA_8 + } + } + + __syncthreads(); + FOREACH_THREAD(dy, y, D1D) { + FOREACH_THREAD(dx, x, D1D) { + DIFFUSION3DPA_9 + } + } +} + +void DIFFUSION3DPA::runCudaVariant(VariantID vid) { + const Index_type run_reps = getRunReps(); + + DIFFUSION3DPA_DATA_SETUP; + + switch (vid) { + + case Base_CUDA: { + + DIFFUSION3DPA_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + dim3 nthreads_per_block(Q1D, Q1D, 1); + + Diffusion3DPA<<>>(NE, B, Bt, D, X, Y); + + cudaErrchk( cudaGetLastError() ); + } + stopTimer(); + + DIFFUSION3DPA_DATA_TEARDOWN_CUDA; + + break; + } + + case RAJA_CUDA: { + + DIFFUSION3DPA_DATA_SETUP_CUDA; + + using launch_policy = RAJA::expt::LaunchPolicy + >; + + using outer_x = RAJA::expt::LoopPolicy; + + using inner_x = RAJA::expt::LoopPolicy; + + using inner_y = RAJA::expt::LoopPolicy; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::expt::launch( + RAJA::expt::DEVICE, + RAJA::expt::Grid(RAJA::expt::Teams(NE), + RAJA::expt::Threads(Q1D, Q1D, 1)), + [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NE), + [&](int e) { + + DIFFUSION3DPA_0_GPU + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dx) { + DIFFUSION3DPA_1 + } + ); // RAJA::expt::loop + + RAJA::expt::loop(ctx, 
RAJA::RangeSegment(0, Q1D), + [&](int dx) { + DIFFUSION3DPA_2 + } + ); // RAJA::expt::loop + } // lambda (dy) + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int qx) { + DIFFUSION3DPA_3 + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int qx) { + DIFFUSION3DPA_4 + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int qx) { + DIFFUSION3DPA_5 + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int d) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int q) { + DIFFUSION3DPA_6 + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dx) { + DIFFUSION3DPA_7 + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dx) { + DIFFUSION3DPA_8 + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dx) { + DIFFUSION3DPA_9 + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + } // lambda (e) + ); // RAJA::expt::loop + + } // outer lambda (ctx) + ); // RAJA::expt::launch + + } // loop over kernel reps + stopTimer(); + + DIFFUSION3DPA_DATA_TEARDOWN_CUDA; + + break; + } + + default: { + + std::cout << "\n DIFFUSION3DPA : Unknown Cuda variant id = " << vid << std::endl; + break; + } + } +} + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/apps/DIFFUSION3DPA-Hip.cpp b/src/apps/DIFFUSION3DPA-Hip.cpp new file mode 100644 index 000000000..50c0daca8 --- /dev/null +++ b/src/apps/DIFFUSION3DPA-Hip.cpp @@ -0,0 +1,298 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DIFFUSION3DPA.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf { +namespace apps { + +#define DIFFUSION3DPA_DATA_SETUP_HIP \ + allocAndInitHipDeviceData(B, m_B, Q1D *D1D); \ + allocAndInitHipDeviceData(Bt, m_Bt, Q1D *D1D); \ + allocAndInitHipDeviceData(D, m_D, Q1D *Q1D *Q1D *m_NE); \ + allocAndInitHipDeviceData(X, m_X, D1D *D1D *D1D *m_NE); \ + allocAndInitHipDeviceData(Y, m_Y, D1D *D1D *D1D *m_NE); + +#define DIFFUSION3DPA_DATA_TEARDOWN_HIP \ + getHipDeviceData(m_Y, Y, D1D *D1D *D1D *m_NE); \ + deallocHipDeviceData(B); \ + deallocHipDeviceData(Bt); \ + deallocHipDeviceData(D); \ + deallocHipDeviceData(X); \ + deallocHipDeviceData(Y); + +//#define USE_RAJA_UNROLL +#define RAJA_DIRECT_PRAGMA(X) _Pragma(#X) +#if defined(USE_RAJA_UNROLL) +#define RAJA_UNROLL(N) RAJA_DIRECT_PRAGMA(unroll(N)) +#else +#define RAJA_UNROLL(N) +#endif +#define FOREACH_THREAD(i, k, N) \ + for (int i = threadIdx.k; i < N; i += blockDim.k) + +__global__ void Diffusion3DPA(Index_type NE, const Real_ptr B, const Real_ptr Bt, + const Real_ptr D, const Real_ptr X, Real_ptr Y) { + + const int e = hipBlockIdx_x; + + DIFFUSION3DPA_0_GPU + + FOREACH_THREAD(dy, y, D1D) { + FOREACH_THREAD(dx, x, D1D){ + DIFFUSION3DPA_1 + } + FOREACH_THREAD(dx, x, Q1D) { + DIFFUSION3DPA_2 + } + } + __syncthreads(); + FOREACH_THREAD(dy, y, D1D) { + FOREACH_THREAD(qx, x, Q1D) { + DIFFUSION3DPA_3 + } + } + __syncthreads(); + FOREACH_THREAD(qy, y, Q1D) { + FOREACH_THREAD(qx, x, Q1D) { + DIFFUSION3DPA_4 + } + } + __syncthreads(); + FOREACH_THREAD(qy, y, Q1D) { + FOREACH_THREAD(qx, x, Q1D) { + DIFFUSION3DPA_5 + } + } + + __syncthreads(); + FOREACH_THREAD(d, y, D1D) { + FOREACH_THREAD(q, x, Q1D) { + DIFFUSION3DPA_6 + } + } + + __syncthreads(); + FOREACH_THREAD(qy, y, Q1D) { + FOREACH_THREAD(dx, x, D1D) { + DIFFUSION3DPA_7 + } + } + __syncthreads(); + + FOREACH_THREAD(dy, y, D1D) { + FOREACH_THREAD(dx, x, D1D) { + DIFFUSION3DPA_8 + } + } + + __syncthreads(); + FOREACH_THREAD(dy, y, D1D) { + FOREACH_THREAD(dx, x, D1D) { + DIFFUSION3DPA_9 + } + } +} + +void DIFFUSION3DPA::runHipVariant(VariantID vid) { + const Index_type run_reps = getRunReps(); + + DIFFUSION3DPA_DATA_SETUP; + + switch (vid) { + + case Base_HIP: { + + DIFFUSION3DPA_DATA_SETUP_HIP; + + dim3 grid_size(NE); + dim3 block_size(Q1D, Q1D, 1); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + hipLaunchKernelGGL((Diffusion3DPA), dim3(grid_size), dim3(block_size), 0, 0, + NE, B, Bt, D, X, Y); + + hipErrchk( hipGetLastError() ); + + } + stopTimer(); + + DIFFUSION3DPA_DATA_TEARDOWN_HIP; + + break; + } + + case RAJA_HIP: { + + DIFFUSION3DPA_DATA_SETUP_HIP; + + using launch_policy = RAJA::expt::LaunchPolicy + >; + + using outer_x = RAJA::expt::LoopPolicy; + + using inner_x = RAJA::expt::LoopPolicy; + + using inner_y = RAJA::expt::LoopPolicy; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::expt::launch( + RAJA::expt::DEVICE, + RAJA::expt::Grid(RAJA::expt::Teams(NE), + RAJA::expt::Threads(Q1D, Q1D, 1)), + [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NE), + [&](int e) { + + DIFFUSION3DPA_0_GPU + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dx) { + DIFFUSION3DPA_1 + } + ); // 
RAJA::expt::loop + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int dx) { + DIFFUSION3DPA_2 + } + ); // RAJA::expt::loop + } // lambda (dy) + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int qx) { + DIFFUSION3DPA_3 + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int qx) { + DIFFUSION3DPA_4 + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int qx) { + DIFFUSION3DPA_5 + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int d) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int q) { + DIFFUSION3DPA_6 + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dx) { + DIFFUSION3DPA_7 + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dx) { + DIFFUSION3DPA_8 + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dx) { + DIFFUSION3DPA_9 + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + } // lambda (e) + ); // RAJA::expt::loop + + } // outer lambda (ctx) + ); // RAJA::expt::launch + + } // loop over kernel reps + stopTimer(); + + DIFFUSION3DPA_DATA_TEARDOWN_HIP; + + break; + } + + default: { + + std::cout << "\n DIFFUSION3DPA : Unknown Hip variant id = " << vid << std::endl; + break; + } + } +} + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/apps/DIFFUSION3DPA-OMP.cpp b/src/apps/DIFFUSION3DPA-OMP.cpp new file mode 100644 index 000000000..e4b1b7a14 --- /dev/null +++ b/src/apps/DIFFUSION3DPA-OMP.cpp @@ -0,0 +1,267 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DIFFUSION3DPA.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf { +namespace apps { + +//#define USE_RAJA_UNROLL +#define RAJA_DIRECT_PRAGMA(X) _Pragma(#X) +#if defined(USE_RAJA_UNROLL) +#define RAJA_UNROLL(N) RAJA_DIRECT_PRAGMA(unroll(N)) +#else +#define RAJA_UNROLL(N) +#endif +#define FOREACH_THREAD(i, k, N) for (int i = 0; i < N; i++) + +void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { + +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + DIFFUSION3DPA_DATA_SETUP; + + switch (vid) { + + case Base_OpenMP: { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + +#pragma omp parallel for + for (int e = 0; e < NE; ++e) { + + DIFFUSION3DPA_0_CPU + + FOREACH_THREAD(dy, y, D1D) { + FOREACH_THREAD(dx, x, D1D){ + DIFFUSION3DPA_1 + } + FOREACH_THREAD(dx, x, Q1D) { + DIFFUSION3DPA_2 + } + } + + FOREACH_THREAD(dy, y, D1D) { + FOREACH_THREAD(qx, x, Q1D) { + DIFFUSION3DPA_3 + } + } + + FOREACH_THREAD(qy, y, Q1D) { + FOREACH_THREAD(qx, x, Q1D) { + DIFFUSION3DPA_4 + } + } + + FOREACH_THREAD(qy, y, Q1D) { + FOREACH_THREAD(qx, x, Q1D) { + DIFFUSION3DPA_5 + } + } + + FOREACH_THREAD(d, y, D1D) { + FOREACH_THREAD(q, x, Q1D) { + DIFFUSION3DPA_6 + } + } + + FOREACH_THREAD(qy, y, Q1D) { + FOREACH_THREAD(dx, x, D1D) { + DIFFUSION3DPA_7 + } + } + + FOREACH_THREAD(dy, y, D1D) { + FOREACH_THREAD(dx, x, D1D) { + DIFFUSION3DPA_8 + } + } + + FOREACH_THREAD(dy, y, D1D) { + FOREACH_THREAD(dx, x, D1D) { + DIFFUSION3DPA_9 + } + } + + } // element loop + } + stopTimer(); + + break; + } + + case RAJA_OpenMP: { + + //Currently Teams requires two policies if compiled with a device + using launch_policy = RAJA::expt::LaunchPolicy; + + using outer_x = RAJA::expt::LoopPolicy; + + using inner_x = RAJA::expt::LoopPolicy; + + using inner_y = RAJA::expt::LoopPolicy; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + //Grid is empty as the host does not need a compute grid to be specified + RAJA::expt::launch( + RAJA::expt::HOST, RAJA::expt::Grid(), + [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NE), + [&](int e) { + + DIFFUSION3DPA_0_CPU + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dx) { + DIFFUSION3DPA_1 + } + ); // RAJA::expt::loop + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int dx) { + DIFFUSION3DPA_2 + } + ); // RAJA::expt::loop + } // lambda (dy) + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int qx) { + DIFFUSION3DPA_3 + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int qx) { + DIFFUSION3DPA_4 + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int qx) { + DIFFUSION3DPA_5 + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int d) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + 
[&](int q) { + DIFFUSION3DPA_6 + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dx) { + DIFFUSION3DPA_7 + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dx) { + DIFFUSION3DPA_8 + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dx) { + DIFFUSION3DPA_9 + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + } // lambda (e) + ); // RAJA::expt::loop + + } // outer lambda (ctx) + ); // // RAJA::expt::launch + + } // loop over kernel reps + stopTimer(); + + return; + } + + default: + std::cout << "\n DIFFUSION3DPA : Unknown OpenMP variant id = " << vid + << std::endl; + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps/DIFFUSION3DPA-OMPTarget.cpp b/src/apps/DIFFUSION3DPA-OMPTarget.cpp new file mode 100644 index 000000000..26ae3bc84 --- /dev/null +++ b/src/apps/DIFFUSION3DPA-OMPTarget.cpp @@ -0,0 +1,39 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DIFFUSION3DPA.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf { +namespace apps { + + +void DIFFUSION3DPA::runOpenMPTargetVariant(VariantID vid) { + const Index_type run_reps = getRunReps(); + + switch (vid) { + + default: { + + std::cout << "\n DIFFUSION3DPA : Unknown OpenMPTarget variant id = " << vid << std::endl; + break; + } + } +} + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/apps/DIFFUSION3DPA-Seq.cpp b/src/apps/DIFFUSION3DPA-Seq.cpp new file mode 100644 index 000000000..b72713b4b --- /dev/null +++ b/src/apps/DIFFUSION3DPA-Seq.cpp @@ -0,0 +1,260 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
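// Illustrative sketch, assumed rather than taken from this patch series: the
// "Teams requires two policies" comments in the sequential and OpenMP
// variants refer to LaunchPolicy/LoopPolicy aliases whose template arguments
// are not legible in this text.  The shape below is written for the
// sequential variant: the first parameter is the host back-end, the second
// (compiled only when a GPU back-end is enabled) reuses the m3d_* device
// aliases declared in DIFFUSION3DPA.hpp, so one loop body serves both host
// and device.  The guard macro and the host policy choices are assumptions;
// the OpenMP variant would substitute OpenMP host policies in the same slots.
#include "DIFFUSION3DPA.hpp"   // declares m3d_device_launch and the m3d_gpu_* policies

using launch_policy =
  RAJA::expt::LaunchPolicy<RAJA::expt::seq_launch_t
#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP)
                           , m3d_device_launch
#endif
                           >;

using outer_x =
  RAJA::expt::LoopPolicy<RAJA::loop_exec
#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP)
                         , m3d_gpu_block_x_policy
#endif
                         >;

using inner_x =
  RAJA::expt::LoopPolicy<RAJA::loop_exec
#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP)
                         , m3d_gpu_thread_x_policy
#endif
                         >;

using inner_y =
  RAJA::expt::LoopPolicy<RAJA::loop_exec
#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP)
                         , m3d_gpu_thread_y_policy
#endif
                         >;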
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DIFFUSION3DPA.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf { +namespace apps { + +//#define USE_RAJA_UNROLL +#define RAJA_DIRECT_PRAGMA(X) _Pragma(#X) +#if defined(USE_RAJA_UNROLL) +#define RAJA_UNROLL(N) RAJA_DIRECT_PRAGMA(unroll(N)) +#else +#define RAJA_UNROLL(N) +#endif +#define FOREACH_THREAD(i, k, N) for (int i = 0; i < N; i++) + +void DIFFUSION3DPA::runSeqVariant(VariantID vid) { + const Index_type run_reps = getRunReps(); + + DIFFUSION3DPA_DATA_SETUP; + + switch (vid) { + + case Base_Seq: { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (int e = 0; e < NE; ++e) { + + DIFFUSION3DPA_0_CPU + + FOREACH_THREAD(dy, y, D1D) { + FOREACH_THREAD(dx, x, D1D){ + DIFFUSION3DPA_1 + } + FOREACH_THREAD(dx, x, Q1D) { + DIFFUSION3DPA_2 + } + } + + FOREACH_THREAD(dy, y, D1D) { + FOREACH_THREAD(qx, x, Q1D) { + DIFFUSION3DPA_3 + } + } + + FOREACH_THREAD(qy, y, Q1D) { + FOREACH_THREAD(qx, x, Q1D) { + DIFFUSION3DPA_4 + } + } + + FOREACH_THREAD(qy, y, Q1D) { + FOREACH_THREAD(qx, x, Q1D) { + DIFFUSION3DPA_5 + } + } + + FOREACH_THREAD(d, y, D1D) { + FOREACH_THREAD(q, x, Q1D) { + DIFFUSION3DPA_6 + } + } + + FOREACH_THREAD(qy, y, Q1D) { + FOREACH_THREAD(dx, x, D1D) { + DIFFUSION3DPA_7 + } + } + + FOREACH_THREAD(dy, y, D1D) { + FOREACH_THREAD(dx, x, D1D) { + DIFFUSION3DPA_8 + } + } + + FOREACH_THREAD(dy, y, D1D) { + FOREACH_THREAD(dx, x, D1D) { + DIFFUSION3DPA_9 + } + } + + } // element loop + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case RAJA_Seq: { + + //Currently Teams requires two policies if compiled with a device + using launch_policy = RAJA::expt::LaunchPolicy; + + using outer_x = RAJA::expt::LoopPolicy; + + using inner_x = RAJA::expt::LoopPolicy; + + using inner_y = RAJA::expt::LoopPolicy; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::expt::launch( + RAJA::expt::HOST, RAJA::expt::Grid(), + [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NE), + [&](int e) { + + DIFFUSION3DPA_0_CPU + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dx) { + DIFFUSION3DPA_1 + } + ); // RAJA::expt::loop + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int dx) { + DIFFUSION3DPA_2 + } + ); // RAJA::expt::loop + } // lambda (dy) + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int qx) { + DIFFUSION3DPA_3 + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int qx) { + DIFFUSION3DPA_4 + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int qx) { + DIFFUSION3DPA_5 + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int d) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int q) { + DIFFUSION3DPA_6 + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, 
RAJA::RangeSegment(0, Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dx) { + DIFFUSION3DPA_7 + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dx) { + DIFFUSION3DPA_8 + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dx) { + DIFFUSION3DPA_9 + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + } // lambda (e) + ); // RAJA::expt::loop + + } // outer lambda (ctx) + ); // RAJA::expt::launch + + } // loop over kernel reps + stopTimer(); + + return; + } +#endif // RUN_RAJA_SEQ + + default: + std::cout << "\n DIFFUSION3DPA : Unknown Seq variant id = " << vid << std::endl; + } +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps/DIFFUSION3DPA.cpp b/src/apps/DIFFUSION3DPA.cpp new file mode 100644 index 000000000..16889aa6b --- /dev/null +++ b/src/apps/DIFFUSION3DPA.cpp @@ -0,0 +1,97 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DIFFUSION3DPA.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + +DIFFUSION3DPA::DIFFUSION3DPA(const RunParams& params) + : KernelBase(rajaperf::Apps_DIFFUSION3DPA, params) +{ + m_NE_default = 8000; + + setDefaultProblemSize(m_NE_default*Q1D*Q1D*Q1D); + setDefaultReps(50); + + m_NE = std::max(getTargetProblemSize()/(Q1D*Q1D*Q1D), Index_type(1)); + + setActualProblemSize( m_NE*Q1D*Q1D*Q1D ); + + setItsPerRep(getActualProblemSize()); + setKernelsPerRep(1); + + setBytesPerRep( Q1D*D1D*sizeof(Real_type) + + Q1D*D1D*sizeof(Real_type) + + Q1D*Q1D*Q1D*m_NE*sizeof(Real_type) + + D1D*D1D*D1D*m_NE*sizeof(Real_type) + + D1D*D1D*D1D*m_NE*sizeof(Real_type) ); + + setFLOPsPerRep(m_NE * (2 * D1D * D1D * D1D * Q1D + + 2 * D1D * D1D * Q1D * Q1D + + 2 * D1D * Q1D * Q1D * Q1D + Q1D * Q1D * Q1D + + 2 * Q1D * Q1D * Q1D * D1D + + 2 * Q1D * Q1D * D1D * D1D + + 2 * Q1D * D1D * D1D * D1D + D1D * D1D * D1D)); + setUsesFeature(Teams); + + setVariantDefined( Base_Seq ); + setVariantDefined( RAJA_Seq ); + + setVariantDefined( Base_OpenMP ); + setVariantDefined( RAJA_OpenMP ); + + setVariantDefined( Base_CUDA ); + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( Base_HIP ); + setVariantDefined( RAJA_HIP ); + +} + +DIFFUSION3DPA::~DIFFUSION3DPA() +{ +} + +void DIFFUSION3DPA::setUp(VariantID vid) +{ + + allocAndInitDataConst(m_B, int(Q1D*D1D), Real_type(1.0), vid); + allocAndInitDataConst(m_Bt,int(Q1D*D1D), Real_type(1.0), vid); + allocAndInitDataConst(m_D, int(Q1D*Q1D*Q1D*m_NE), Real_type(1.0), vid); + allocAndInitDataConst(m_X, int(D1D*D1D*D1D*m_NE), Real_type(1.0), vid); + allocAndInitDataConst(m_Y, int(D1D*D1D*D1D*m_NE), Real_type(0.0), vid); +} + +void DIFFUSION3DPA::updateChecksum(VariantID vid) +{ + checksum[vid] += calcChecksum(m_Y, D1D*D1D*D1D*m_NE); +} + +void DIFFUSION3DPA::tearDown(VariantID vid) +{ + (void) vid; + + deallocData(m_B); + deallocData(m_Bt); + deallocData(m_D); + 
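// Illustrative arithmetic, assumed from the constructor shown above rather
// than stated in the patch: the benchmark is sized in quadrature points.
// With the initial Q1D = 5,
//
//   default problem size = m_NE_default * Q1D*Q1D*Q1D = 8000 * 125 = 1,000,000
//
// and the element count is recovered from whatever size the user requests,
//
//   m_NE = std::max( getTargetProblemSize() / (Q1D*Q1D*Q1D), Index_type(1) );
//
// so even a requested size smaller than one element still runs one element,
// and updateChecksum() reduces over the D1D*D1D*D1D*m_NE entries of m_Y that
// the kernel accumulates into.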
deallocData(m_X); + deallocData(m_Y); +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps/DIFFUSION3DPA.hpp b/src/apps/DIFFUSION3DPA.hpp new file mode 100644 index 000000000..0d21eee4d --- /dev/null +++ b/src/apps/DIFFUSION3DPA.hpp @@ -0,0 +1,395 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Action of 3D Mass matrix via partial assembly +/// +/// Based on MFEM's/CEED algorithms. +/// Reference implementation +/// https://github.com/mfem/mfem/blob/master/fem/bilininteg_mass_pa.cpp#L925 +/// +/// for (int e = 0; e < NE; ++e) { +/// +/// constexpr int MQ1 = Q1D; +/// constexpr int MD1 = D1D; +/// constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1; +/// double sDQ[MQ1 * MD1]; +/// double(*Bsmem)[MD1] = (double(*)[MD1])sDQ; +/// double(*Btsmem)[MQ1] = (double(*)[MQ1])sDQ; +/// double sm0[MDQ * MDQ * MDQ]; +/// double sm1[MDQ * MDQ * MDQ]; +/// double(*Xsmem)[MD1][MD1] = (double(*)[MD1][MD1])sm0; +/// double(*DDQ)[MD1][MQ1] = (double(*)[MD1][MQ1])sm1; +/// double(*DQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm0; +/// double(*QQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm1; +/// double(*QQD)[MQ1][MD1] = (double(*)[MQ1][MD1])sm0; +/// double(*QDD)[MD1][MD1] = (double(*)[MD1][MD1])sm1; +/// +/// for(int dy=0; dy MD1) ? MQ1 : MD1; \ + double sDQ[MQ1 * MD1]; \ + double(*Bsmem)[MD1] = (double(*)[MD1])sDQ; \ + double(*Btsmem)[MQ1] = (double(*)[MQ1])sDQ; \ + double sm0[MDQ * MDQ * MDQ]; \ + double sm1[MDQ * MDQ * MDQ]; \ + double(*Xsmem)[MD1][MD1] = (double(*)[MD1][MD1])sm0; \ + double(*DDQ)[MD1][MQ1] = (double(*)[MD1][MQ1])sm1; \ + double(*DQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm0; \ + double(*QQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm1; \ + double(*QQD)[MQ1][MD1] = (double(*)[MQ1][MD1])sm0; \ + double(*QDD)[MD1][MD1] = (double(*)[MD1][MD1])sm1; + +#define DIFFUSION3DPA_0_GPU \ + constexpr int MQ1 = Q1D; \ + constexpr int MD1 = D1D; \ + constexpr int MDQ = (MQ1 > MD1) ? 
MQ1 : MD1; \ + RAJA_TEAM_SHARED double sDQ[MQ1 * MD1]; \ + double(*Bsmem)[MD1] = (double(*)[MD1])sDQ; \ + double(*Btsmem)[MQ1] = (double(*)[MQ1])sDQ; \ + RAJA_TEAM_SHARED double sm0[MDQ * MDQ * MDQ]; \ + RAJA_TEAM_SHARED double sm1[MDQ * MDQ * MDQ]; \ + double(*Xsmem)[MD1][MD1] = (double(*)[MD1][MD1])sm0; \ + double(*DDQ)[MD1][MQ1] = (double(*)[MD1][MQ1])sm1; \ + double(*DQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm0; \ + double(*QQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm1; \ + double(*QQD)[MQ1][MD1] = (double(*)[MQ1][MD1])sm0; \ + double(*QDD)[MD1][MD1] = (double(*)[MD1][MD1])sm1; + +#define DIFFUSION3DPA_1 \ + RAJA_UNROLL(MD1) \ +for (int dz = 0; dz< D1D; ++dz) { \ +Xsmem[dz][dy][dx] = X_(dx, dy, dz, e); \ +} + +#define DIFFUSION3DPA_2 \ + Bsmem[dx][dy] = B_(dx, dy); + +// 2 * D1D * D1D * D1D * Q1D +#define DIFFUSION3DPA_3 \ + double u[D1D]; \ +RAJA_UNROLL(MD1) \ +for (int dz = 0; dz < D1D; dz++) { \ +u[dz] = 0; \ +} \ +RAJA_UNROLL(MD1) \ +for (int dx = 0; dx < D1D; ++dx) { \ +RAJA_UNROLL(MD1) \ +for (int dz = 0; dz < D1D; ++dz) { \ +u[dz] += Xsmem[dz][dy][dx] * Bsmem[qx][dx]; \ +} \ +} \ +RAJA_UNROLL(MD1) \ +for (int dz = 0; dz < D1D; ++dz) { \ +DDQ[dz][dy][qx] = u[dz]; \ +} + +//2 * D1D * D1D * Q1D * Q1D +#define DIFFUSION3DPA_4 \ + double u[D1D]; \ + RAJA_UNROLL(MD1) \ + for (int dz = 0; dz < D1D; dz++) { \ + u[dz] = 0; \ + } \ + RAJA_UNROLL(MD1) \ + for (int dy = 0; dy < D1D; ++dy) { \ + RAJA_UNROLL(MD1) \ + for (int dz = 0; dz < D1D; dz++) { \ + u[dz] += DDQ[dz][dy][qx] * Bsmem[qy][dy]; \ + } \ + } \ + RAJA_UNROLL(MD1) \ + for (int dz = 0; dz < D1D; dz++) { \ + DQQ[dz][qy][qx] = u[dz]; \ + } + +//2 * D1D * Q1D * Q1D * Q1D + Q1D * Q1D * Q1D +#define DIFFUSION3DPA_5 \ + double u[Q1D]; \ + RAJA_UNROLL(MQ1) \ + for (int qz = 0; qz < Q1D; qz++) { \ + u[qz] = 0; \ + } \ + RAJA_UNROLL(MD1) \ + for (int dz = 0; dz < D1D; ++dz) { \ + RAJA_UNROLL(MQ1) \ + for (int qz = 0; qz < Q1D; qz++) { \ + u[qz] += DQQ[dz][qy][qx] * Bsmem[qz][dz]; \ + } \ + } \ + RAJA_UNROLL(MQ1) \ + for (int qz = 0; qz < Q1D; qz++) { \ + QQQ[qz][qy][qx] = u[qz] * D_(qx, qy, qz, e); \ + } + +#define DIFFUSION3DPA_6 \ + Btsmem[d][q] = Bt_(q, d); + +//2 * Q1D * Q1D * Q1D * D1D +#define DIFFUSION3DPA_7 \ + double u[Q1D]; \ +RAJA_UNROLL(MQ1) \ +for (int qz = 0; qz < Q1D; ++qz) { \ + u[qz] = 0; \ + } \ +RAJA_UNROLL(MQ1) \ +for (int qx = 0; qx < Q1D; ++qx) { \ + RAJA_UNROLL(MQ1) \ + for (int qz = 0; qz < Q1D; ++qz) { \ + u[qz] += QQQ[qz][qy][qx] * Btsmem[dx][qx]; \ + } \ + } \ +RAJA_UNROLL(MQ1) \ +for (int qz = 0; qz < Q1D; ++qz) { \ + QQD[qz][qy][dx] = u[qz]; \ + } + +// 2 * Q1D * Q1D * D1D * D1D +#define DIFFUSION3DPA_8 \ + double u[Q1D]; \ + RAJA_UNROLL(MQ1) \ + for (int qz = 0; qz < Q1D; ++qz) { \ + u[qz] = 0; \ + } \ + RAJA_UNROLL(MQ1) \ + for (int qy = 0; qy < Q1D; ++qy) { \ + RAJA_UNROLL(MQ1) \ + for (int qz = 0; qz < Q1D; ++qz) { \ + u[qz] += QQD[qz][qy][dx] * Btsmem[dy][qy]; \ + } \ + } \ + RAJA_UNROLL(MQ1) \ + for (int qz = 0; qz < Q1D; ++qz) { \ + QDD[qz][dy][dx] = u[qz]; \ + } + +//2 * Q1D * D1D * D1D * D1D + D1D * D1D * D1D +#define DIFFUSION3DPA_9 \ + double u[D1D]; \ + RAJA_UNROLL(MD1) \ + for (int dz = 0; dz < D1D; ++dz) { \ + u[dz] = 0; \ + } \ + RAJA_UNROLL(MQ1) \ + for (int qz = 0; qz < Q1D; ++qz) { \ + RAJA_UNROLL(MD1) \ + for (int dz = 0; dz < D1D; ++dz) { \ + u[dz] += QDD[qz][dy][dx] * Btsmem[dz][qz]; \ + } \ + } \ + RAJA_UNROLL(MD1) \ + for (int dz = 0; dz < D1D; ++dz) { \ + Y_(dx, dy, dz, e) += u[dz]; \ + } + + +#if defined(RAJA_ENABLE_CUDA) + using m3d_device_launch = RAJA::expt::cuda_launch_t; + using 
m3d_gpu_block_x_policy = RAJA::cuda_block_x_direct; + using m3d_gpu_thread_x_policy = RAJA::cuda_thread_x_loop; + using m3d_gpu_thread_y_policy = RAJA::cuda_thread_y_loop; +#endif + +#if defined(RAJA_ENABLE_HIP) + using m3d_device_launch = RAJA::expt::hip_launch_t; + using m3d_gpu_block_x_policy = RAJA::hip_block_x_direct; + using m3d_gpu_thread_x_policy = RAJA::hip_thread_x_loop; + using m3d_gpu_thread_y_policy = RAJA::hip_thread_y_loop; +#endif + +namespace rajaperf +{ +class RunParams; + +namespace apps +{ + +class DIFFUSION3DPA : public KernelBase +{ +public: + + DIFFUSION3DPA(const RunParams& params); + + ~DIFFUSION3DPA(); + + void setUp(VariantID vid); + void updateChecksum(VariantID vid); + void tearDown(VariantID vid); + + void runSeqVariant(VariantID vid); + void runOpenMPVariant(VariantID vid); + void runCudaVariant(VariantID vid); + void runHipVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); + +private: + + Real_ptr m_B; + Real_ptr m_Bt; + Real_ptr m_D; + Real_ptr m_X; + Real_ptr m_Y; + + Index_type m_NE; + Index_type m_NE_default; +}; + +} // end namespace apps +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 41fcbd5e9..748fb1325 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -72,6 +72,7 @@ // #include "apps/WIP-COUPLE.hpp" #include "apps/DEL_DOT_VEC_2D.hpp" +#include "apps/DIFFUSION3DPA.hpp" #include "apps/ENERGY.hpp" #include "apps/FIR.hpp" #include "apps/HALOEXCHANGE.hpp" @@ -197,6 +198,7 @@ static const std::string KernelNames [] = // std::string("Apps_COUPLE"), std::string("Apps_DEL_DOT_VEC_2D"), + std::string("Apps_DIFFUSION3DPA"), std::string("Apps_ENERGY"), std::string("Apps_FIR"), std::string("Apps_HALOEXCHANGE"), @@ -616,6 +618,10 @@ KernelBase* getKernelObject(KernelID kid, kernel = new apps::DEL_DOT_VEC_2D(run_params); break; } + case Apps_DIFFUSION3DPA : { + kernel = new apps::DIFFUSION3DPA(run_params); + break; + } case Apps_ENERGY : { kernel = new apps::ENERGY(run_params); break; diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 367aeed72..ca4f10f1d 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -127,6 +127,7 @@ enum KernelID { // Apps_COUPLE, Apps_DEL_DOT_VEC_2D, + Apps_DIFFUSION3DPA, Apps_ENERGY, Apps_FIR, Apps_HALOEXCHANGE, From 233e9c03142103c8f3d94bcaedb6442a62bc73ee Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Tue, 28 Sep 2021 14:09:38 -0700 Subject: [PATCH 115/392] need to pull first --- scripts/gitlab/build_and_test.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index b20bebca4..7d0095883 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -116,6 +116,7 @@ then if [[ -n ${raja_version} ]] then cd tpl/RAJA + git pull origin develop git checkout "task/kab163/set-up-multi-project-ci" git pull cd - From dcf56d9d4cb2926ac58db8ce526606fff7b4d7e0 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Tue, 28 Sep 2021 15:22:15 -0700 Subject: [PATCH 116/392] add reference impl -commented version --- src/apps/DIFFUSION3DPA.hpp | 475 ++++++++++++++++++------------------- 1 file changed, 228 insertions(+), 247 deletions(-) diff --git a/src/apps/DIFFUSION3DPA.hpp b/src/apps/DIFFUSION3DPA.hpp index 0d21eee4d..8442bb1a5 100644 --- a/src/apps/DIFFUSION3DPA.hpp +++ b/src/apps/DIFFUSION3DPA.hpp @@ -11,143 +11,271 @@ /// /// Based on 
MFEM's/CEED algorithms. /// Reference implementation -/// https://github.com/mfem/mfem/blob/master/fem/bilininteg_mass_pa.cpp#L925 +/// https://github.com/mfem/mfem/blob/master/fem/bilininteg_diffusion_pa.cpp /// /// for (int e = 0; e < NE; ++e) { /// /// constexpr int MQ1 = Q1D; /// constexpr int MD1 = D1D; -/// constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1; -/// double sDQ[MQ1 * MD1]; -/// double(*Bsmem)[MD1] = (double(*)[MD1])sDQ; -/// double(*Btsmem)[MQ1] = (double(*)[MQ1])sDQ; -/// double sm0[MDQ * MDQ * MDQ]; -/// double sm1[MDQ * MDQ * MDQ]; -/// double(*Xsmem)[MD1][MD1] = (double(*)[MD1][MD1])sm0; -/// double(*DDQ)[MD1][MQ1] = (double(*)[MD1][MQ1])sm1; -/// double(*DQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm0; -/// double(*QQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm1; -/// double(*QQD)[MQ1][MD1] = (double(*)[MQ1][MD1])sm0; -/// double(*QDD)[MD1][MD1] = (double(*)[MD1][MD1])sm1; +/// constexpr int MDQ = (MQ1 > ? MQ1 : MD1; +/// double sBG[MQ1*MD1]; +/// double (*B)[MD1] = (double (*)[MD1]) sBG; +/// double (*G)[MD1] = (double (*)[MD1]) sBG; +/// double (*Bt)[MQ1] = (double (*)[MQ1]) sBG; +/// double (*Gt)[MQ1] = (double (*)[MQ1]) sBG; +/// double sm0[3][MDQ*MDQ*MDQ]; +/// double sm1[3][MDQ*MDQ*MDQ]; +/// double (*X)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2); +/// double (*DDQ0)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+0); +/// double (*DDQ1)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+1); +/// double (*DQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+0); +/// double (*DQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+1); +/// double (*DQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+2); +/// double (*QQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+0); +/// double (*QQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+1); +/// double (*QQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+2); +/// double (*QQD0)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+0); +/// double (*QQD1)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+1); +/// double (*QQD2)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+2); +/// double (*QDD0)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+0); +/// double (*QDD1)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+1); +/// double (*QDD2)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2); /// /// for(int dy=0; dy MD1) ? MQ1 : MD1; \ - double sDQ[MQ1 * MD1]; \ - double(*Bsmem)[MD1] = (double(*)[MD1])sDQ; \ - double(*Btsmem)[MQ1] = (double(*)[MQ1])sDQ; \ - double sm0[MDQ * MDQ * MDQ]; \ - double sm1[MDQ * MDQ * MDQ]; \ - double(*Xsmem)[MD1][MD1] = (double(*)[MD1][MD1])sm0; \ - double(*DDQ)[MD1][MQ1] = (double(*)[MD1][MQ1])sm1; \ - double(*DQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm0; \ - double(*QQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm1; \ - double(*QQD)[MQ1][MD1] = (double(*)[MQ1][MD1])sm0; \ - double(*QDD)[MD1][MD1] = (double(*)[MD1][MD1])sm1; - -#define DIFFUSION3DPA_0_GPU \ - constexpr int MQ1 = Q1D; \ - constexpr int MD1 = D1D; \ - constexpr int MDQ = (MQ1 > MD1) ? 
MQ1 : MD1; \ - RAJA_TEAM_SHARED double sDQ[MQ1 * MD1]; \ - double(*Bsmem)[MD1] = (double(*)[MD1])sDQ; \ - double(*Btsmem)[MQ1] = (double(*)[MQ1])sDQ; \ - RAJA_TEAM_SHARED double sm0[MDQ * MDQ * MDQ]; \ - RAJA_TEAM_SHARED double sm1[MDQ * MDQ * MDQ]; \ - double(*Xsmem)[MD1][MD1] = (double(*)[MD1][MD1])sm0; \ - double(*DDQ)[MD1][MQ1] = (double(*)[MD1][MQ1])sm1; \ - double(*DQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm0; \ - double(*QQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm1; \ - double(*QQD)[MQ1][MD1] = (double(*)[MQ1][MD1])sm0; \ - double(*QDD)[MD1][MD1] = (double(*)[MD1][MD1])sm1; - -#define DIFFUSION3DPA_1 \ - RAJA_UNROLL(MD1) \ -for (int dz = 0; dz< D1D; ++dz) { \ -Xsmem[dz][dy][dx] = X_(dx, dy, dz, e); \ -} - -#define DIFFUSION3DPA_2 \ - Bsmem[dx][dy] = B_(dx, dy); - -// 2 * D1D * D1D * D1D * Q1D -#define DIFFUSION3DPA_3 \ - double u[D1D]; \ -RAJA_UNROLL(MD1) \ -for (int dz = 0; dz < D1D; dz++) { \ -u[dz] = 0; \ -} \ -RAJA_UNROLL(MD1) \ -for (int dx = 0; dx < D1D; ++dx) { \ -RAJA_UNROLL(MD1) \ -for (int dz = 0; dz < D1D; ++dz) { \ -u[dz] += Xsmem[dz][dy][dx] * Bsmem[qx][dx]; \ -} \ -} \ -RAJA_UNROLL(MD1) \ -for (int dz = 0; dz < D1D; ++dz) { \ -DDQ[dz][dy][qx] = u[dz]; \ -} - -//2 * D1D * D1D * Q1D * Q1D -#define DIFFUSION3DPA_4 \ - double u[D1D]; \ - RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < D1D; dz++) { \ - u[dz] = 0; \ - } \ - RAJA_UNROLL(MD1) \ - for (int dy = 0; dy < D1D; ++dy) { \ - RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < D1D; dz++) { \ - u[dz] += DDQ[dz][dy][qx] * Bsmem[qy][dy]; \ - } \ - } \ - RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < D1D; dz++) { \ - DQQ[dz][qy][qx] = u[dz]; \ - } - -//2 * D1D * Q1D * Q1D * Q1D + Q1D * Q1D * Q1D -#define DIFFUSION3DPA_5 \ - double u[Q1D]; \ - RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < Q1D; qz++) { \ - u[qz] = 0; \ - } \ - RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < D1D; ++dz) { \ - RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < Q1D; qz++) { \ - u[qz] += DQQ[dz][qy][qx] * Bsmem[qz][dz]; \ - } \ - } \ - RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < Q1D; qz++) { \ - QQQ[qz][qy][qx] = u[qz] * D_(qx, qy, qz, e); \ - } - -#define DIFFUSION3DPA_6 \ - Btsmem[d][q] = Bt_(q, d); - -//2 * Q1D * Q1D * Q1D * D1D -#define DIFFUSION3DPA_7 \ - double u[Q1D]; \ -RAJA_UNROLL(MQ1) \ -for (int qz = 0; qz < Q1D; ++qz) { \ - u[qz] = 0; \ - } \ -RAJA_UNROLL(MQ1) \ -for (int qx = 0; qx < Q1D; ++qx) { \ - RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < Q1D; ++qz) { \ - u[qz] += QQQ[qz][qy][qx] * Btsmem[dx][qx]; \ - } \ - } \ -RAJA_UNROLL(MQ1) \ -for (int qz = 0; qz < Q1D; ++qz) { \ - QQD[qz][qy][dx] = u[qz]; \ - } - -// 2 * Q1D * Q1D * D1D * D1D -#define DIFFUSION3DPA_8 \ - double u[Q1D]; \ - RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < Q1D; ++qz) { \ - u[qz] = 0; \ - } \ - RAJA_UNROLL(MQ1) \ - for (int qy = 0; qy < Q1D; ++qy) { \ - RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < Q1D; ++qz) { \ - u[qz] += QQD[qz][qy][dx] * Btsmem[dy][qy]; \ - } \ - } \ - RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < Q1D; ++qz) { \ - QDD[qz][dy][dx] = u[qz]; \ - } - -//2 * Q1D * D1D * D1D * D1D + D1D * D1D * D1D -#define DIFFUSION3DPA_9 \ - double u[D1D]; \ - RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < D1D; ++dz) { \ - u[dz] = 0; \ - } \ - RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < Q1D; ++qz) { \ - RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < D1D; ++dz) { \ - u[dz] += QDD[qz][dy][dx] * Btsmem[dz][qz]; \ - } \ - } \ - RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < D1D; ++dz) { \ - Y_(dx, dy, dz, e) += u[dz]; \ - } - +#define DIFFUSION3DPA_0_GPU +#define DIFFUSION3DPA_0_CPU +#define DIFFUSION3DPA_1 +#define 
DIFFUSION3DPA_2 +#define DIFFUSION3DPA_3 +#define DIFFUSION3DPA_4 +#define DIFFUSION3DPA_5 +#define DIFFUSION3DPA_6 +#define DIFFUSION3DPA_7 +#define DIFFUSION3DPA_8 +#define DIFFUSION3DPA_9 #if defined(RAJA_ENABLE_CUDA) using m3d_device_launch = RAJA::expt::cuda_launch_t; From 32198bada12cc149729535d727cddf32d43b6372 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 29 Sep 2021 10:36:11 -0700 Subject: [PATCH 117/392] funcational seq version --- src/apps/DIFFUSION3DPA-Cuda.cpp | 190 +------------------ src/apps/DIFFUSION3DPA-OMP.cpp | 165 ----------------- src/apps/DIFFUSION3DPA-Seq.cpp | 249 +++++++++++-------------- src/apps/DIFFUSION3DPA.cpp | 9 +- src/apps/DIFFUSION3DPA.hpp | 317 +++++++++++++++++++++++++++++--- 5 files changed, 416 insertions(+), 514 deletions(-) diff --git a/src/apps/DIFFUSION3DPA-Cuda.cpp b/src/apps/DIFFUSION3DPA-Cuda.cpp index 11b97885d..7ff0151d1 100644 --- a/src/apps/DIFFUSION3DPA-Cuda.cpp +++ b/src/apps/DIFFUSION3DPA-Cuda.cpp @@ -20,19 +20,19 @@ namespace rajaperf { namespace apps { #define DIFFUSION3DPA_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(B, m_B, Q1D *D1D); \ - allocAndInitCudaDeviceData(Bt, m_Bt, Q1D *D1D); \ + // allocAndInitCudaDeviceData(B, m_B, Q1D *D1D); \ + // allocAndInitCudaDeviceData(Bt, m_Bt, Q1D *D1D); \ allocAndInitCudaDeviceData(D, m_D, Q1D *Q1D *Q1D *m_NE); \ allocAndInitCudaDeviceData(X, m_X, D1D *D1D *D1D *m_NE); \ allocAndInitCudaDeviceData(Y, m_Y, D1D *D1D *D1D *m_NE); #define DIFFUSION3DPA_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_Y, Y, D1D *D1D *D1D *m_NE); \ - deallocCudaDeviceData(B); \ - deallocCudaDeviceData(Bt); \ - deallocCudaDeviceData(D); \ - deallocCudaDeviceData(X); \ - deallocCudaDeviceData(Y); + // getCudaDeviceData(m_Y, Y, D1D *D1D *D1D *m_NE); \ + // deallocCudaDeviceData(B); \ + // deallocCudaDeviceData(Bt); \ + // deallocCudaDeviceData(D); \ + // deallocCudaDeviceData(X); \ + //deallocCudaDeviceData(Y); //#define USE_RAJA_UNROLL #define RAJA_DIRECT_PRAGMA(X) _Pragma(#X) @@ -49,62 +49,6 @@ __global__ void Diffusion3DPA(Index_type NE, const Real_ptr B, const Real_ptr Bt const int e = blockIdx.x; - DIFFUSION3DPA_0_GPU - - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D){ - DIFFUSION3DPA_1 - } - FOREACH_THREAD(dx, x, Q1D) { - DIFFUSION3DPA_2 - } - } - __syncthreads(); - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(qx, x, Q1D) { - DIFFUSION3DPA_3 - } - } - __syncthreads(); - FOREACH_THREAD(qy, y, Q1D) { - FOREACH_THREAD(qx, x, Q1D) { - DIFFUSION3DPA_4 - } - } - __syncthreads(); - FOREACH_THREAD(qy, y, Q1D) { - FOREACH_THREAD(qx, x, Q1D) { - DIFFUSION3DPA_5 - } - } - - __syncthreads(); - FOREACH_THREAD(d, y, D1D) { - FOREACH_THREAD(q, x, Q1D) { - DIFFUSION3DPA_6 - } - } - - __syncthreads(); - FOREACH_THREAD(qy, y, Q1D) { - FOREACH_THREAD(dx, x, D1D) { - DIFFUSION3DPA_7 - } - } - __syncthreads(); - - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D) { - DIFFUSION3DPA_8 - } - } - - __syncthreads(); - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D) { - DIFFUSION3DPA_9 - } - } } void DIFFUSION3DPA::runCudaVariant(VariantID vid) { @@ -123,7 +67,7 @@ void DIFFUSION3DPA::runCudaVariant(VariantID vid) { dim3 nthreads_per_block(Q1D, Q1D, 1); - Diffusion3DPA<<>>(NE, B, Bt, D, X, Y); + // Diffusion3DPA<<>>(NE, B, Bt, D, X, Y); cudaErrchk( cudaGetLastError() ); } @@ -157,122 +101,6 @@ void DIFFUSION3DPA::runCudaVariant(VariantID vid) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::expt::launch( - RAJA::expt::DEVICE, - RAJA::expt::Grid(RAJA::expt::Teams(NE), - 
RAJA::expt::Threads(Q1D, Q1D, 1)), - [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NE), - [&](int e) { - - DIFFUSION3DPA_0_GPU - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), - [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), - [&](int dx) { - DIFFUSION3DPA_1 - } - ); // RAJA::expt::loop - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), - [&](int dx) { - DIFFUSION3DPA_2 - } - ); // RAJA::expt::loop - } // lambda (dy) - ); // RAJA::expt::loop - - ctx.teamSync(); - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), - [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), - [&](int qx) { - DIFFUSION3DPA_3 - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop - - ctx.teamSync(); - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), - [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), - [&](int qx) { - DIFFUSION3DPA_4 - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop - - ctx.teamSync(); - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), - [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), - [&](int qx) { - DIFFUSION3DPA_5 - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop - - ctx.teamSync(); - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), - [&](int d) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), - [&](int q) { - DIFFUSION3DPA_6 - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop - - ctx.teamSync(); - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), - [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), - [&](int dx) { - DIFFUSION3DPA_7 - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop - - ctx.teamSync(); - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), - [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), - [&](int dx) { - DIFFUSION3DPA_8 - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop - - ctx.teamSync(); - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), - [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), - [&](int dx) { - DIFFUSION3DPA_9 - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop - - } // lambda (e) - ); // RAJA::expt::loop - - } // outer lambda (ctx) - ); // RAJA::expt::launch } // loop over kernel reps stopTimer(); diff --git a/src/apps/DIFFUSION3DPA-OMP.cpp b/src/apps/DIFFUSION3DPA-OMP.cpp index e4b1b7a14..d9100c18d 100644 --- a/src/apps/DIFFUSION3DPA-OMP.cpp +++ b/src/apps/DIFFUSION3DPA-OMP.cpp @@ -41,58 +41,6 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { #pragma omp parallel for for (int e = 0; e < NE; ++e) { - DIFFUSION3DPA_0_CPU - - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D){ - DIFFUSION3DPA_1 - } - FOREACH_THREAD(dx, x, Q1D) { - DIFFUSION3DPA_2 - } - } - - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(qx, x, Q1D) { - DIFFUSION3DPA_3 - } - } - - FOREACH_THREAD(qy, y, Q1D) { - FOREACH_THREAD(qx, x, Q1D) { - DIFFUSION3DPA_4 - } - } - - FOREACH_THREAD(qy, y, Q1D) { - FOREACH_THREAD(qx, x, Q1D) { - DIFFUSION3DPA_5 - } - } - - FOREACH_THREAD(d, y, D1D) { - FOREACH_THREAD(q, x, Q1D) { - DIFFUSION3DPA_6 - } - } - - FOREACH_THREAD(qy, y, Q1D) { - FOREACH_THREAD(dx, x, D1D) { - DIFFUSION3DPA_7 - } - } - - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D) { - DIFFUSION3DPA_8 - } - } - - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D) { - DIFFUSION3DPA_9 - } - } } // element loop } @@ -132,120 +80,7 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { for (RepIndex_type irep = 0; irep < run_reps; 
++irep) { //Grid is empty as the host does not need a compute grid to be specified - RAJA::expt::launch( - RAJA::expt::HOST, RAJA::expt::Grid(), - [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NE), - [&](int e) { - - DIFFUSION3DPA_0_CPU - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), - [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), - [&](int dx) { - DIFFUSION3DPA_1 - } - ); // RAJA::expt::loop - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), - [&](int dx) { - DIFFUSION3DPA_2 - } - ); // RAJA::expt::loop - } // lambda (dy) - ); // RAJA::expt::loop - - ctx.teamSync(); - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), - [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), - [&](int qx) { - DIFFUSION3DPA_3 - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop - - ctx.teamSync(); - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), - [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), - [&](int qx) { - DIFFUSION3DPA_4 - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop - - ctx.teamSync(); - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), - [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), - [&](int qx) { - DIFFUSION3DPA_5 - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop - - ctx.teamSync(); - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), - [&](int d) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), - [&](int q) { - DIFFUSION3DPA_6 - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop - - ctx.teamSync(); - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), - [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), - [&](int dx) { - DIFFUSION3DPA_7 - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop - - ctx.teamSync(); - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), - [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), - [&](int dx) { - DIFFUSION3DPA_8 - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop - - ctx.teamSync(); - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), - [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), - [&](int dx) { - DIFFUSION3DPA_9 - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop - - } // lambda (e) - ); // RAJA::expt::loop - } // outer lambda (ctx) - ); // // RAJA::expt::launch } // loop over kernel reps stopTimer(); diff --git a/src/apps/DIFFUSION3DPA-Seq.cpp b/src/apps/DIFFUSION3DPA-Seq.cpp index b72713b4b..7881d51cd 100644 --- a/src/apps/DIFFUSION3DPA-Seq.cpp +++ b/src/apps/DIFFUSION3DPA-Seq.cpp @@ -24,6 +24,9 @@ namespace apps { #endif #define FOREACH_THREAD(i, k, N) for (int i = 0; i < N; i++) +#define MFEM_SHARED +#define MFEM_SYNC_THREAD + void DIFFUSION3DPA::runSeqVariant(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -38,56 +41,57 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { for (int e = 0; e < NE; ++e) { - DIFFUSION3DPA_0_CPU + DIFFUSION3DPA_0_CPU; FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D){ - DIFFUSION3DPA_1 + FOREACH_THREAD(dx, x, D1D) { + DIFFUSION3DPA_1; } - FOREACH_THREAD(dx, x, Q1D) { - DIFFUSION3DPA_2 + FOREACH_THREAD(qx, x, Q1D) { + DIFFUSION3DPA_2; } } + MFEM_SYNC_THREAD; FOREACH_THREAD(dy, y, D1D) { FOREACH_THREAD(qx, x, Q1D) { - DIFFUSION3DPA_3 + DIFFUSION3DPA_3; } } - + MFEM_SYNC_THREAD; FOREACH_THREAD(qy, y, Q1D) { FOREACH_THREAD(qx, x, Q1D) { - DIFFUSION3DPA_4 + DIFFUSION3DPA_4; } } - + MFEM_SYNC_THREAD; FOREACH_THREAD(qy, y, Q1D) { FOREACH_THREAD(qx, x, Q1D) { - 
DIFFUSION3DPA_5 + DIFFUSION3DPA_5; } } - + MFEM_SYNC_THREAD; FOREACH_THREAD(d, y, D1D) { FOREACH_THREAD(q, x, Q1D) { - DIFFUSION3DPA_6 + DIFFUSION3DPA_6; } } - + MFEM_SYNC_THREAD; FOREACH_THREAD(qy, y, Q1D) { FOREACH_THREAD(dx, x, D1D) { - DIFFUSION3DPA_7 + DIFFUSION3DPA_7; } } - + MFEM_SYNC_THREAD; FOREACH_THREAD(dy, y, D1D) { FOREACH_THREAD(dx, x, D1D) { - DIFFUSION3DPA_8 + DIFFUSION3DPA_8; } } - + MFEM_SYNC_THREAD; FOREACH_THREAD(dy, y, D1D) { FOREACH_THREAD(dx, x, D1D) { - DIFFUSION3DPA_9 + DIFFUSION3DPA_9; } } @@ -101,150 +105,110 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { #if defined(RUN_RAJA_SEQ) case RAJA_Seq: { - //Currently Teams requires two policies if compiled with a device + // Currently Teams requires two policies if compiled with a device using launch_policy = RAJA::expt::LaunchPolicy; using outer_x = RAJA::expt::LoopPolicy; using inner_x = RAJA::expt::LoopPolicy; + >; using inner_y = RAJA::expt::LoopPolicy; + >; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::expt::launch( - RAJA::expt::HOST, RAJA::expt::Grid(), - [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NE), - [&](int e) { - - DIFFUSION3DPA_0_CPU - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), - [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), - [&](int dx) { - DIFFUSION3DPA_1 - } - ); // RAJA::expt::loop - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), - [&](int dx) { - DIFFUSION3DPA_2 - } - ); // RAJA::expt::loop - } // lambda (dy) - ); // RAJA::expt::loop - - ctx.teamSync(); - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), - [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), - [&](int qx) { - DIFFUSION3DPA_3 - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop - - ctx.teamSync(); - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), - [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), - [&](int qx) { - DIFFUSION3DPA_4 - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop - - ctx.teamSync(); - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), - [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), - [&](int qx) { - DIFFUSION3DPA_5 - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop - - ctx.teamSync(); - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), - [&](int d) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), - [&](int q) { - DIFFUSION3DPA_6 - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop - - ctx.teamSync(); - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), - [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), - [&](int dx) { - DIFFUSION3DPA_7 - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop - - ctx.teamSync(); - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), - [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), - [&](int dx) { - DIFFUSION3DPA_8 - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop - - ctx.teamSync(); - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), - [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), - [&](int dx) { - DIFFUSION3DPA_9 - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop - - } // lambda (e) - ); // RAJA::expt::loop - - } // outer lambda (ctx) - ); // RAJA::expt::launch - - } // loop over kernel reps + RAJA::expt::launch + (RAJA::expt::HOST, RAJA::expt::Grid(), + [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NE), [&](int e) { + + 
DIFFUSION3DPA_0_CPU; + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), [&](int dx) { + DIFFUSION3DPA_1; + }); + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), [&](int qx) { + DIFFUSION3DPA_2; + }); + }); + + ctx.teamSync(); + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), [&](int qx) { + DIFFUSION3DPA_3; + }); + }); + + ctx.teamSync(); + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), [&](int qx) { + DIFFUSION3DPA_4; + }); + }); + + ctx.teamSync(); + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), [&](int qx) { + DIFFUSION3DPA_5; + }); + }); + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), [&](int d) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), [&](int q) { + DIFFUSION3DPA_6; + }); + }); + + ctx.teamSync(); + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), [&](int dx) { + DIFFUSION3DPA_7; + }); + }); + + + ctx.teamSync(); + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), [&](int dx) { + DIFFUSION3DPA_8; + }); + }); + + ctx.teamSync(); + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), [&](int dx) { + DIFFUSION3DPA_9; + }); + }); + + }); + }); // RAJA::expt::launch + + } // loop over kernel reps stopTimer(); return; @@ -252,7 +216,8 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { #endif // RUN_RAJA_SEQ default: - std::cout << "\n DIFFUSION3DPA : Unknown Seq variant id = " << vid << std::endl; + std::cout << "\n DIFFUSION3DPA : Unknown Seq variant id = " << vid + << std::endl; } } diff --git a/src/apps/DIFFUSION3DPA.cpp b/src/apps/DIFFUSION3DPA.cpp index 16889aa6b..03c8cf4ef 100644 --- a/src/apps/DIFFUSION3DPA.cpp +++ b/src/apps/DIFFUSION3DPA.cpp @@ -35,8 +35,7 @@ DIFFUSION3DPA::DIFFUSION3DPA(const RunParams& params) setItsPerRep(getActualProblemSize()); setKernelsPerRep(1); - setBytesPerRep( Q1D*D1D*sizeof(Real_type) + - Q1D*D1D*sizeof(Real_type) + + setBytesPerRep( 4*Q1D*D1D*sizeof(Real_type) + Q1D*Q1D*Q1D*m_NE*sizeof(Real_type) + D1D*D1D*D1D*m_NE*sizeof(Real_type) + D1D*D1D*D1D*m_NE*sizeof(Real_type) ); @@ -71,8 +70,8 @@ void DIFFUSION3DPA::setUp(VariantID vid) { allocAndInitDataConst(m_B, int(Q1D*D1D), Real_type(1.0), vid); - allocAndInitDataConst(m_Bt,int(Q1D*D1D), Real_type(1.0), vid); - allocAndInitDataConst(m_D, int(Q1D*Q1D*Q1D*m_NE), Real_type(1.0), vid); + allocAndInitDataConst(m_G, int(Q1D*D1D), Real_type(1.0), vid); + allocAndInitDataConst(m_D, int(Q1D*Q1D*Q1D*SYM*m_NE), Real_type(1.0), vid); allocAndInitDataConst(m_X, int(D1D*D1D*D1D*m_NE), Real_type(1.0), vid); allocAndInitDataConst(m_Y, int(D1D*D1D*D1D*m_NE), Real_type(0.0), vid); } @@ -87,7 +86,7 @@ void DIFFUSION3DPA::tearDown(VariantID vid) (void) vid; deallocData(m_B); - deallocData(m_Bt); + deallocData(m_G); deallocData(m_D); deallocData(m_X); deallocData(m_Y); diff --git a/src/apps/DIFFUSION3DPA.hpp b/src/apps/DIFFUSION3DPA.hpp index 8442bb1a5..43c448de0 100644 --- a/src/apps/DIFFUSION3DPA.hpp +++ b/src/apps/DIFFUSION3DPA.hpp @@ -284,40 +284,313 @@ #define RAJAPerf_Apps_DIFFUSION3DPA_HPP #define DIFFUSION3DPA_DATA_SETUP \ -Real_ptr B = m_B; \ -Real_ptr Bt = m_Bt; \ +Real_ptr Basis = m_B; \ +Real_ptr dBasis = 
m_G; \ Real_ptr D = m_D; \ Real_ptr X = m_X; \ Real_ptr Y = m_Y; \ -Index_type NE = m_NE; +Index_type NE = m_NE; \ +const bool symmetric = true; #include "common/KernelBase.hpp" #include "RAJA/RAJA.hpp" //Number of Dofs/Qpts in 1D -#define D1D 4 -#define Q1D 5 -#define B_(x, y) B[x + Q1D * y] -#define Bt_(x, y) Bt[x + D1D * y] +#define D1D 3 +#define Q1D 4 +#define SYM 6 +#define b(x, y) Basis[x + Q1D * y] +#define g(x, y) dBasis[x + Q1D * y] #define X_(dx, dy, dz, e) \ X[dx + D1D * dy + D1D * D1D * dz + D1D * D1D * D1D * e] #define Y_(dx, dy, dz, e) \ Y[dx + D1D * dy + D1D * D1D * dz + D1D * D1D * D1D * e] -#define D_(qx, qy, qz, e) \ - D[qx + Q1D * qy + Q1D * Q1D * qz + Q1D * Q1D * Q1D * e] - -#define DIFFUSION3DPA_0_GPU -#define DIFFUSION3DPA_0_CPU -#define DIFFUSION3DPA_1 -#define DIFFUSION3DPA_2 -#define DIFFUSION3DPA_3 -#define DIFFUSION3DPA_4 -#define DIFFUSION3DPA_5 -#define DIFFUSION3DPA_6 -#define DIFFUSION3DPA_7 -#define DIFFUSION3DPA_8 -#define DIFFUSION3DPA_9 +#define d(qx, qy, qz, s, e) \ + D[qx + Q1D * qy + Q1D * Q1D * qz + Q1D * Q1D * Q1D * s + Q1D * Q1D * Q1D * SYM * e] + +// Half of B and G are stored in shared to get B, Bt, G and Gt. +// Indices computation for SmemPADiffusionApply3D. +static RAJA_HOST_DEVICE inline int qi(const int q, const int d, const int Q) +{ + return (q<=d) ? q : Q-1-q; +} + +static RAJA_HOST_DEVICE inline int dj(const int q, const int d, const int D) +{ + return (q<=d) ? d : D-1-d; +} + +static RAJA_HOST_DEVICE inline int qk(const int q, const int d, const int Q) +{ + return (q<=d) ? Q-1-q : q; +} + +static RAJA_HOST_DEVICE inline int dl(const int q, const int d, const int D) +{ + return (q<=d) ? D-1-d : d; +} + +static RAJA_HOST_DEVICE inline double sign(const int q, const int d) +{ + return (q<=d) ? -1.0 : 1.0; +} + + +#define DIFFUSION3DPA_0_GPU \ + constexpr int MQ1 = Q1D; \ + constexpr int MD1 = D1D; \ + constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1; \ + RAJA_TEAM_SHARED double sBG[MQ1*MD1]; \ + double (*B)[MD1] = (double (*)[MD1]) sBG; \ + double (*G)[MD1] = (double (*)[MD1]) sBG; \ + double (*Bt)[MQ1] = (double (*)[MQ1]) sBG; \ + double (*Gt)[MQ1] = (double (*)[MQ1]) sBG; \ + RAJA_TEAM_SHARED double sm0[3][MDQ*MDQ*MDQ]; \ + RAJA_TEAM_SHARED double sm1[3][MDQ*MDQ*MDQ]; \ + double (*s_X)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2); \ + double (*DDQ0)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+0); \ + double (*DDQ1)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+1); \ + double (*DQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+0); \ + double (*DQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+1); \ + double (*DQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+2); \ + double (*QQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+0); \ + double (*QQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+1); \ + double (*QQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+2); \ + double (*QQD0)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+0); \ + double (*QQD1)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+1); \ + double (*QQD2)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+2); \ + double (*QDD0)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+0); \ + double (*QDD1)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+1); \ + double (*QDD2)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2); + + +#define DIFFUSION3DPA_0_CPU \ + constexpr int MQ1 = Q1D; \ + constexpr int MD1 = D1D; \ + constexpr int MDQ = (MQ1 > MD1) ? 
MQ1 : MD1; \ + double sBG[MQ1*MD1]; \ + double (*B)[MD1] = (double (*)[MD1]) sBG; \ + double (*G)[MD1] = (double (*)[MD1]) sBG; \ + double (*Bt)[MQ1] = (double (*)[MQ1]) sBG; \ + double (*Gt)[MQ1] = (double (*)[MQ1]) sBG; \ + double sm0[3][MDQ*MDQ*MDQ]; \ + double sm1[3][MDQ*MDQ*MDQ]; \ + double (*s_X)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2); \ + double (*DDQ0)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+0); \ + double (*DDQ1)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+1); \ + double (*DQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+0); \ + double (*DQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+1); \ + double (*DQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+2); \ + double (*QQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+0); \ + double (*QQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+1); \ + double (*QQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+2); \ + double (*QQD0)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+0); \ + double (*QQD1)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+1); \ + double (*QQD2)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+2); \ + double (*QDD0)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+0); \ + double (*QDD1)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+1); \ + double (*QDD2)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2); + +#define DIFFUSION3DPA_1 \ + RAJA_UNROLL(MD1) \ + for (int dz = 0; dz < D1D; ++dz) \ + { \ + s_X[dz][dy][dx] = X_(dx,dy,dz,e); \ + } + +#define DIFFUSION3DPA_2 \ + const int i = qi(qx,dy,Q1D); \ + const int j = dj(qx,dy,D1D); \ + const int k = qk(qx,dy,Q1D); \ + const int l = dl(qx,dy,D1D); \ + B[i][j] = b(qx,dy); \ + G[k][l] = g(qx,dy) * sign(qx,dy); + +#define DIFFUSION3DPA_3 \ + double u[D1D], v[D1D]; \ + RAJA_UNROLL(MD1) \ + for (int dz = 0; dz < D1D; dz++) { u[dz] = v[dz] = 0.0; } \ + RAJA_UNROLL(MD1) \ + for (int dx = 0; dx < D1D; ++dx) \ + { \ + const int i = qi(qx,dx,Q1D); \ + const int j = dj(qx,dx,D1D); \ + const int k = qk(qx,dx,Q1D); \ + const int l = dl(qx,dx,D1D); \ + const double s = sign(qx,dx); \ + RAJA_UNROLL(MD1) \ + for (int dz = 0; dz < D1D; ++dz) \ + { \ + const double coords = s_X[dz][dy][dx]; \ + u[dz] += coords * B[i][j]; \ + v[dz] += coords * G[k][l] * s; \ + } \ + } \ + RAJA_UNROLL(MD1) \ + for (int dz = 0; dz < D1D; ++dz) \ + { \ + DDQ0[dz][dy][qx] = u[dz]; \ + DDQ1[dz][dy][qx] = v[dz]; \ + } + +#define DIFFUSION3DPA_4 \ + double u[D1D], v[D1D], w[D1D]; \ + RAJA_UNROLL(MD1) \ + for (int dz = 0; dz < D1D; dz++) { u[dz] = v[dz] = w[dz] = 0.0; } \ + RAJA_UNROLL(MD1) \ + for (int dy = 0; dy < D1D; ++dy) \ + { \ + const int i = qi(qy,dy,Q1D); \ + const int j = dj(qy,dy,D1D); \ + const int k = qk(qy,dy,Q1D); \ + const int l = dl(qy,dy,D1D); \ + const double s = sign(qy,dy); \ + RAJA_UNROLL(MD1) \ + for (int dz = 0; dz < D1D; dz++) \ + { \ + u[dz] += DDQ1[dz][dy][qx] * B[i][j]; \ + v[dz] += DDQ0[dz][dy][qx] * G[k][l] * s; \ + w[dz] += DDQ0[dz][dy][qx] * B[i][j]; \ + } \ + } \ + RAJA_UNROLL(MD1) \ + for (int dz = 0; dz < D1D; dz++) \ + { \ + DQQ0[dz][qy][qx] = u[dz]; \ + DQQ1[dz][qy][qx] = v[dz]; \ + DQQ2[dz][qy][qx] = w[dz]; \ + } + +#define DIFFUSION3DPA_5 \ + double u[Q1D], v[Q1D], w[Q1D]; \ + RAJA_UNROLL(MQ1) \ + for (int qz = 0; qz < Q1D; qz++) { u[qz] = v[qz] = w[qz] = 0.0; } \ + RAJA_UNROLL(MD1) \ + for (int dz = 0; dz < D1D; ++dz) \ + { \ + RAJA_UNROLL(MQ1) \ + for (int qz = 0; qz < Q1D; qz++) \ + { \ + const int i = qi(qz,dz,Q1D); \ + const int j = dj(qz,dz,D1D); \ + const int k = qk(qz,dz,Q1D); \ + const int l = dl(qz,dz,D1D); \ + const double s = sign(qz,dz); \ + u[qz] += DQQ0[dz][qy][qx] * B[i][j]; \ + v[qz] += DQQ1[dz][qy][qx] * B[i][j]; \ + w[qz] 
+= DQQ2[dz][qy][qx] * G[k][l] * s; \ + } \ + } \ + RAJA_UNROLL(MQ1) \ + for (int qz = 0; qz < Q1D; qz++) \ + { \ + const double O11 = d(qx,qy,qz,0,e); \ + const double O12 = d(qx,qy,qz,1,e); \ + const double O13 = d(qx,qy,qz,2,e); \ + const double O21 = symmetric ? O12 : d(qx,qy,qz,3,e); \ + const double O22 = symmetric ? d(qx,qy,qz,3,e) : d(qx,qy,qz,4,e); \ + const double O23 = symmetric ? d(qx,qy,qz,4,e) : d(qx,qy,qz,5,e); \ + const double O31 = symmetric ? O13 : d(qx,qy,qz,6,e); \ + const double O32 = symmetric ? O23 : d(qx,qy,qz,7,e); \ + const double O33 = symmetric ? d(qx,qy,qz,5,e) : d(qx,qy,qz,8,e); \ + const double gX = u[qz]; \ + const double gY = v[qz]; \ + const double gZ = w[qz]; \ + QQQ0[qz][qy][qx] = (O11*gX) + (O12*gY) + (O13*gZ); \ + QQQ1[qz][qy][qx] = (O21*gX) + (O22*gY) + (O23*gZ); \ + QQQ2[qz][qy][qx] = (O31*gX) + (O32*gY) + (O33*gZ); \ + } + +#define DIFFUSION3DPA_6 \ + const int i = qi(q,d,Q1D); \ + const int j = dj(q,d,D1D); \ + const int k = qk(q,d,Q1D); \ + const int l = dl(q,d,D1D); \ + Bt[j][i] = b(q,d); \ + Gt[l][k] = g(q,d) * sign(q,d); + +#define DIFFUSION3DPA_7 \ + double u[Q1D], v[Q1D], w[Q1D]; \ + RAJA_UNROLL(MQ1) \ + for (int qz = 0; qz < Q1D; ++qz) { u[qz] = v[qz] = w[qz] = 0.0; } \ + RAJA_UNROLL(MQ1) \ + for (int qx = 0; qx < Q1D; ++qx) \ + { \ + const int i = qi(qx,dx,Q1D); \ + const int j = dj(qx,dx,D1D); \ + const int k = qk(qx,dx,Q1D); \ + const int l = dl(qx,dx,D1D); \ + const double s = sign(qx,dx); \ + RAJA_UNROLL(MQ1) \ + for (int qz = 0; qz < Q1D; ++qz) \ + { \ + u[qz] += QQQ0[qz][qy][qx] * Gt[l][k] * s; \ + v[qz] += QQQ1[qz][qy][qx] * Bt[j][i]; \ + w[qz] += QQQ2[qz][qy][qx] * Bt[j][i]; \ + } \ + } \ + RAJA_UNROLL(MQ1) \ + for (int qz = 0; qz < Q1D; ++qz) \ + { \ + QQD0[qz][qy][dx] = u[qz]; \ + QQD1[qz][qy][dx] = v[qz]; \ + QQD2[qz][qy][dx] = w[qz]; \ + } + +#define DIFFUSION3DPA_8 \ + double u[Q1D], v[Q1D], w[Q1D]; \ + RAJA_UNROLL(MQ1) \ + for (int qz = 0; qz < Q1D; ++qz) { u[qz] = v[qz] = w[qz] = 0.0; } \ + RAJA_UNROLL(MQ1) \ + for (int qy = 0; qy < Q1D; ++qy) \ + { \ + const int i = qi(qy,dy,Q1D); \ + const int j = dj(qy,dy,D1D); \ + const int k = qk(qy,dy,Q1D); \ + const int l = dl(qy,dy,D1D); \ + const double s = sign(qy,dy); \ + RAJA_UNROLL(MQ1) \ + for (int qz = 0; qz < Q1D; ++qz) \ + { \ + u[qz] += QQD0[qz][qy][dx] * Bt[j][i]; \ + v[qz] += QQD1[qz][qy][dx] * Gt[l][k] * s; \ + w[qz] += QQD2[qz][qy][dx] * Bt[j][i]; \ + } \ + } \ + RAJA_UNROLL(MQ1) \ + for (int qz = 0; qz < Q1D; ++qz) \ + { \ + QDD0[qz][dy][dx] = u[qz]; \ + QDD1[qz][dy][dx] = v[qz]; \ + QDD2[qz][dy][dx] = w[qz]; \ + } \ + +#define DIFFUSION3DPA_9 \ + double u[D1D], v[D1D], w[D1D]; \ + RAJA_UNROLL(MD1) \ + for (int dz = 0; dz < D1D; ++dz) { u[dz] = v[dz] = w[dz] = 0.0; } \ + RAJA_UNROLL(MQ1) \ + for (int qz = 0; qz < Q1D; ++qz) \ + { \ + RAJA_UNROLL(MD1) \ + for (int dz = 0; dz < D1D; ++dz) \ + { \ + const int i = qi(qz,dz,Q1D); \ + const int j = dj(qz,dz,D1D); \ + const int k = qk(qz,dz,Q1D); \ + const int l = dl(qz,dz,D1D); \ + const double s = sign(qz,dz); \ + u[dz] += QDD0[qz][dy][dx] * Bt[j][i]; \ + v[dz] += QDD1[qz][dy][dx] * Bt[j][i]; \ + w[dz] += QDD2[qz][dy][dx] * Gt[l][k] * s;\ + } \ + } \ + RAJA_UNROLL(MD1) \ + for (int dz = 0; dz < D1D; ++dz) \ + { \ + Y_(dx,dy,dz,e) += (u[dz] + v[dz] + w[dz]); \ + } #if defined(RAJA_ENABLE_CUDA) using m3d_device_launch = RAJA::expt::cuda_launch_t; @@ -362,6 +635,8 @@ class DIFFUSION3DPA : public KernelBase Real_ptr m_B; Real_ptr m_Bt; + Real_ptr m_G; + Real_ptr m_Gt; Real_ptr m_D; Real_ptr m_X; Real_ptr m_Y; From 
685c80553570403e4033785dc2ae08d69e7966fe Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 29 Sep 2021 11:00:36 -0700 Subject: [PATCH 118/392] formatting pass --- src/apps/DIFFUSION3DPA-Seq.cpp | 175 ++++++++++++++++++++------------- 1 file changed, 108 insertions(+), 67 deletions(-) diff --git a/src/apps/DIFFUSION3DPA-Seq.cpp b/src/apps/DIFFUSION3DPA-Seq.cpp index 7881d51cd..f683e54e0 100644 --- a/src/apps/DIFFUSION3DPA-Seq.cpp +++ b/src/apps/DIFFUSION3DPA-Seq.cpp @@ -137,77 +137,118 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::expt::launch - (RAJA::expt::HOST, RAJA::expt::Grid(), - [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + RAJA::expt::launch( + RAJA::expt::HOST, RAJA::expt::Grid(), + [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NE), [&](int e) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NE), + [&](int e) { DIFFUSION3DPA_0_CPU; - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), [&](int dx) { - DIFFUSION3DPA_1; - }); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), [&](int qx) { - DIFFUSION3DPA_2; - }); - }); - - ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), [&](int qx) { - DIFFUSION3DPA_3; - }); - }); - - ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), [&](int qx) { - DIFFUSION3DPA_4; - }); - }); - - ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), [&](int qx) { - DIFFUSION3DPA_5; - }); - }); - - ctx.teamSync(); - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), [&](int d) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), [&](int q) { - DIFFUSION3DPA_6; - }); - }); - - ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), [&](int dx) { - DIFFUSION3DPA_7; - }); - }); - - - ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), [&](int dx) { - DIFFUSION3DPA_8; - }); - }); - - ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), [&](int dx) { - DIFFUSION3DPA_9; - }); - }); - - }); - }); // RAJA::expt::launch - + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dx) { + DIFFUSION3DPA_1; + } + ); // RAJA::expt::loop + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int qx) { + DIFFUSION3DPA_2; + } + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int qx) { + DIFFUSION3DPA_3; + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int qx) { + DIFFUSION3DPA_4; + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int qx) { + 
DIFFUSION3DPA_5; + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int d) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int q) { + DIFFUSION3DPA_6; + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dx) { + DIFFUSION3DPA_7; + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dx) { + DIFFUSION3DPA_8; + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + [&](int dx) { + DIFFUSION3DPA_9; + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + } // lambda (e) + ); // RAJA::expt::loop + + } // outer lambda (ctx) + ); // RAJA::expt::launch } // loop over kernel reps stopTimer(); From da7251ea8d2738ebe69d2b85ca9ef1972c5ffeb2 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 29 Sep 2021 11:20:26 -0700 Subject: [PATCH 119/392] D1D->DPA_D1D + Q1D->DPA_Q1D --- src/apps/DIFFUSION3DPA-Cuda.cpp | 14 +- src/apps/DIFFUSION3DPA-Seq.cpp | 68 +++---- src/apps/DIFFUSION3DPA.cpp | 40 ++-- src/apps/DIFFUSION3DPA.hpp | 320 ++++++++++++++++---------------- 4 files changed, 221 insertions(+), 221 deletions(-) diff --git a/src/apps/DIFFUSION3DPA-Cuda.cpp b/src/apps/DIFFUSION3DPA-Cuda.cpp index 7ff0151d1..a249162e4 100644 --- a/src/apps/DIFFUSION3DPA-Cuda.cpp +++ b/src/apps/DIFFUSION3DPA-Cuda.cpp @@ -20,14 +20,14 @@ namespace rajaperf { namespace apps { #define DIFFUSION3DPA_DATA_SETUP_CUDA \ - // allocAndInitCudaDeviceData(B, m_B, Q1D *D1D); \ - // allocAndInitCudaDeviceData(Bt, m_Bt, Q1D *D1D); \ - allocAndInitCudaDeviceData(D, m_D, Q1D *Q1D *Q1D *m_NE); \ - allocAndInitCudaDeviceData(X, m_X, D1D *D1D *D1D *m_NE); \ - allocAndInitCudaDeviceData(Y, m_Y, D1D *D1D *D1D *m_NE); + // allocAndInitCudaDeviceData(B, m_B, DPA_Q1D *DPA_D1D); \ + // allocAndInitCudaDeviceData(Bt, m_Bt, DPA_Q1D *DPA_D1D); \ + allocAndInitCudaDeviceData(D, m_D, DPA_Q1D *DPA_Q1D *DPA_Q1D *m_NE); \ + allocAndInitCudaDeviceData(X, m_X, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); \ + allocAndInitCudaDeviceData(Y, m_Y, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); #define DIFFUSION3DPA_DATA_TEARDOWN_CUDA \ - // getCudaDeviceData(m_Y, Y, D1D *D1D *D1D *m_NE); \ + // getCudaDeviceData(m_Y, Y, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); \ // deallocCudaDeviceData(B); \ // deallocCudaDeviceData(Bt); \ // deallocCudaDeviceData(D); \ @@ -65,7 +65,7 @@ void DIFFUSION3DPA::runCudaVariant(VariantID vid) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - dim3 nthreads_per_block(Q1D, Q1D, 1); + dim3 nthreads_per_block(DPA_Q1D, DPA_Q1D, 1); // Diffusion3DPA<<>>(NE, B, Bt, D, X, Y); diff --git a/src/apps/DIFFUSION3DPA-Seq.cpp b/src/apps/DIFFUSION3DPA-Seq.cpp index f683e54e0..968dbe806 100644 --- a/src/apps/DIFFUSION3DPA-Seq.cpp +++ b/src/apps/DIFFUSION3DPA-Seq.cpp @@ -43,54 +43,54 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { DIFFUSION3DPA_0_CPU; - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D) { + FOREACH_THREAD(dy, y, DPA_D1D) { + FOREACH_THREAD(dx, x, DPA_D1D) { DIFFUSION3DPA_1; } - FOREACH_THREAD(qx, x, Q1D) { + FOREACH_THREAD(qx, x, DPA_Q1D) { DIFFUSION3DPA_2; } } 
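// Illustrative sketch (not part of the patch): FOREACH_THREAD is what lets the same
// kernel body serve both the host and the device variants of these kernels. The two
// expansions used in this suite (later consolidated into FEM_MACROS.hpp) are:
//
//   // host (Base_Seq / OpenMP): a plain serial loop over the k-th thread dimension
//   #define FOREACH_THREAD(i, k, N) for (int i = 0; i < N; i++)
//
//   // device (CUDA / HIP): a block-stride loop over threadIdx.k
//   #define FOREACH_THREAD(i, k, N) \
//     for (int i = threadIdx.k; i < N; i += blockDim.k)
//
// so FOREACH_THREAD(dx, x, DPA_D1D) { DIFFUSION3DPA_1; } runs serially on the CPU and
// cooperatively across a thread block on the GPU, where __syncthreads() supplies the
// barrier between phases that the serial version does not need.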
MFEM_SYNC_THREAD; - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(qx, x, Q1D) { + FOREACH_THREAD(dy, y, DPA_D1D) { + FOREACH_THREAD(qx, x, DPA_Q1D) { DIFFUSION3DPA_3; } } MFEM_SYNC_THREAD; - FOREACH_THREAD(qy, y, Q1D) { - FOREACH_THREAD(qx, x, Q1D) { + FOREACH_THREAD(qy, y, DPA_Q1D) { + FOREACH_THREAD(qx, x, DPA_Q1D) { DIFFUSION3DPA_4; } } MFEM_SYNC_THREAD; - FOREACH_THREAD(qy, y, Q1D) { - FOREACH_THREAD(qx, x, Q1D) { + FOREACH_THREAD(qy, y, DPA_Q1D) { + FOREACH_THREAD(qx, x, DPA_Q1D) { DIFFUSION3DPA_5; } } MFEM_SYNC_THREAD; - FOREACH_THREAD(d, y, D1D) { - FOREACH_THREAD(q, x, Q1D) { + FOREACH_THREAD(d, y, DPA_D1D) { + FOREACH_THREAD(q, x, DPA_Q1D) { DIFFUSION3DPA_6; } } MFEM_SYNC_THREAD; - FOREACH_THREAD(qy, y, Q1D) { - FOREACH_THREAD(dx, x, D1D) { + FOREACH_THREAD(qy, y, DPA_Q1D) { + FOREACH_THREAD(dx, x, DPA_D1D) { DIFFUSION3DPA_7; } } MFEM_SYNC_THREAD; - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D) { + FOREACH_THREAD(dy, y, DPA_D1D) { + FOREACH_THREAD(dx, x, DPA_D1D) { DIFFUSION3DPA_8; } } MFEM_SYNC_THREAD; - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D) { + FOREACH_THREAD(dy, y, DPA_D1D) { + FOREACH_THREAD(dx, x, DPA_D1D) { DIFFUSION3DPA_9; } } @@ -146,14 +146,14 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { DIFFUSION3DPA_0_CPU; - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dx) { DIFFUSION3DPA_1; } ); // RAJA::expt::loop - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), [&](int qx) { DIFFUSION3DPA_2; } @@ -163,9 +163,9 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), [&](int qx) { DIFFUSION3DPA_3; } @@ -175,9 +175,9 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), [&](int qx) { DIFFUSION3DPA_4; } @@ -187,9 +187,9 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), [&](int qx) { DIFFUSION3DPA_5; } @@ -199,9 +199,9 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int d) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), [&](int q) { DIFFUSION3DPA_6; } @@ -211,9 +211,9 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dx) { DIFFUSION3DPA_7; } @@ -223,9 +223,9 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - 
RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dx) { DIFFUSION3DPA_8; } @@ -234,9 +234,9 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { ); // RAJA::expt::loop ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dx) { DIFFUSION3DPA_9; } diff --git a/src/apps/DIFFUSION3DPA.cpp b/src/apps/DIFFUSION3DPA.cpp index 03c8cf4ef..6f046cc20 100644 --- a/src/apps/DIFFUSION3DPA.cpp +++ b/src/apps/DIFFUSION3DPA.cpp @@ -25,27 +25,27 @@ DIFFUSION3DPA::DIFFUSION3DPA(const RunParams& params) { m_NE_default = 8000; - setDefaultProblemSize(m_NE_default*Q1D*Q1D*Q1D); + setDefaultProblemSize(m_NE_default*DPA_Q1D*DPA_Q1D*DPA_Q1D); setDefaultReps(50); - m_NE = std::max(getTargetProblemSize()/(Q1D*Q1D*Q1D), Index_type(1)); + m_NE = std::max(getTargetProblemSize()/(DPA_Q1D*DPA_Q1D*DPA_Q1D), Index_type(1)); - setActualProblemSize( m_NE*Q1D*Q1D*Q1D ); + setActualProblemSize( m_NE*DPA_Q1D*DPA_Q1D*DPA_Q1D ); setItsPerRep(getActualProblemSize()); setKernelsPerRep(1); - setBytesPerRep( 4*Q1D*D1D*sizeof(Real_type) + - Q1D*Q1D*Q1D*m_NE*sizeof(Real_type) + - D1D*D1D*D1D*m_NE*sizeof(Real_type) + - D1D*D1D*D1D*m_NE*sizeof(Real_type) ); - - setFLOPsPerRep(m_NE * (2 * D1D * D1D * D1D * Q1D + - 2 * D1D * D1D * Q1D * Q1D + - 2 * D1D * Q1D * Q1D * Q1D + Q1D * Q1D * Q1D + - 2 * Q1D * Q1D * Q1D * D1D + - 2 * Q1D * Q1D * D1D * D1D + - 2 * Q1D * D1D * D1D * D1D + D1D * D1D * D1D)); + setBytesPerRep( 4*DPA_Q1D*DPA_D1D*sizeof(Real_type) + + DPA_Q1D*DPA_Q1D*DPA_Q1D*m_NE*sizeof(Real_type) + + DPA_D1D*DPA_D1D*DPA_D1D*m_NE*sizeof(Real_type) + + DPA_D1D*DPA_D1D*DPA_D1D*m_NE*sizeof(Real_type) ); + + setFLOPsPerRep(m_NE * (2 * DPA_D1D * DPA_D1D * DPA_D1D * DPA_Q1D + + 2 * DPA_D1D * DPA_D1D * DPA_Q1D * DPA_Q1D + + 2 * DPA_D1D * DPA_Q1D * DPA_Q1D * DPA_Q1D + DPA_Q1D * DPA_Q1D * DPA_Q1D + + 2 * DPA_Q1D * DPA_Q1D * DPA_Q1D * DPA_D1D + + 2 * DPA_Q1D * DPA_Q1D * DPA_D1D * DPA_D1D + + 2 * DPA_Q1D * DPA_D1D * DPA_D1D * DPA_D1D + DPA_D1D * DPA_D1D * DPA_D1D)); setUsesFeature(Teams); setVariantDefined( Base_Seq ); @@ -69,16 +69,16 @@ DIFFUSION3DPA::~DIFFUSION3DPA() void DIFFUSION3DPA::setUp(VariantID vid) { - allocAndInitDataConst(m_B, int(Q1D*D1D), Real_type(1.0), vid); - allocAndInitDataConst(m_G, int(Q1D*D1D), Real_type(1.0), vid); - allocAndInitDataConst(m_D, int(Q1D*Q1D*Q1D*SYM*m_NE), Real_type(1.0), vid); - allocAndInitDataConst(m_X, int(D1D*D1D*D1D*m_NE), Real_type(1.0), vid); - allocAndInitDataConst(m_Y, int(D1D*D1D*D1D*m_NE), Real_type(0.0), vid); + allocAndInitDataConst(m_B, int(DPA_Q1D*DPA_D1D), Real_type(1.0), vid); + allocAndInitDataConst(m_G, int(DPA_Q1D*DPA_D1D), Real_type(1.0), vid); + allocAndInitDataConst(m_D, int(DPA_Q1D*DPA_Q1D*DPA_Q1D*SYM*m_NE), Real_type(1.0), vid); + allocAndInitDataConst(m_X, int(DPA_D1D*DPA_D1D*DPA_D1D*m_NE), Real_type(1.0), vid); + allocAndInitDataConst(m_Y, int(DPA_D1D*DPA_D1D*DPA_D1D*m_NE), Real_type(0.0), vid); } void DIFFUSION3DPA::updateChecksum(VariantID vid) { - checksum[vid] += calcChecksum(m_Y, D1D*D1D*D1D*m_NE); + checksum[vid] += calcChecksum(m_Y, DPA_D1D*DPA_D1D*DPA_D1D*m_NE); } void DIFFUSION3DPA::tearDown(VariantID vid) diff --git a/src/apps/DIFFUSION3DPA.hpp b/src/apps/DIFFUSION3DPA.hpp index 
43c448de0..183593c04 100644 --- a/src/apps/DIFFUSION3DPA.hpp +++ b/src/apps/DIFFUSION3DPA.hpp @@ -15,8 +15,8 @@ /// /// for (int e = 0; e < NE; ++e) { /// -/// constexpr int MQ1 = Q1D; -/// constexpr int MD1 = D1D; +/// constexpr int MQ1 = DPA_Q1D; +/// constexpr int MD1 = DPA_D1D; /// constexpr int MDQ = (MQ1 > ? MQ1 : MD1; /// double sBG[MQ1*MD1]; /// double (*B)[MD1] = (double (*)[MD1]) sBG; @@ -41,40 +41,40 @@ /// double (*QDD1)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+1); /// double (*QDD2)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2); /// -/// for(int dy=0; dy MD1) ? MQ1 : MD1; \ RAJA_TEAM_SHARED double sBG[MQ1*MD1]; \ double (*B)[MD1] = (double (*)[MD1]) sBG; \ @@ -366,8 +366,8 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) #define DIFFUSION3DPA_0_CPU \ - constexpr int MQ1 = Q1D; \ - constexpr int MD1 = D1D; \ + constexpr int MQ1 = DPA_Q1D; \ + constexpr int MD1 = DPA_D1D; \ constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1; \ double sBG[MQ1*MD1]; \ double (*B)[MD1] = (double (*)[MD1]) sBG; \ @@ -394,33 +394,33 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) #define DIFFUSION3DPA_1 \ RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < D1D; ++dz) \ + for (int dz = 0; dz < DPA_D1D; ++dz) \ { \ - s_X[dz][dy][dx] = X_(dx,dy,dz,e); \ + s_X[dz][dy][dx] = dpaX_(dx,dy,dz,e); \ } #define DIFFUSION3DPA_2 \ - const int i = qi(qx,dy,Q1D); \ - const int j = dj(qx,dy,D1D); \ - const int k = qk(qx,dy,Q1D); \ - const int l = dl(qx,dy,D1D); \ + const int i = qi(qx,dy,DPA_Q1D); \ + const int j = dj(qx,dy,DPA_D1D); \ + const int k = qk(qx,dy,DPA_Q1D); \ + const int l = dl(qx,dy,DPA_D1D); \ B[i][j] = b(qx,dy); \ G[k][l] = g(qx,dy) * sign(qx,dy); #define DIFFUSION3DPA_3 \ - double u[D1D], v[D1D]; \ + double u[DPA_D1D], v[DPA_D1D]; \ RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < D1D; dz++) { u[dz] = v[dz] = 0.0; } \ + for (int dz = 0; dz < DPA_D1D; dz++) { u[dz] = v[dz] = 0.0; } \ RAJA_UNROLL(MD1) \ - for (int dx = 0; dx < D1D; ++dx) \ + for (int dx = 0; dx < DPA_D1D; ++dx) \ { \ - const int i = qi(qx,dx,Q1D); \ - const int j = dj(qx,dx,D1D); \ - const int k = qk(qx,dx,Q1D); \ - const int l = dl(qx,dx,D1D); \ + const int i = qi(qx,dx,DPA_Q1D); \ + const int j = dj(qx,dx,DPA_D1D); \ + const int k = qk(qx,dx,DPA_Q1D); \ + const int l = dl(qx,dx,DPA_D1D); \ const double s = sign(qx,dx); \ RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < D1D; ++dz) \ + for (int dz = 0; dz < DPA_D1D; ++dz) \ { \ const double coords = s_X[dz][dy][dx]; \ u[dz] += coords * B[i][j]; \ @@ -428,26 +428,26 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) } \ } \ RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < D1D; ++dz) \ + for (int dz = 0; dz < DPA_D1D; ++dz) \ { \ DDQ0[dz][dy][qx] = u[dz]; \ DDQ1[dz][dy][qx] = v[dz]; \ } #define DIFFUSION3DPA_4 \ - double u[D1D], v[D1D], w[D1D]; \ + double u[DPA_D1D], v[DPA_D1D], w[DPA_D1D]; \ RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < D1D; dz++) { u[dz] = v[dz] = w[dz] = 0.0; } \ + for (int dz = 0; dz < DPA_D1D; dz++) { u[dz] = v[dz] = w[dz] = 0.0; } \ RAJA_UNROLL(MD1) \ - for (int dy = 0; dy < D1D; ++dy) \ + for (int dy = 0; dy < DPA_D1D; ++dy) \ { \ - const int i = qi(qy,dy,Q1D); \ - const int j = dj(qy,dy,D1D); \ - const int k = qk(qy,dy,Q1D); \ - const int l = dl(qy,dy,D1D); \ + const int i = qi(qy,dy,DPA_Q1D); \ + const int j = dj(qy,dy,DPA_D1D); \ + const int k = qk(qy,dy,DPA_Q1D); \ + const int l = dl(qy,dy,DPA_D1D); \ const double s = sign(qy,dy); \ RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < D1D; dz++) \ + for (int dz = 0; dz < DPA_D1D; dz++) \ { \ 
u[dz] += DDQ1[dz][dy][qx] * B[i][j]; \ v[dz] += DDQ0[dz][dy][qx] * G[k][l] * s; \ @@ -455,7 +455,7 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) } \ } \ RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < D1D; dz++) \ + for (int dz = 0; dz < DPA_D1D; dz++) \ { \ DQQ0[dz][qy][qx] = u[dz]; \ DQQ1[dz][qy][qx] = v[dz]; \ @@ -463,19 +463,19 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) } #define DIFFUSION3DPA_5 \ - double u[Q1D], v[Q1D], w[Q1D]; \ + double u[DPA_Q1D], v[DPA_Q1D], w[DPA_Q1D]; \ RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < Q1D; qz++) { u[qz] = v[qz] = w[qz] = 0.0; } \ + for (int qz = 0; qz < DPA_Q1D; qz++) { u[qz] = v[qz] = w[qz] = 0.0; } \ RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < D1D; ++dz) \ + for (int dz = 0; dz < DPA_D1D; ++dz) \ { \ RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < Q1D; qz++) \ + for (int qz = 0; qz < DPA_Q1D; qz++) \ { \ - const int i = qi(qz,dz,Q1D); \ - const int j = dj(qz,dz,D1D); \ - const int k = qk(qz,dz,Q1D); \ - const int l = dl(qz,dz,D1D); \ + const int i = qi(qz,dz,DPA_Q1D); \ + const int j = dj(qz,dz,DPA_D1D); \ + const int k = qk(qz,dz,DPA_Q1D); \ + const int l = dl(qz,dz,DPA_D1D); \ const double s = sign(qz,dz); \ u[qz] += DQQ0[dz][qy][qx] * B[i][j]; \ v[qz] += DQQ1[dz][qy][qx] * B[i][j]; \ @@ -483,7 +483,7 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) } \ } \ RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < Q1D; qz++) \ + for (int qz = 0; qz < DPA_Q1D; qz++) \ { \ const double O11 = d(qx,qy,qz,0,e); \ const double O12 = d(qx,qy,qz,1,e); \ @@ -503,27 +503,27 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) } #define DIFFUSION3DPA_6 \ - const int i = qi(q,d,Q1D); \ - const int j = dj(q,d,D1D); \ - const int k = qk(q,d,Q1D); \ - const int l = dl(q,d,D1D); \ + const int i = qi(q,d,DPA_Q1D); \ + const int j = dj(q,d,DPA_D1D); \ + const int k = qk(q,d,DPA_Q1D); \ + const int l = dl(q,d,DPA_D1D); \ Bt[j][i] = b(q,d); \ Gt[l][k] = g(q,d) * sign(q,d); #define DIFFUSION3DPA_7 \ - double u[Q1D], v[Q1D], w[Q1D]; \ + double u[DPA_Q1D], v[DPA_Q1D], w[DPA_Q1D]; \ RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < Q1D; ++qz) { u[qz] = v[qz] = w[qz] = 0.0; } \ + for (int qz = 0; qz < DPA_Q1D; ++qz) { u[qz] = v[qz] = w[qz] = 0.0; } \ RAJA_UNROLL(MQ1) \ - for (int qx = 0; qx < Q1D; ++qx) \ + for (int qx = 0; qx < DPA_Q1D; ++qx) \ { \ - const int i = qi(qx,dx,Q1D); \ - const int j = dj(qx,dx,D1D); \ - const int k = qk(qx,dx,Q1D); \ - const int l = dl(qx,dx,D1D); \ + const int i = qi(qx,dx,DPA_Q1D); \ + const int j = dj(qx,dx,DPA_D1D); \ + const int k = qk(qx,dx,DPA_Q1D); \ + const int l = dl(qx,dx,DPA_D1D); \ const double s = sign(qx,dx); \ RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < Q1D; ++qz) \ + for (int qz = 0; qz < DPA_Q1D; ++qz) \ { \ u[qz] += QQQ0[qz][qy][qx] * Gt[l][k] * s; \ v[qz] += QQQ1[qz][qy][qx] * Bt[j][i]; \ @@ -531,7 +531,7 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) } \ } \ RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < Q1D; ++qz) \ + for (int qz = 0; qz < DPA_Q1D; ++qz) \ { \ QQD0[qz][qy][dx] = u[qz]; \ QQD1[qz][qy][dx] = v[qz]; \ @@ -539,19 +539,19 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) } #define DIFFUSION3DPA_8 \ - double u[Q1D], v[Q1D], w[Q1D]; \ + double u[DPA_Q1D], v[DPA_Q1D], w[DPA_Q1D]; \ RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < Q1D; ++qz) { u[qz] = v[qz] = w[qz] = 0.0; } \ + for (int qz = 0; qz < DPA_Q1D; ++qz) { u[qz] = v[qz] = w[qz] = 0.0; } \ RAJA_UNROLL(MQ1) \ - for (int qy = 0; qy < Q1D; ++qy) \ + for (int 
qy = 0; qy < DPA_Q1D; ++qy) \ { \ - const int i = qi(qy,dy,Q1D); \ - const int j = dj(qy,dy,D1D); \ - const int k = qk(qy,dy,Q1D); \ - const int l = dl(qy,dy,D1D); \ + const int i = qi(qy,dy,DPA_Q1D); \ + const int j = dj(qy,dy,DPA_D1D); \ + const int k = qk(qy,dy,DPA_Q1D); \ + const int l = dl(qy,dy,DPA_D1D); \ const double s = sign(qy,dy); \ RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < Q1D; ++qz) \ + for (int qz = 0; qz < DPA_Q1D; ++qz) \ { \ u[qz] += QQD0[qz][qy][dx] * Bt[j][i]; \ v[qz] += QQD1[qz][qy][dx] * Gt[l][k] * s; \ @@ -559,7 +559,7 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) } \ } \ RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < Q1D; ++qz) \ + for (int qz = 0; qz < DPA_Q1D; ++qz) \ { \ QDD0[qz][dy][dx] = u[qz]; \ QDD1[qz][dy][dx] = v[qz]; \ @@ -567,19 +567,19 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) } \ #define DIFFUSION3DPA_9 \ - double u[D1D], v[D1D], w[D1D]; \ + double u[DPA_D1D], v[DPA_D1D], w[DPA_D1D]; \ RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < D1D; ++dz) { u[dz] = v[dz] = w[dz] = 0.0; } \ + for (int dz = 0; dz < DPA_D1D; ++dz) { u[dz] = v[dz] = w[dz] = 0.0; } \ RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < Q1D; ++qz) \ + for (int qz = 0; qz < DPA_Q1D; ++qz) \ { \ RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < D1D; ++dz) \ + for (int dz = 0; dz < DPA_D1D; ++dz) \ { \ - const int i = qi(qz,dz,Q1D); \ - const int j = dj(qz,dz,D1D); \ - const int k = qk(qz,dz,Q1D); \ - const int l = dl(qz,dz,D1D); \ + const int i = qi(qz,dz,DPA_Q1D); \ + const int j = dj(qz,dz,DPA_D1D); \ + const int k = qk(qz,dz,DPA_Q1D); \ + const int l = dl(qz,dz,DPA_D1D); \ const double s = sign(qz,dz); \ u[dz] += QDD0[qz][dy][dx] * Bt[j][i]; \ v[dz] += QDD1[qz][dy][dx] * Bt[j][i]; \ @@ -587,9 +587,9 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) } \ } \ RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < D1D; ++dz) \ + for (int dz = 0; dz < DPA_D1D; ++dz) \ { \ - Y_(dx,dy,dz,e) += (u[dz] + v[dz] + w[dz]); \ + dpaY_(dx,dy,dz,e) += (u[dz] + v[dz] + w[dz]); \ } #if defined(RAJA_ENABLE_CUDA) From 887a42bb4881aea227af46cd83a974f914351937 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 29 Sep 2021 11:49:50 -0700 Subject: [PATCH 120/392] add other Seq,OMP, cuda variants --- src/apps/DIFFUSION3DPA-Cuda.cpp | 198 +++++++++++++++++++++++++++++--- src/apps/DIFFUSION3DPA-OMP.cpp | 165 +++++++++++++++++++++++++- src/apps/DIFFUSION3DPA-Seq.cpp | 1 + 3 files changed, 348 insertions(+), 16 deletions(-) diff --git a/src/apps/DIFFUSION3DPA-Cuda.cpp b/src/apps/DIFFUSION3DPA-Cuda.cpp index a249162e4..4e41e5ac9 100644 --- a/src/apps/DIFFUSION3DPA-Cuda.cpp +++ b/src/apps/DIFFUSION3DPA-Cuda.cpp @@ -19,20 +19,20 @@ namespace rajaperf { namespace apps { -#define DIFFUSION3DPA_DATA_SETUP_CUDA \ - // allocAndInitCudaDeviceData(B, m_B, DPA_Q1D *DPA_D1D); \ - // allocAndInitCudaDeviceData(Bt, m_Bt, DPA_Q1D *DPA_D1D); \ - allocAndInitCudaDeviceData(D, m_D, DPA_Q1D *DPA_Q1D *DPA_Q1D *m_NE); \ - allocAndInitCudaDeviceData(X, m_X, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); \ +#define DIFFUSION3DPA_DATA_SETUP_CUDA \ + allocAndInitCudaDeviceData(Basis, m_B, DPA_Q1D *DPA_D1D); \ + allocAndInitCudaDeviceData(dBasis, m_G, DPA_Q1D *DPA_D1D); \ + allocAndInitCudaDeviceData(D, m_D, DPA_Q1D *DPA_Q1D *DPA_Q1D *SYM *m_NE); \ + allocAndInitCudaDeviceData(X, m_X, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); \ allocAndInitCudaDeviceData(Y, m_Y, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); -#define DIFFUSION3DPA_DATA_TEARDOWN_CUDA \ - // getCudaDeviceData(m_Y, Y, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); \ - // 
deallocCudaDeviceData(B); \ - // deallocCudaDeviceData(Bt); \ - // deallocCudaDeviceData(D); \ - // deallocCudaDeviceData(X); \ - //deallocCudaDeviceData(Y); +#define DIFFUSION3DPA_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_Y, Y, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); \ + deallocCudaDeviceData(Basis); \ + deallocCudaDeviceData(dBasis); \ + deallocCudaDeviceData(D); \ + deallocCudaDeviceData(X); \ + deallocCudaDeviceData(Y); //#define USE_RAJA_UNROLL #define RAJA_DIRECT_PRAGMA(X) _Pragma(#X) @@ -44,11 +44,65 @@ namespace apps { #define FOREACH_THREAD(i, k, N) \ for (int i = threadIdx.k; i < N; i += blockDim.k) -__global__ void Diffusion3DPA(Index_type NE, const Real_ptr B, const Real_ptr Bt, - const Real_ptr D, const Real_ptr X, Real_ptr Y) { +__global__ void Diffusion3DPA(Index_type NE, const Real_ptr Basis, const Real_ptr dBasis, + const Real_ptr D, const Real_ptr X, Real_ptr Y, bool symmetric) { const int e = blockIdx.x; + DIFFUSION3DPA_0_GPU; + + FOREACH_THREAD(dy, y, DPA_D1D) { + FOREACH_THREAD(dx, x, DPA_D1D) { + DIFFUSION3DPA_1; + } + FOREACH_THREAD(qx, x, DPA_Q1D) { + DIFFUSION3DPA_2; + } + } + + __syncthreads(); + FOREACH_THREAD(dy, y, DPA_D1D) { + FOREACH_THREAD(qx, x, DPA_Q1D) { + DIFFUSION3DPA_3; + } + } + __syncthreads(); + FOREACH_THREAD(qy, y, DPA_Q1D) { + FOREACH_THREAD(qx, x, DPA_Q1D) { + DIFFUSION3DPA_4; + } + } + __syncthreads(); + FOREACH_THREAD(qy, y, DPA_Q1D) { + FOREACH_THREAD(qx, x, DPA_Q1D) { + DIFFUSION3DPA_5; + } + } + __syncthreads(); + FOREACH_THREAD(d, y, DPA_D1D) { + FOREACH_THREAD(q, x, DPA_Q1D) { + DIFFUSION3DPA_6; + } + } + __syncthreads(); + FOREACH_THREAD(qy, y, DPA_Q1D) { + FOREACH_THREAD(dx, x, DPA_D1D) { + DIFFUSION3DPA_7; + } + } + __syncthreads(); + FOREACH_THREAD(dy, y, DPA_D1D) { + FOREACH_THREAD(dx, x, DPA_D1D) { + DIFFUSION3DPA_8; + } + } + __syncthreads(); + FOREACH_THREAD(dy, y, DPA_D1D) { + FOREACH_THREAD(dx, x, DPA_D1D) { + DIFFUSION3DPA_9; + } + } + } void DIFFUSION3DPA::runCudaVariant(VariantID vid) { @@ -67,7 +121,7 @@ void DIFFUSION3DPA::runCudaVariant(VariantID vid) { dim3 nthreads_per_block(DPA_Q1D, DPA_Q1D, 1); - // Diffusion3DPA<<>>(NE, B, Bt, D, X, Y); + Diffusion3DPA<<>>(NE, Basis, dBasis, D, X, Y, symmetric); cudaErrchk( cudaGetLastError() ); } @@ -101,6 +155,120 @@ void DIFFUSION3DPA::runCudaVariant(VariantID vid) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + RAJA::expt::launch( + RAJA::expt::DEVICE, + RAJA::expt::Grid(RAJA::expt::Teams(NE), + RAJA::expt::Threads(DPA_Q1D, DPA_Q1D, 1)), + [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NE), + [&](int e) { + + DIFFUSION3DPA_0_GPU; + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + DIFFUSION3DPA_1; + } + ); // RAJA::expt::loop + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + DIFFUSION3DPA_2; + } + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + DIFFUSION3DPA_3; + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + DIFFUSION3DPA_4; + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, 
RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + DIFFUSION3DPA_5; + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int d) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int q) { + DIFFUSION3DPA_6; + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + DIFFUSION3DPA_7; + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + DIFFUSION3DPA_8; + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + DIFFUSION3DPA_9; + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + } // lambda (e) + ); // RAJA::expt::loop + + } // outer lambda (ctx) + ); // RAJA::expt::launch } // loop over kernel reps stopTimer(); diff --git a/src/apps/DIFFUSION3DPA-OMP.cpp b/src/apps/DIFFUSION3DPA-OMP.cpp index d9100c18d..b27a0e3cf 100644 --- a/src/apps/DIFFUSION3DPA-OMP.cpp +++ b/src/apps/DIFFUSION3DPA-OMP.cpp @@ -41,6 +41,59 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { #pragma omp parallel for for (int e = 0; e < NE; ++e) { + DIFFUSION3DPA_0_CPU; + + FOREACH_THREAD(dy, y, DPA_D1D) { + FOREACH_THREAD(dx, x, DPA_D1D) { + DIFFUSION3DPA_1; + } + FOREACH_THREAD(qx, x, DPA_Q1D) { + DIFFUSION3DPA_2; + } + } + + + FOREACH_THREAD(dy, y, DPA_D1D) { + FOREACH_THREAD(qx, x, DPA_Q1D) { + DIFFUSION3DPA_3; + } + } + + FOREACH_THREAD(qy, y, DPA_Q1D) { + FOREACH_THREAD(qx, x, DPA_Q1D) { + DIFFUSION3DPA_4; + } + } + + FOREACH_THREAD(qy, y, DPA_Q1D) { + FOREACH_THREAD(qx, x, DPA_Q1D) { + DIFFUSION3DPA_5; + } + } + + FOREACH_THREAD(d, y, DPA_D1D) { + FOREACH_THREAD(q, x, DPA_Q1D) { + DIFFUSION3DPA_6; + } + } + + FOREACH_THREAD(qy, y, DPA_Q1D) { + FOREACH_THREAD(dx, x, DPA_D1D) { + DIFFUSION3DPA_7; + } + } + + FOREACH_THREAD(dy, y, DPA_D1D) { + FOREACH_THREAD(dx, x, DPA_D1D) { + DIFFUSION3DPA_8; + } + } + + FOREACH_THREAD(dy, y, DPA_D1D) { + FOREACH_THREAD(dx, x, DPA_D1D) { + DIFFUSION3DPA_9; + } + } } // element loop } @@ -80,8 +133,118 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { for (RepIndex_type irep = 0; irep < run_reps; ++irep) { //Grid is empty as the host does not need a compute grid to be specified + RAJA::expt::launch( + RAJA::expt::HOST, RAJA::expt::Grid(), + [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NE), + [&](int e) { + + DIFFUSION3DPA_0_CPU; + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + DIFFUSION3DPA_1; + } + ); // RAJA::expt::loop + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + DIFFUSION3DPA_2; + } + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + + ctx.teamSync(); + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + DIFFUSION3DPA_3; + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + ctx.teamSync(); + + 
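// Illustrative sketch (not part of the patch; names and template parameters are
// reconstructed, so details may differ): the RAJA::expt::loop calls in these variants
// are driven by policy aliases that select a host or device implementation at
// compile time, along the lines of
//
//   using launch_policy = RAJA::expt::LaunchPolicy<RAJA::expt::seq_launch_t,
//                                                  d3d_device_launch>;
//   using outer_x = RAJA::expt::LoopPolicy<RAJA::loop_exec, d3d_gpu_block_x_policy>;
//   using inner_x = RAJA::expt::LoopPolicy<RAJA::loop_exec, d3d_gpu_thread_x_policy>;
//   using inner_y = RAJA::expt::LoopPolicy<RAJA::loop_exec, d3d_gpu_thread_y_policy>;
//
// With the CUDA aliases (cuda_launch_t, cuda_block_x_direct, cuda_thread_x_loop,
// cuda_thread_y_loop), the outer loop over elements maps to one thread block per
// element, the inner x/y loops stride over threadIdx.x / threadIdx.y, and
// ctx.teamSync() lowers to __syncthreads(); under the sequential host back-end the
// same calls reduce to ordinary for loops and the sync is effectively a no-op.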
RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + DIFFUSION3DPA_4; + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + DIFFUSION3DPA_5; + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int d) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int q) { + DIFFUSION3DPA_6; + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + DIFFUSION3DPA_7; + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + DIFFUSION3DPA_8; + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + DIFFUSION3DPA_9; + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + } // lambda (e) + ); // RAJA::expt::loop + + } // outer lambda (ctx) + ); // RAJA::expt::launch } // loop over kernel reps stopTimer(); @@ -93,7 +256,7 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { << std::endl; } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/apps/DIFFUSION3DPA-Seq.cpp b/src/apps/DIFFUSION3DPA-Seq.cpp index 968dbe806..d72328510 100644 --- a/src/apps/DIFFUSION3DPA-Seq.cpp +++ b/src/apps/DIFFUSION3DPA-Seq.cpp @@ -137,6 +137,7 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + //Grid is empty as the host does not need a compute grid to be specified RAJA::expt::launch( RAJA::expt::HOST, RAJA::expt::Grid(), [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { From 341e54481a680e1d8dc578a97be0a8cdc31a47eb Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 29 Sep 2021 12:07:32 -0700 Subject: [PATCH 121/392] add hip version --- src/apps/DIFFUSION3DPA-Hip.cpp | 275 ++++++++++++++++----------------- 1 file changed, 136 insertions(+), 139 deletions(-) diff --git a/src/apps/DIFFUSION3DPA-Hip.cpp b/src/apps/DIFFUSION3DPA-Hip.cpp index 50c0daca8..d6f4324b9 100644 --- a/src/apps/DIFFUSION3DPA-Hip.cpp +++ b/src/apps/DIFFUSION3DPA-Hip.cpp @@ -19,17 +19,17 @@ namespace rajaperf { namespace apps { -#define DIFFUSION3DPA_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(B, m_B, Q1D *D1D); \ - allocAndInitHipDeviceData(Bt, m_Bt, Q1D *D1D); \ - allocAndInitHipDeviceData(D, m_D, Q1D *Q1D *Q1D *m_NE); \ - allocAndInitHipDeviceData(X, m_X, D1D *D1D *D1D *m_NE); \ - allocAndInitHipDeviceData(Y, m_Y, D1D *D1D *D1D *m_NE); - -#define DIFFUSION3DPA_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_Y, Y, D1D *D1D *D1D *m_NE); \ - deallocHipDeviceData(B); \ - deallocHipDeviceData(Bt); \ +#define DIFFUSION3DPA_DATA_SETUP_HIP \ + allocAndInitHipDeviceData(Basis, m_B, DPA_Q1D *DPA_D1D); \ + allocAndInitHipDeviceData(dBasis, m_G, DPA_Q1D *DPA_D1D); \ + allocAndInitHipDeviceData(D, m_D, DPA_Q1D *DPA_Q1D *DPA_Q1D *SYM *m_NE); \ + allocAndInitHipDeviceData(X, m_X, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); \ + 
allocAndInitHipDeviceData(Y, m_Y, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); + +#define DIFFUSION3DPA_DATA_TEARDOWN_HIP \ + getHipDeviceData(m_Y, Y, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); \ + deallocHipDeviceData(Basis); \ + deallocHipDeviceData(dBasis); \ deallocHipDeviceData(D); \ deallocHipDeviceData(X); \ deallocHipDeviceData(Y); @@ -44,67 +44,65 @@ namespace apps { #define FOREACH_THREAD(i, k, N) \ for (int i = threadIdx.k; i < N; i += blockDim.k) -__global__ void Diffusion3DPA(Index_type NE, const Real_ptr B, const Real_ptr Bt, - const Real_ptr D, const Real_ptr X, Real_ptr Y) { +__global__ void Diffusion3DPA(Index_type NE, const Real_ptr Basis, const Real_ptr dBasis, + const Real_ptr D, const Real_ptr X, Real_ptr Y, bool symmetric) { const int e = hipBlockIdx_x; - DIFFUSION3DPA_0_GPU + DIFFUSION3DPA_0_GPU; - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D){ - DIFFUSION3DPA_1 + FOREACH_THREAD(dy, y, DPA_D1D) { + FOREACH_THREAD(dx, x, DPA_D1D) { + DIFFUSION3DPA_1; } - FOREACH_THREAD(dx, x, Q1D) { - DIFFUSION3DPA_2 + FOREACH_THREAD(qx, x, DPA_Q1D) { + DIFFUSION3DPA_2; } } + __syncthreads(); - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(qx, x, Q1D) { - DIFFUSION3DPA_3 + FOREACH_THREAD(dy, y, DPA_D1D) { + FOREACH_THREAD(qx, x, DPA_Q1D) { + DIFFUSION3DPA_3; } } __syncthreads(); - FOREACH_THREAD(qy, y, Q1D) { - FOREACH_THREAD(qx, x, Q1D) { - DIFFUSION3DPA_4 + FOREACH_THREAD(qy, y, DPA_Q1D) { + FOREACH_THREAD(qx, x, DPA_Q1D) { + DIFFUSION3DPA_4; } } __syncthreads(); - FOREACH_THREAD(qy, y, Q1D) { - FOREACH_THREAD(qx, x, Q1D) { - DIFFUSION3DPA_5 + FOREACH_THREAD(qy, y, DPA_Q1D) { + FOREACH_THREAD(qx, x, DPA_Q1D) { + DIFFUSION3DPA_5; } } - __syncthreads(); - FOREACH_THREAD(d, y, D1D) { - FOREACH_THREAD(q, x, Q1D) { - DIFFUSION3DPA_6 + FOREACH_THREAD(d, y, DPA_D1D) { + FOREACH_THREAD(q, x, DPA_Q1D) { + DIFFUSION3DPA_6; } } - __syncthreads(); - FOREACH_THREAD(qy, y, Q1D) { - FOREACH_THREAD(dx, x, D1D) { - DIFFUSION3DPA_7 + FOREACH_THREAD(qy, y, DPA_Q1D) { + FOREACH_THREAD(dx, x, DPA_D1D) { + DIFFUSION3DPA_7; } } __syncthreads(); - - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D) { - DIFFUSION3DPA_8 + FOREACH_THREAD(dy, y, DPA_D1D) { + FOREACH_THREAD(dx, x, DPA_D1D) { + DIFFUSION3DPA_8; } } - __syncthreads(); - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D) { - DIFFUSION3DPA_9 + FOREACH_THREAD(dy, y, DPA_D1D) { + FOREACH_THREAD(dx, x, DPA_D1D) { + DIFFUSION3DPA_9; } } + } void DIFFUSION3DPA::runHipVariant(VariantID vid) { @@ -119,13 +117,13 @@ void DIFFUSION3DPA::runHipVariant(VariantID vid) { DIFFUSION3DPA_DATA_SETUP_HIP; dim3 grid_size(NE); - dim3 block_size(Q1D, Q1D, 1); + dim3 block_size(DPA_Q1D, DPA_Q1D, 1); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { hipLaunchKernelGGL((Diffusion3DPA), dim3(grid_size), dim3(block_size), 0, 0, - NE, B, Bt, D, X, Y); + NE, Basis, dBasis, D, X, Y, symmetric); hipErrchk( hipGetLastError() ); @@ -163,118 +161,117 @@ void DIFFUSION3DPA::runHipVariant(VariantID vid) { RAJA::expt::launch( RAJA::expt::DEVICE, RAJA::expt::Grid(RAJA::expt::Teams(NE), - RAJA::expt::Threads(Q1D, Q1D, 1)), + RAJA::expt::Threads(DPA_Q1D, DPA_Q1D, 1)), [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NE), [&](int e) { - DIFFUSION3DPA_0_GPU + DIFFUSION3DPA_0_GPU; - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), 
[&](int dx) { - DIFFUSION3DPA_1 - } - ); // RAJA::expt::loop - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), - [&](int dx) { - DIFFUSION3DPA_2 + DIFFUSION3DPA_1; } ); // RAJA::expt::loop - } // lambda (dy) - ); // RAJA::expt::loop - - ctx.teamSync(); - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), - [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), - [&](int qx) { - DIFFUSION3DPA_3 - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop - - ctx.teamSync(); - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), - [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), - [&](int qx) { - DIFFUSION3DPA_4 - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop - - ctx.teamSync(); - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), - [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), [&](int qx) { - DIFFUSION3DPA_5 - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop - - ctx.teamSync(); - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), - [&](int d) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), - [&](int q) { - DIFFUSION3DPA_6 + DIFFUSION3DPA_2; } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop - - ctx.teamSync(); - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), - [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), - [&](int dx) { - DIFFUSION3DPA_7 - } - ); // RAJA::expt::loop - - ctx.teamSync(); - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), - [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), - [&](int dx) { - DIFFUSION3DPA_8 - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + DIFFUSION3DPA_3; + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + DIFFUSION3DPA_4; + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + DIFFUSION3DPA_5; + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int d) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int q) { + DIFFUSION3DPA_6; + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + DIFFUSION3DPA_7; + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + DIFFUSION3DPA_8; + } + ); // RAJA::expt::loop + } + ); // RAJA::expt::loop ctx.teamSync(); - - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dx) { - DIFFUSION3DPA_9 + DIFFUSION3DPA_9; } - ); // RAJA::expt::loop + ); // 
RAJA::expt::loop } - ); // RAJA::expt::loop + ); // RAJA::expt::loop - } // lambda (e) - ); // RAJA::expt::loop + } // lambda (e) + ); // RAJA::expt::loop - } // outer lambda (ctx) - ); // RAJA::expt::launch + } // outer lambda (ctx) + ); // RAJA::expt::launch } // loop over kernel reps stopTimer(); From d951a51ebeea40f566f1e76e8f774d3aa67b641e Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 29 Sep 2021 13:19:24 -0700 Subject: [PATCH 122/392] update flop count --- src/apps/DIFFUSION3DPA.cpp | 21 +++++++++++++-------- src/apps/DIFFUSION3DPA.hpp | 5 ++--- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/src/apps/DIFFUSION3DPA.cpp b/src/apps/DIFFUSION3DPA.cpp index 6f046cc20..acb7ddb5d 100644 --- a/src/apps/DIFFUSION3DPA.cpp +++ b/src/apps/DIFFUSION3DPA.cpp @@ -35,17 +35,22 @@ DIFFUSION3DPA::DIFFUSION3DPA(const RunParams& params) setItsPerRep(getActualProblemSize()); setKernelsPerRep(1); - setBytesPerRep( 4*DPA_Q1D*DPA_D1D*sizeof(Real_type) + - DPA_Q1D*DPA_Q1D*DPA_Q1D*m_NE*sizeof(Real_type) + + setBytesPerRep( 2*DPA_Q1D*DPA_D1D*sizeof(Real_type) + + DPA_Q1D*DPA_Q1D*DPA_Q1D*SYM*m_NE*sizeof(Real_type) + DPA_D1D*DPA_D1D*DPA_D1D*m_NE*sizeof(Real_type) + DPA_D1D*DPA_D1D*DPA_D1D*m_NE*sizeof(Real_type) ); - setFLOPsPerRep(m_NE * (2 * DPA_D1D * DPA_D1D * DPA_D1D * DPA_Q1D + - 2 * DPA_D1D * DPA_D1D * DPA_Q1D * DPA_Q1D + - 2 * DPA_D1D * DPA_Q1D * DPA_Q1D * DPA_Q1D + DPA_Q1D * DPA_Q1D * DPA_Q1D + - 2 * DPA_Q1D * DPA_Q1D * DPA_Q1D * DPA_D1D + - 2 * DPA_Q1D * DPA_Q1D * DPA_D1D * DPA_D1D + - 2 * DPA_Q1D * DPA_D1D * DPA_D1D * DPA_D1D + DPA_D1D * DPA_D1D * DPA_D1D)); + setFLOPsPerRep(m_NE * (DPA_Q1D * DPA_D1D + + 5 * DPA_D1D * DPA_D1D * DPA_Q1D * DPA_D1D + + 7 * DPA_D1D * DPA_D1D * DPA_Q1D * DPA_Q1D + + 7 * DPA_Q1D * DPA_D1D * DPA_Q1D * DPA_Q1D + + 15 * DPA_Q1D * DPA_Q1D * DPA_Q1D + + DPA_Q1D * DPA_D1D + + 7 * DPA_Q1D * DPA_Q1D * DPA_D1D * DPA_Q1D + + 7 * DPA_Q1D * DPA_Q1D * DPA_D1D * DPA_D1D + + 7 * DPA_D1D * DPA_Q1D * DPA_D1D * DPA_D1D + + 3 * DPA_D1D * DPA_D1D * DPA_D1D)); + setUsesFeature(Teams); setVariantDefined( Base_Seq ); diff --git a/src/apps/DIFFUSION3DPA.hpp b/src/apps/DIFFUSION3DPA.hpp index 183593c04..c588a7c22 100644 --- a/src/apps/DIFFUSION3DPA.hpp +++ b/src/apps/DIFFUSION3DPA.hpp @@ -47,7 +47,7 @@ /// { /// X[dz][dy][dx] = x(dx,dy,dz,e); /// } -/// } +/// } /// for(int qx=0; qx Date: Wed, 29 Sep 2021 13:22:11 -0700 Subject: [PATCH 123/392] clean up pass --- src/apps/DIFFUSION3DPA-Seq.cpp | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/src/apps/DIFFUSION3DPA-Seq.cpp b/src/apps/DIFFUSION3DPA-Seq.cpp index d72328510..0311ae284 100644 --- a/src/apps/DIFFUSION3DPA-Seq.cpp +++ b/src/apps/DIFFUSION3DPA-Seq.cpp @@ -24,9 +24,6 @@ namespace apps { #endif #define FOREACH_THREAD(i, k, N) for (int i = 0; i < N; i++) -#define MFEM_SHARED -#define MFEM_SYNC_THREAD - void DIFFUSION3DPA::runSeqVariant(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -52,43 +49,43 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { } } - MFEM_SYNC_THREAD; + FOREACH_THREAD(dy, y, DPA_D1D) { FOREACH_THREAD(qx, x, DPA_Q1D) { DIFFUSION3DPA_3; } } - MFEM_SYNC_THREAD; + FOREACH_THREAD(qy, y, DPA_Q1D) { FOREACH_THREAD(qx, x, DPA_Q1D) { DIFFUSION3DPA_4; } } - MFEM_SYNC_THREAD; + FOREACH_THREAD(qy, y, DPA_Q1D) { FOREACH_THREAD(qx, x, DPA_Q1D) { DIFFUSION3DPA_5; } } - MFEM_SYNC_THREAD; + FOREACH_THREAD(d, y, DPA_D1D) { FOREACH_THREAD(q, x, DPA_Q1D) { DIFFUSION3DPA_6; } } - MFEM_SYNC_THREAD; + FOREACH_THREAD(qy, y, DPA_Q1D) { FOREACH_THREAD(dx, x, DPA_D1D) { 
DIFFUSION3DPA_7; } } - MFEM_SYNC_THREAD; + FOREACH_THREAD(dy, y, DPA_D1D) { FOREACH_THREAD(dx, x, DPA_D1D) { DIFFUSION3DPA_8; } } - MFEM_SYNC_THREAD; + FOREACH_THREAD(dy, y, DPA_D1D) { FOREACH_THREAD(dx, x, DPA_D1D) { DIFFUSION3DPA_9; From 8c6d121134e90a0b6e912b715b327308aa167eb4 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 29 Sep 2021 13:24:20 -0700 Subject: [PATCH 124/392] clean up pass --- src/apps/DIFFUSION3DPA-OMP.cpp | 8 ++++---- src/apps/DIFFUSION3DPA-Seq.cpp | 8 ++++---- src/apps/DIFFUSION3DPA.hpp | 16 ++++++++-------- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/apps/DIFFUSION3DPA-OMP.cpp b/src/apps/DIFFUSION3DPA-OMP.cpp index b27a0e3cf..188d8fbda 100644 --- a/src/apps/DIFFUSION3DPA-OMP.cpp +++ b/src/apps/DIFFUSION3DPA-OMP.cpp @@ -107,25 +107,25 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { //Currently Teams requires two policies if compiled with a device using launch_policy = RAJA::expt::LaunchPolicy; using outer_x = RAJA::expt::LoopPolicy; using inner_x = RAJA::expt::LoopPolicy; using inner_y = RAJA::expt::LoopPolicy; diff --git a/src/apps/DIFFUSION3DPA-Seq.cpp b/src/apps/DIFFUSION3DPA-Seq.cpp index 0311ae284..c116b8dd1 100644 --- a/src/apps/DIFFUSION3DPA-Seq.cpp +++ b/src/apps/DIFFUSION3DPA-Seq.cpp @@ -106,28 +106,28 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { using launch_policy = RAJA::expt::LaunchPolicy; using outer_x = RAJA::expt::LoopPolicy; using inner_x = RAJA::expt::LoopPolicy; using inner_y = RAJA::expt::LoopPolicy; diff --git a/src/apps/DIFFUSION3DPA.hpp b/src/apps/DIFFUSION3DPA.hpp index c588a7c22..ab3e5d1d8 100644 --- a/src/apps/DIFFUSION3DPA.hpp +++ b/src/apps/DIFFUSION3DPA.hpp @@ -592,17 +592,17 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) } #if defined(RAJA_ENABLE_CUDA) - using m3d_device_launch = RAJA::expt::cuda_launch_t; - using m3d_gpu_block_x_policy = RAJA::cuda_block_x_direct; - using m3d_gpu_thread_x_policy = RAJA::cuda_thread_x_loop; - using m3d_gpu_thread_y_policy = RAJA::cuda_thread_y_loop; + using d3d_device_launch = RAJA::expt::cuda_launch_t; + using d3d_gpu_block_x_policy = RAJA::cuda_block_x_direct; + using d3d_gpu_thread_x_policy = RAJA::cuda_thread_x_loop; + using d3d_gpu_thread_y_policy = RAJA::cuda_thread_y_loop; #endif #if defined(RAJA_ENABLE_HIP) - using m3d_device_launch = RAJA::expt::hip_launch_t; - using m3d_gpu_block_x_policy = RAJA::hip_block_x_direct; - using m3d_gpu_thread_x_policy = RAJA::hip_thread_x_loop; - using m3d_gpu_thread_y_policy = RAJA::hip_thread_y_loop; + using d3d_device_launch = RAJA::expt::hip_launch_t; + using d3d_gpu_block_x_policy = RAJA::hip_block_x_direct; + using d3d_gpu_thread_x_policy = RAJA::hip_thread_x_loop; + using d3d_gpu_thread_y_policy = RAJA::hip_thread_y_loop; #endif namespace rajaperf From 91aa6bc5b8f8412e4af9d2dfda04222206cc1b5e Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Wed, 29 Sep 2021 16:57:07 -0700 Subject: [PATCH 125/392] fixing how i pull for the tpl/RAJA submod --- scripts/gitlab/build_and_test.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index 7d0095883..511244806 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -117,8 +117,6 @@ then then cd tpl/RAJA git pull origin develop - git checkout "task/kab163/set-up-multi-project-ci" - git pull cd - fi From 6c366bde6219ec6d2cd937faf6df0376cea9a7c8 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Thu, 30 Sep 2021 10:15:26 -0700 Subject: [PATCH 
126/392] re-org of FEM macros --- src/apps/DIFFUSION3DPA-Cuda.cpp | 9 +- src/apps/DIFFUSION3DPA-Hip.cpp | 17 +-- src/apps/DIFFUSION3DPA-OMP.cpp | 8 +- src/apps/DIFFUSION3DPA-Seq.cpp | 8 +- src/apps/DIFFUSION3DPA.cpp | 2 +- src/apps/DIFFUSION3DPA.hpp | 3 +- src/apps/FEM_MACROS.hpp | 25 ++++ src/apps/MASS3DPA-Cuda.cpp | 95 +++++++-------- src/apps/MASS3DPA-Hip.cpp | 93 +++++++-------- src/apps/MASS3DPA-OMP.cpp | 76 ++++++------ src/apps/MASS3DPA-Seq.cpp | 76 ++++++------ src/apps/MASS3DPA.cpp | 42 +++---- src/apps/MASS3DPA.hpp | 199 ++++++++++++++++---------------- 13 files changed, 314 insertions(+), 339 deletions(-) create mode 100644 src/apps/FEM_MACROS.hpp diff --git a/src/apps/DIFFUSION3DPA-Cuda.cpp b/src/apps/DIFFUSION3DPA-Cuda.cpp index 4e41e5ac9..8aa12d5c4 100644 --- a/src/apps/DIFFUSION3DPA-Cuda.cpp +++ b/src/apps/DIFFUSION3DPA-Cuda.cpp @@ -34,15 +34,8 @@ namespace apps { deallocCudaDeviceData(X); \ deallocCudaDeviceData(Y); +// Uncomment to add compiler directives for loop unrolling //#define USE_RAJA_UNROLL -#define RAJA_DIRECT_PRAGMA(X) _Pragma(#X) -#if defined(USE_RAJA_UNROLL) -#define RAJA_UNROLL(N) RAJA_DIRECT_PRAGMA(unroll(N)) -#else -#define RAJA_UNROLL(N) -#endif -#define FOREACH_THREAD(i, k, N) \ - for (int i = threadIdx.k; i < N; i += blockDim.k) __global__ void Diffusion3DPA(Index_type NE, const Real_ptr Basis, const Real_ptr dBasis, const Real_ptr D, const Real_ptr X, Real_ptr Y, bool symmetric) { diff --git a/src/apps/DIFFUSION3DPA-Hip.cpp b/src/apps/DIFFUSION3DPA-Hip.cpp index d6f4324b9..1a3060ac0 100644 --- a/src/apps/DIFFUSION3DPA-Hip.cpp +++ b/src/apps/DIFFUSION3DPA-Hip.cpp @@ -19,11 +19,11 @@ namespace rajaperf { namespace apps { -#define DIFFUSION3DPA_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(Basis, m_B, DPA_Q1D *DPA_D1D); \ - allocAndInitHipDeviceData(dBasis, m_G, DPA_Q1D *DPA_D1D); \ +#define DIFFUSION3DPA_DATA_SETUP_HIP \ + allocAndInitHipDeviceData(Basis, m_B, DPA_Q1D *DPA_D1D); \ + allocAndInitHipDeviceData(dBasis, m_G, DPA_Q1D *DPA_D1D); \ allocAndInitHipDeviceData(D, m_D, DPA_Q1D *DPA_Q1D *DPA_Q1D *SYM *m_NE); \ - allocAndInitHipDeviceData(X, m_X, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); \ + allocAndInitHipDeviceData(X, m_X, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); \ allocAndInitHipDeviceData(Y, m_Y, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); #define DIFFUSION3DPA_DATA_TEARDOWN_HIP \ @@ -34,15 +34,8 @@ namespace apps { deallocHipDeviceData(X); \ deallocHipDeviceData(Y); +// Uncomment to add compiler directives for loop unrolling //#define USE_RAJA_UNROLL -#define RAJA_DIRECT_PRAGMA(X) _Pragma(#X) -#if defined(USE_RAJA_UNROLL) -#define RAJA_UNROLL(N) RAJA_DIRECT_PRAGMA(unroll(N)) -#else -#define RAJA_UNROLL(N) -#endif -#define FOREACH_THREAD(i, k, N) \ - for (int i = threadIdx.k; i < N; i += blockDim.k) __global__ void Diffusion3DPA(Index_type NE, const Real_ptr Basis, const Real_ptr dBasis, const Real_ptr D, const Real_ptr X, Real_ptr Y, bool symmetric) { diff --git a/src/apps/DIFFUSION3DPA-OMP.cpp b/src/apps/DIFFUSION3DPA-OMP.cpp index 188d8fbda..02a10c450 100644 --- a/src/apps/DIFFUSION3DPA-OMP.cpp +++ b/src/apps/DIFFUSION3DPA-OMP.cpp @@ -15,14 +15,8 @@ namespace rajaperf { namespace apps { +// Uncomment to add compiler directives for loop unrolling //#define USE_RAJA_UNROLL -#define RAJA_DIRECT_PRAGMA(X) _Pragma(#X) -#if defined(USE_RAJA_UNROLL) -#define RAJA_UNROLL(N) RAJA_DIRECT_PRAGMA(unroll(N)) -#else -#define RAJA_UNROLL(N) -#endif -#define FOREACH_THREAD(i, k, N) for (int i = 0; i < N; i++) void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { diff --git 
a/src/apps/DIFFUSION3DPA-Seq.cpp b/src/apps/DIFFUSION3DPA-Seq.cpp index c116b8dd1..35ecd74bc 100644 --- a/src/apps/DIFFUSION3DPA-Seq.cpp +++ b/src/apps/DIFFUSION3DPA-Seq.cpp @@ -15,14 +15,8 @@ namespace rajaperf { namespace apps { +// Uncomment to add compiler directives for loop unrolling //#define USE_RAJA_UNROLL -#define RAJA_DIRECT_PRAGMA(X) _Pragma(#X) -#if defined(USE_RAJA_UNROLL) -#define RAJA_UNROLL(N) RAJA_DIRECT_PRAGMA(unroll(N)) -#else -#define RAJA_UNROLL(N) -#endif -#define FOREACH_THREAD(i, k, N) for (int i = 0; i < N; i++) void DIFFUSION3DPA::runSeqVariant(VariantID vid) { const Index_type run_reps = getRunReps(); diff --git a/src/apps/DIFFUSION3DPA.cpp b/src/apps/DIFFUSION3DPA.cpp index acb7ddb5d..baf892576 100644 --- a/src/apps/DIFFUSION3DPA.cpp +++ b/src/apps/DIFFUSION3DPA.cpp @@ -23,7 +23,7 @@ namespace apps DIFFUSION3DPA::DIFFUSION3DPA(const RunParams& params) : KernelBase(rajaperf::Apps_DIFFUSION3DPA, params) { - m_NE_default = 8000; + m_NE_default = 15625; setDefaultProblemSize(m_NE_default*DPA_Q1D*DPA_Q1D*DPA_Q1D); setDefaultReps(50); diff --git a/src/apps/DIFFUSION3DPA.hpp b/src/apps/DIFFUSION3DPA.hpp index ab3e5d1d8..b7a2b1e1d 100644 --- a/src/apps/DIFFUSION3DPA.hpp +++ b/src/apps/DIFFUSION3DPA.hpp @@ -7,7 +7,7 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// /// -/// Action of 3D Mass matrix via partial assembly +/// Action of 3D diffusion matrix via partial assembly /// /// Based on MFEM's/CEED algorithms. /// Reference implementation @@ -293,6 +293,7 @@ Index_type NE = m_NE; \ const bool symmetric = true; #include "common/KernelBase.hpp" +#include "FEM_MACROS.hpp" #include "RAJA/RAJA.hpp" diff --git a/src/apps/FEM_MACROS.hpp b/src/apps/FEM_MACROS.hpp new file mode 100644 index 000000000..bbf19b02a --- /dev/null +++ b/src/apps/FEM_MACROS.hpp @@ -0,0 +1,25 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +#ifndef RAJAPerf_FEM_MACROS_HPP +#define RAJAPerf_FEM_MACROS_HPP + +#define RAJA_DIRECT_PRAGMA(X) _Pragma(#X) +#if defined(USE_RAJA_UNROLL) +#define RAJA_UNROLL(N) RAJA_DIRECT_PRAGMA(unroll(N)) +#else +#define RAJA_UNROLL(N) +#endif + +#if defined(RAJA_DEVICE_CODE) +#define FOREACH_THREAD(i, k, N) \ + for (int i = threadIdx.k; i < N; i += blockDim.k) +#else +#define FOREACH_THREAD(i, k, N) for (int i = 0; i < N; i++) +#endif + +#endif // closing endif for header file include guard diff --git a/src/apps/MASS3DPA-Cuda.cpp b/src/apps/MASS3DPA-Cuda.cpp index 0de50ee15..7353ee1f1 100644 --- a/src/apps/MASS3DPA-Cuda.cpp +++ b/src/apps/MASS3DPA-Cuda.cpp @@ -19,30 +19,23 @@ namespace rajaperf { namespace apps { -#define MASS3DPA_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(B, m_B, Q1D *D1D); \ - allocAndInitCudaDeviceData(Bt, m_Bt, Q1D *D1D); \ - allocAndInitCudaDeviceData(D, m_D, Q1D *Q1D *Q1D *m_NE); \ - allocAndInitCudaDeviceData(X, m_X, D1D *D1D *D1D *m_NE); \ - allocAndInitCudaDeviceData(Y, m_Y, D1D *D1D *D1D *m_NE); +#define MASS3DPA_DATA_SETUP_CUDA \ + allocAndInitCudaDeviceData(B, m_B, MPA_Q1D *MPA_D1D); \ + allocAndInitCudaDeviceData(Bt, m_Bt, MPA_Q1D *MPA_D1D); \ + allocAndInitCudaDeviceData(D, m_D, MPA_Q1D *MPA_Q1D *MPA_Q1D *m_NE); \ + allocAndInitCudaDeviceData(X, m_X, MPA_D1D *MPA_D1D *MPA_D1D *m_NE); \ + allocAndInitCudaDeviceData(Y, m_Y, MPA_D1D *MPA_D1D *MPA_D1D *m_NE); #define MASS3DPA_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_Y, Y, D1D *D1D *D1D *m_NE); \ + getCudaDeviceData(m_Y, Y, MPA_D1D *MPA_D1D *MPA_D1D *m_NE); \ deallocCudaDeviceData(B); \ deallocCudaDeviceData(Bt); \ deallocCudaDeviceData(D); \ deallocCudaDeviceData(X); \ deallocCudaDeviceData(Y); +// Uncomment to add compiler directives loop unrolling //#define USE_RAJA_UNROLL -#define RAJA_DIRECT_PRAGMA(X) _Pragma(#X) -#if defined(USE_RAJA_UNROLL) -#define RAJA_UNROLL(N) RAJA_DIRECT_PRAGMA(unroll(N)) -#else -#define RAJA_UNROLL(N) -#endif -#define FOREACH_THREAD(i, k, N) \ - for (int i = threadIdx.k; i < N; i += blockDim.k) __global__ void Mass3DPA(Index_type NE, const Real_ptr B, const Real_ptr Bt, const Real_ptr D, const Real_ptr X, Real_ptr Y) { @@ -51,57 +44,57 @@ __global__ void Mass3DPA(Index_type NE, const Real_ptr B, const Real_ptr Bt, MASS3DPA_0_GPU - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D){ + FOREACH_THREAD(dy, y, MPA_D1D) { + FOREACH_THREAD(dx, x, MPA_D1D){ MASS3DPA_1 } - FOREACH_THREAD(dx, x, Q1D) { + FOREACH_THREAD(dx, x, MPA_Q1D) { MASS3DPA_2 } } __syncthreads(); - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(qx, x, Q1D) { + FOREACH_THREAD(dy, y, MPA_D1D) { + FOREACH_THREAD(qx, x, MPA_Q1D) { MASS3DPA_3 } } __syncthreads(); - FOREACH_THREAD(qy, y, Q1D) { - FOREACH_THREAD(qx, x, Q1D) { + FOREACH_THREAD(qy, y, MPA_Q1D) { + FOREACH_THREAD(qx, x, MPA_Q1D) { MASS3DPA_4 } } __syncthreads(); - FOREACH_THREAD(qy, y, Q1D) { - FOREACH_THREAD(qx, x, Q1D) { + FOREACH_THREAD(qy, y, MPA_Q1D) { + FOREACH_THREAD(qx, x, MPA_Q1D) { MASS3DPA_5 } } __syncthreads(); - FOREACH_THREAD(d, y, D1D) { - FOREACH_THREAD(q, x, Q1D) { + FOREACH_THREAD(d, y, MPA_D1D) { + FOREACH_THREAD(q, x, MPA_Q1D) { MASS3DPA_6 } } __syncthreads(); - FOREACH_THREAD(qy, y, Q1D) { - FOREACH_THREAD(dx, x, D1D) { + FOREACH_THREAD(qy, y, MPA_Q1D) { + FOREACH_THREAD(dx, x, MPA_D1D) { MASS3DPA_7 } } __syncthreads(); - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D) { + FOREACH_THREAD(dy, y, MPA_D1D) { 
+ FOREACH_THREAD(dx, x, MPA_D1D) { MASS3DPA_8 } } __syncthreads(); - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D) { + FOREACH_THREAD(dy, y, MPA_D1D) { + FOREACH_THREAD(dx, x, MPA_D1D) { MASS3DPA_9 } } @@ -121,7 +114,7 @@ void MASS3DPA::runCudaVariant(VariantID vid) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - dim3 nthreads_per_block(Q1D, Q1D, 1); + dim3 nthreads_per_block(MPA_Q1D, MPA_Q1D, 1); Mass3DPA<<>>(NE, B, Bt, D, X, Y); @@ -160,7 +153,7 @@ void MASS3DPA::runCudaVariant(VariantID vid) { RAJA::expt::launch( RAJA::expt::DEVICE, RAJA::expt::Grid(RAJA::expt::Teams(NE), - RAJA::expt::Threads(Q1D, Q1D, 1)), + RAJA::expt::Threads(MPA_Q1D, MPA_Q1D, 1)), [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NE), @@ -168,15 +161,15 @@ void MASS3DPA::runCudaVariant(VariantID vid) { MASS3DPA_0_GPU - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dx) { MASS3DPA_1 } ); // RAJA::expt::loop - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int dx) { MASS3DPA_2 } @@ -186,9 +179,9 @@ void MASS3DPA::runCudaVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qx) { MASS3DPA_3 } @@ -198,9 +191,9 @@ void MASS3DPA::runCudaVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qx) { MASS3DPA_4 } @@ -210,9 +203,9 @@ void MASS3DPA::runCudaVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qx) { MASS3DPA_5 } @@ -222,9 +215,9 @@ void MASS3DPA::runCudaVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int d) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int q) { MASS3DPA_6 } @@ -234,9 +227,9 @@ void MASS3DPA::runCudaVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dx) { MASS3DPA_7 } @@ -246,9 +239,9 @@ void MASS3DPA::runCudaVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dx) { MASS3DPA_8 } @@ -258,9 +251,9 @@ void MASS3DPA::runCudaVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dy) { - 
RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dx) { MASS3DPA_9 } diff --git a/src/apps/MASS3DPA-Hip.cpp b/src/apps/MASS3DPA-Hip.cpp index 015b9ea5e..07cbb29b9 100644 --- a/src/apps/MASS3DPA-Hip.cpp +++ b/src/apps/MASS3DPA-Hip.cpp @@ -20,29 +20,22 @@ namespace rajaperf { namespace apps { #define MASS3DPA_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(B, m_B, Q1D *D1D); \ - allocAndInitHipDeviceData(Bt, m_Bt, Q1D *D1D); \ - allocAndInitHipDeviceData(D, m_D, Q1D *Q1D *Q1D *m_NE); \ - allocAndInitHipDeviceData(X, m_X, D1D *D1D *D1D *m_NE); \ - allocAndInitHipDeviceData(Y, m_Y, D1D *D1D *D1D *m_NE); + allocAndInitHipDeviceData(B, m_B, MPA_Q1D *MPA_D1D); \ + allocAndInitHipDeviceData(Bt, m_Bt, MPA_Q1D *MPA_D1D); \ + allocAndInitHipDeviceData(D, m_D, MPA_Q1D *MPA_Q1D *MPA_Q1D *m_NE); \ + allocAndInitHipDeviceData(X, m_X, MPA_D1D *MPA_D1D *MPA_D1D *m_NE); \ + allocAndInitHipDeviceData(Y, m_Y, MPA_D1D *MPA_D1D *MPA_D1D *m_NE); #define MASS3DPA_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_Y, Y, D1D *D1D *D1D *m_NE); \ + getHipDeviceData(m_Y, Y, MPA_D1D *MPA_D1D *MPA_D1D *m_NE); \ deallocHipDeviceData(B); \ deallocHipDeviceData(Bt); \ deallocHipDeviceData(D); \ deallocHipDeviceData(X); \ deallocHipDeviceData(Y); +// Uncomment to add compiler directives loop unrolling //#define USE_RAJA_UNROLL -#define RAJA_DIRECT_PRAGMA(X) _Pragma(#X) -#if defined(USE_RAJA_UNROLL) -#define RAJA_UNROLL(N) RAJA_DIRECT_PRAGMA(unroll(N)) -#else -#define RAJA_UNROLL(N) -#endif -#define FOREACH_THREAD(i, k, N) \ - for (int i = threadIdx.k; i < N; i += blockDim.k) __global__ void Mass3DPA(Index_type NE, const Real_ptr B, const Real_ptr Bt, const Real_ptr D, const Real_ptr X, Real_ptr Y) { @@ -51,57 +44,57 @@ __global__ void Mass3DPA(Index_type NE, const Real_ptr B, const Real_ptr Bt, MASS3DPA_0_GPU - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D){ + FOREACH_THREAD(dy, y, MPA_D1D) { + FOREACH_THREAD(dx, x, MPA_D1D){ MASS3DPA_1 } - FOREACH_THREAD(dx, x, Q1D) { + FOREACH_THREAD(dx, x, MPA_Q1D) { MASS3DPA_2 } } __syncthreads(); - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(qx, x, Q1D) { + FOREACH_THREAD(dy, y, MPA_D1D) { + FOREACH_THREAD(qx, x, MPA_Q1D) { MASS3DPA_3 } } __syncthreads(); - FOREACH_THREAD(qy, y, Q1D) { - FOREACH_THREAD(qx, x, Q1D) { + FOREACH_THREAD(qy, y, MPA_Q1D) { + FOREACH_THREAD(qx, x, MPA_Q1D) { MASS3DPA_4 } } __syncthreads(); - FOREACH_THREAD(qy, y, Q1D) { - FOREACH_THREAD(qx, x, Q1D) { + FOREACH_THREAD(qy, y, MPA_Q1D) { + FOREACH_THREAD(qx, x, MPA_Q1D) { MASS3DPA_5 } } __syncthreads(); - FOREACH_THREAD(d, y, D1D) { - FOREACH_THREAD(q, x, Q1D) { + FOREACH_THREAD(d, y, MPA_D1D) { + FOREACH_THREAD(q, x, MPA_Q1D) { MASS3DPA_6 } } __syncthreads(); - FOREACH_THREAD(qy, y, Q1D) { - FOREACH_THREAD(dx, x, D1D) { + FOREACH_THREAD(qy, y, MPA_Q1D) { + FOREACH_THREAD(dx, x, MPA_D1D) { MASS3DPA_7 } } __syncthreads(); - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D) { + FOREACH_THREAD(dy, y, MPA_D1D) { + FOREACH_THREAD(dx, x, MPA_D1D) { MASS3DPA_8 } } __syncthreads(); - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D) { + FOREACH_THREAD(dy, y, MPA_D1D) { + FOREACH_THREAD(dx, x, MPA_D1D) { MASS3DPA_9 } } @@ -119,7 +112,7 @@ void MASS3DPA::runHipVariant(VariantID vid) { MASS3DPA_DATA_SETUP_HIP; dim3 grid_size(NE); - dim3 block_size(Q1D, Q1D, 1); + dim3 block_size(MPA_Q1D, MPA_Q1D, 1); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -163,22 +156,22 @@ void MASS3DPA::runHipVariant(VariantID vid) { 
RAJA::expt::launch( RAJA::expt::DEVICE, RAJA::expt::Grid(RAJA::expt::Teams(NE), - RAJA::expt::Threads(Q1D, Q1D, 1)), + RAJA::expt::Threads(MPA_Q1D, MPA_Q1D, 1)), [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NE), [&](int e) { MASS3DPA_0_GPU - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dx) { MASS3DPA_1 } ); // RAJA::expt::loop - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int dx) { MASS3DPA_2 } @@ -188,9 +181,9 @@ void MASS3DPA::runHipVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qx) { MASS3DPA_3 } @@ -200,9 +193,9 @@ void MASS3DPA::runHipVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qx) { MASS3DPA_4 } @@ -212,9 +205,9 @@ void MASS3DPA::runHipVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qx) { MASS3DPA_5 } @@ -224,9 +217,9 @@ void MASS3DPA::runHipVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int d) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int q) { MASS3DPA_6 } @@ -236,9 +229,9 @@ void MASS3DPA::runHipVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dx) { MASS3DPA_7 } @@ -248,9 +241,9 @@ void MASS3DPA::runHipVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dx) { MASS3DPA_8 } @@ -260,9 +253,9 @@ void MASS3DPA::runHipVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dx) { MASS3DPA_9 } diff --git a/src/apps/MASS3DPA-OMP.cpp b/src/apps/MASS3DPA-OMP.cpp index 2a7781fed..d69ef9427 100644 --- a/src/apps/MASS3DPA-OMP.cpp +++ b/src/apps/MASS3DPA-OMP.cpp @@ -15,14 +15,8 @@ namespace rajaperf { namespace apps { +// Uncomment to add compiler directives for loop unrolling //#define USE_RAJA_UNROLL -#define RAJA_DIRECT_PRAGMA(X) _Pragma(#X) -#if defined(USE_RAJA_UNROLL) -#define RAJA_UNROLL(N) RAJA_DIRECT_PRAGMA(unroll(N)) -#else -#define RAJA_UNROLL(N) -#endif 
-#define FOREACH_THREAD(i, k, N) for (int i = 0; i < N; i++) void MASS3DPA::runOpenMPVariant(VariantID vid) { @@ -43,53 +37,53 @@ void MASS3DPA::runOpenMPVariant(VariantID vid) { MASS3DPA_0_CPU - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D){ + FOREACH_THREAD(dy, y, MPA_D1D) { + FOREACH_THREAD(dx, x, MPA_D1D){ MASS3DPA_1 } - FOREACH_THREAD(dx, x, Q1D) { + FOREACH_THREAD(dx, x, MPA_Q1D) { MASS3DPA_2 } } - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(qx, x, Q1D) { + FOREACH_THREAD(dy, y, MPA_D1D) { + FOREACH_THREAD(qx, x, MPA_Q1D) { MASS3DPA_3 } } - FOREACH_THREAD(qy, y, Q1D) { - FOREACH_THREAD(qx, x, Q1D) { + FOREACH_THREAD(qy, y, MPA_Q1D) { + FOREACH_THREAD(qx, x, MPA_Q1D) { MASS3DPA_4 } } - FOREACH_THREAD(qy, y, Q1D) { - FOREACH_THREAD(qx, x, Q1D) { + FOREACH_THREAD(qy, y, MPA_Q1D) { + FOREACH_THREAD(qx, x, MPA_Q1D) { MASS3DPA_5 } } - FOREACH_THREAD(d, y, D1D) { - FOREACH_THREAD(q, x, Q1D) { + FOREACH_THREAD(d, y, MPA_D1D) { + FOREACH_THREAD(q, x, MPA_Q1D) { MASS3DPA_6 } } - FOREACH_THREAD(qy, y, Q1D) { - FOREACH_THREAD(dx, x, D1D) { + FOREACH_THREAD(qy, y, MPA_Q1D) { + FOREACH_THREAD(dx, x, MPA_D1D) { MASS3DPA_7 } } - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D) { + FOREACH_THREAD(dy, y, MPA_D1D) { + FOREACH_THREAD(dx, x, MPA_D1D) { MASS3DPA_8 } } - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D) { + FOREACH_THREAD(dy, y, MPA_D1D) { + FOREACH_THREAD(dx, x, MPA_D1D) { MASS3DPA_9 } } @@ -141,15 +135,15 @@ void MASS3DPA::runOpenMPVariant(VariantID vid) { MASS3DPA_0_CPU - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dx) { MASS3DPA_1 } ); // RAJA::expt::loop - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int dx) { MASS3DPA_2 } @@ -159,9 +153,9 @@ void MASS3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qx) { MASS3DPA_3 } @@ -171,9 +165,9 @@ void MASS3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qx) { MASS3DPA_4 } @@ -183,9 +177,9 @@ void MASS3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qx) { MASS3DPA_5 } @@ -195,9 +189,9 @@ void MASS3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int d) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int q) { MASS3DPA_6 } @@ -207,9 +201,9 @@ void MASS3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qy) { - RAJA::expt::loop(ctx, 
RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dx) { MASS3DPA_7 } @@ -219,9 +213,9 @@ void MASS3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dx) { MASS3DPA_8 } @@ -231,9 +225,9 @@ void MASS3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dx) { MASS3DPA_9 } diff --git a/src/apps/MASS3DPA-Seq.cpp b/src/apps/MASS3DPA-Seq.cpp index 027a9dacb..da54dafda 100644 --- a/src/apps/MASS3DPA-Seq.cpp +++ b/src/apps/MASS3DPA-Seq.cpp @@ -15,14 +15,8 @@ namespace rajaperf { namespace apps { +// Uncomment to add compiler directives for loop unrolling //#define USE_RAJA_UNROLL -#define RAJA_DIRECT_PRAGMA(X) _Pragma(#X) -#if defined(USE_RAJA_UNROLL) -#define RAJA_UNROLL(N) RAJA_DIRECT_PRAGMA(unroll(N)) -#else -#define RAJA_UNROLL(N) -#endif -#define FOREACH_THREAD(i, k, N) for (int i = 0; i < N; i++) void MASS3DPA::runSeqVariant(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -40,53 +34,53 @@ void MASS3DPA::runSeqVariant(VariantID vid) { MASS3DPA_0_CPU - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D){ + FOREACH_THREAD(dy, y, MPA_D1D) { + FOREACH_THREAD(dx, x, MPA_D1D){ MASS3DPA_1 } - FOREACH_THREAD(dx, x, Q1D) { + FOREACH_THREAD(dx, x, MPA_Q1D) { MASS3DPA_2 } } - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(qx, x, Q1D) { + FOREACH_THREAD(dy, y, MPA_D1D) { + FOREACH_THREAD(qx, x, MPA_Q1D) { MASS3DPA_3 } } - FOREACH_THREAD(qy, y, Q1D) { - FOREACH_THREAD(qx, x, Q1D) { + FOREACH_THREAD(qy, y, MPA_Q1D) { + FOREACH_THREAD(qx, x, MPA_Q1D) { MASS3DPA_4 } } - FOREACH_THREAD(qy, y, Q1D) { - FOREACH_THREAD(qx, x, Q1D) { + FOREACH_THREAD(qy, y, MPA_Q1D) { + FOREACH_THREAD(qx, x, MPA_Q1D) { MASS3DPA_5 } } - FOREACH_THREAD(d, y, D1D) { - FOREACH_THREAD(q, x, Q1D) { + FOREACH_THREAD(d, y, MPA_D1D) { + FOREACH_THREAD(q, x, MPA_Q1D) { MASS3DPA_6 } } - FOREACH_THREAD(qy, y, Q1D) { - FOREACH_THREAD(dx, x, D1D) { + FOREACH_THREAD(qy, y, MPA_Q1D) { + FOREACH_THREAD(dx, x, MPA_D1D) { MASS3DPA_7 } } - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D) { + FOREACH_THREAD(dy, y, MPA_D1D) { + FOREACH_THREAD(dx, x, MPA_D1D) { MASS3DPA_8 } } - FOREACH_THREAD(dy, y, D1D) { - FOREACH_THREAD(dx, x, D1D) { + FOREACH_THREAD(dy, y, MPA_D1D) { + FOREACH_THREAD(dx, x, MPA_D1D) { MASS3DPA_9 } } @@ -138,15 +132,15 @@ void MASS3DPA::runSeqVariant(VariantID vid) { MASS3DPA_0_CPU - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dx) { MASS3DPA_1 } ); // RAJA::expt::loop - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int dx) { MASS3DPA_2 } @@ -156,9 +150,9 @@ void MASS3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 
MPA_Q1D), [&](int qx) { MASS3DPA_3 } @@ -168,9 +162,9 @@ void MASS3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qx) { MASS3DPA_4 } @@ -180,9 +174,9 @@ void MASS3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qx) { MASS3DPA_5 } @@ -192,9 +186,9 @@ void MASS3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int d) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int q) { MASS3DPA_6 } @@ -204,9 +198,9 @@ void MASS3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dx) { MASS3DPA_7 } @@ -216,9 +210,9 @@ void MASS3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dx) { MASS3DPA_8 } @@ -228,9 +222,9 @@ void MASS3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), [&](int dx) { MASS3DPA_9 } diff --git a/src/apps/MASS3DPA.cpp b/src/apps/MASS3DPA.cpp index 6057127dc..9ede00404 100644 --- a/src/apps/MASS3DPA.cpp +++ b/src/apps/MASS3DPA.cpp @@ -25,28 +25,28 @@ MASS3DPA::MASS3DPA(const RunParams& params) { m_NE_default = 8000; - setDefaultProblemSize(m_NE_default*Q1D*Q1D*Q1D); + setDefaultProblemSize(m_NE_default*MPA_Q1D*MPA_Q1D*MPA_Q1D); setDefaultReps(50); - m_NE = std::max(getTargetProblemSize()/(Q1D*Q1D*Q1D), Index_type(1)); + m_NE = std::max(getTargetProblemSize()/(MPA_Q1D*MPA_Q1D*MPA_Q1D), Index_type(1)); - setActualProblemSize( m_NE*Q1D*Q1D*Q1D ); + setActualProblemSize( m_NE*MPA_Q1D*MPA_Q1D*MPA_Q1D ); setItsPerRep(getActualProblemSize()); setKernelsPerRep(1); - setBytesPerRep( Q1D*D1D*sizeof(Real_type) + - Q1D*D1D*sizeof(Real_type) + - Q1D*Q1D*Q1D*m_NE*sizeof(Real_type) + - D1D*D1D*D1D*m_NE*sizeof(Real_type) + - D1D*D1D*D1D*m_NE*sizeof(Real_type) ); - - setFLOPsPerRep(m_NE * (2 * D1D * D1D * D1D * Q1D + - 2 * D1D * D1D * Q1D * Q1D + - 2 * D1D * Q1D * Q1D * Q1D + Q1D * Q1D * Q1D + - 2 * Q1D * Q1D * Q1D * D1D + - 2 * Q1D * Q1D * D1D * D1D + - 2 * Q1D * D1D * D1D * D1D + D1D * D1D * D1D)); + setBytesPerRep( MPA_Q1D*MPA_D1D*sizeof(Real_type) + + MPA_Q1D*MPA_D1D*sizeof(Real_type) + + MPA_Q1D*MPA_Q1D*MPA_Q1D*m_NE*sizeof(Real_type) + + MPA_D1D*MPA_D1D*MPA_D1D*m_NE*sizeof(Real_type) + + MPA_D1D*MPA_D1D*MPA_D1D*m_NE*sizeof(Real_type) ); + + setFLOPsPerRep(m_NE * (2 * MPA_D1D * MPA_D1D * MPA_D1D * MPA_Q1D + + 2 * MPA_D1D * MPA_D1D * MPA_Q1D * MPA_Q1D + + 
2 * MPA_D1D * MPA_Q1D * MPA_Q1D * MPA_Q1D + MPA_Q1D * MPA_Q1D * MPA_Q1D + + 2 * MPA_Q1D * MPA_Q1D * MPA_Q1D * MPA_D1D + + 2 * MPA_Q1D * MPA_Q1D * MPA_D1D * MPA_D1D + + 2 * MPA_Q1D * MPA_D1D * MPA_D1D * MPA_D1D + MPA_D1D * MPA_D1D * MPA_D1D)); setUsesFeature(Teams); setVariantDefined( Base_Seq ); @@ -70,16 +70,16 @@ MASS3DPA::~MASS3DPA() void MASS3DPA::setUp(VariantID vid) { - allocAndInitDataConst(m_B, int(Q1D*D1D), Real_type(1.0), vid); - allocAndInitDataConst(m_Bt,int(Q1D*D1D), Real_type(1.0), vid); - allocAndInitDataConst(m_D, int(Q1D*Q1D*Q1D*m_NE), Real_type(1.0), vid); - allocAndInitDataConst(m_X, int(D1D*D1D*D1D*m_NE), Real_type(1.0), vid); - allocAndInitDataConst(m_Y, int(D1D*D1D*D1D*m_NE), Real_type(0.0), vid); + allocAndInitDataConst(m_B, int(MPA_Q1D*MPA_D1D), Real_type(1.0), vid); + allocAndInitDataConst(m_Bt,int(MPA_Q1D*MPA_D1D), Real_type(1.0), vid); + allocAndInitDataConst(m_D, int(MPA_Q1D*MPA_Q1D*MPA_Q1D*m_NE), Real_type(1.0), vid); + allocAndInitDataConst(m_X, int(MPA_D1D*MPA_D1D*MPA_D1D*m_NE), Real_type(1.0), vid); + allocAndInitDataConst(m_Y, int(MPA_D1D*MPA_D1D*MPA_D1D*m_NE), Real_type(0.0), vid); } void MASS3DPA::updateChecksum(VariantID vid) { - checksum[vid] += calcChecksum(m_Y, D1D*D1D*D1D*m_NE); + checksum[vid] += calcChecksum(m_Y, MPA_D1D*MPA_D1D*MPA_D1D*m_NE); } void MASS3DPA::tearDown(VariantID vid) diff --git a/src/apps/MASS3DPA.hpp b/src/apps/MASS3DPA.hpp index 9e2255fdd..76959f33b 100644 --- a/src/apps/MASS3DPA.hpp +++ b/src/apps/MASS3DPA.hpp @@ -7,7 +7,7 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// /// -/// Action of 3D Mass matrix via partial assembly +/// Action of 3D mass matrix via partial assembly /// /// Based on MFEM's/CEED algorithms. /// Reference implementation @@ -15,8 +15,8 @@ /// /// for (int e = 0; e < NE; ++e) { /// -/// constexpr int MQ1 = Q1D; -/// constexpr int MD1 = D1D; +/// constexpr int MQ1 = MPA_Q1D; +/// constexpr int MD1 = MPA_D1D; /// constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1; /// double sDQ[MQ1 * MD1]; /// double(*Bsmem)[MD1] = (double(*)[MD1])sDQ; @@ -30,120 +30,120 @@ /// double(*QQD)[MQ1][MD1] = (double(*)[MQ1][MD1])sm0; /// double(*QDD)[MD1][MD1] = (double(*)[MD1][MD1])sm1; /// -/// for(int dy=0; dy MD1) ? MQ1 : MD1; \ double sDQ[MQ1 * MD1]; \ double(*Bsmem)[MD1] = (double(*)[MD1])sDQ; \ @@ -196,8 +197,8 @@ Index_type NE = m_NE; double(*QDD)[MD1][MD1] = (double(*)[MD1][MD1])sm1; #define MASS3DPA_0_GPU \ - constexpr int MQ1 = Q1D; \ - constexpr int MD1 = D1D; \ + constexpr int MQ1 = MPA_Q1D; \ + constexpr int MD1 = MPA_D1D; \ constexpr int MDQ = (MQ1 > MD1) ? 
MQ1 : MD1; \ RAJA_TEAM_SHARED double sDQ[MQ1 * MD1]; \ double(*Bsmem)[MD1] = (double(*)[MD1])sDQ; \ @@ -213,127 +214,127 @@ Index_type NE = m_NE; #define MASS3DPA_1 \ RAJA_UNROLL(MD1) \ -for (int dz = 0; dz< D1D; ++dz) { \ +for (int dz = 0; dz< MPA_D1D; ++dz) { \ Xsmem[dz][dy][dx] = X_(dx, dy, dz, e); \ } #define MASS3DPA_2 \ Bsmem[dx][dy] = B_(dx, dy); -// 2 * D1D * D1D * D1D * Q1D +// 2 * MPA_D1D * MPA_D1D * MPA_D1D * MPA_Q1D #define MASS3DPA_3 \ - double u[D1D]; \ + double u[MPA_D1D]; \ RAJA_UNROLL(MD1) \ -for (int dz = 0; dz < D1D; dz++) { \ +for (int dz = 0; dz < MPA_D1D; dz++) { \ u[dz] = 0; \ } \ RAJA_UNROLL(MD1) \ -for (int dx = 0; dx < D1D; ++dx) { \ +for (int dx = 0; dx < MPA_D1D; ++dx) { \ RAJA_UNROLL(MD1) \ -for (int dz = 0; dz < D1D; ++dz) { \ +for (int dz = 0; dz < MPA_D1D; ++dz) { \ u[dz] += Xsmem[dz][dy][dx] * Bsmem[qx][dx]; \ } \ } \ RAJA_UNROLL(MD1) \ -for (int dz = 0; dz < D1D; ++dz) { \ +for (int dz = 0; dz < MPA_D1D; ++dz) { \ DDQ[dz][dy][qx] = u[dz]; \ } -//2 * D1D * D1D * Q1D * Q1D +//2 * MPA_D1D * MPA_D1D * MPA_Q1D * MPA_Q1D #define MASS3DPA_4 \ - double u[D1D]; \ + double u[MPA_D1D]; \ RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < D1D; dz++) { \ + for (int dz = 0; dz < MPA_D1D; dz++) { \ u[dz] = 0; \ } \ RAJA_UNROLL(MD1) \ - for (int dy = 0; dy < D1D; ++dy) { \ + for (int dy = 0; dy < MPA_D1D; ++dy) { \ RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < D1D; dz++) { \ + for (int dz = 0; dz < MPA_D1D; dz++) { \ u[dz] += DDQ[dz][dy][qx] * Bsmem[qy][dy]; \ } \ } \ RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < D1D; dz++) { \ + for (int dz = 0; dz < MPA_D1D; dz++) { \ DQQ[dz][qy][qx] = u[dz]; \ } -//2 * D1D * Q1D * Q1D * Q1D + Q1D * Q1D * Q1D +//2 * MPA_D1D * MPA_Q1D * MPA_Q1D * MPA_Q1D + MPA_Q1D * MPA_Q1D * MPA_Q1D #define MASS3DPA_5 \ - double u[Q1D]; \ + double u[MPA_Q1D]; \ RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < Q1D; qz++) { \ + for (int qz = 0; qz < MPA_Q1D; qz++) { \ u[qz] = 0; \ } \ RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < D1D; ++dz) { \ + for (int dz = 0; dz < MPA_D1D; ++dz) { \ RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < Q1D; qz++) { \ + for (int qz = 0; qz < MPA_Q1D; qz++) { \ u[qz] += DQQ[dz][qy][qx] * Bsmem[qz][dz]; \ } \ } \ RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < Q1D; qz++) { \ + for (int qz = 0; qz < MPA_Q1D; qz++) { \ QQQ[qz][qy][qx] = u[qz] * D_(qx, qy, qz, e); \ } #define MASS3DPA_6 \ Btsmem[d][q] = Bt_(q, d); -//2 * Q1D * Q1D * Q1D * D1D +//2 * MPA_Q1D * MPA_Q1D * MPA_Q1D * MPA_D1D #define MASS3DPA_7 \ - double u[Q1D]; \ + double u[MPA_Q1D]; \ RAJA_UNROLL(MQ1) \ -for (int qz = 0; qz < Q1D; ++qz) { \ +for (int qz = 0; qz < MPA_Q1D; ++qz) { \ u[qz] = 0; \ } \ RAJA_UNROLL(MQ1) \ -for (int qx = 0; qx < Q1D; ++qx) { \ +for (int qx = 0; qx < MPA_Q1D; ++qx) { \ RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < Q1D; ++qz) { \ + for (int qz = 0; qz < MPA_Q1D; ++qz) { \ u[qz] += QQQ[qz][qy][qx] * Btsmem[dx][qx]; \ } \ } \ RAJA_UNROLL(MQ1) \ -for (int qz = 0; qz < Q1D; ++qz) { \ +for (int qz = 0; qz < MPA_Q1D; ++qz) { \ QQD[qz][qy][dx] = u[qz]; \ } -// 2 * Q1D * Q1D * D1D * D1D +// 2 * MPA_Q1D * MPA_Q1D * MPA_D1D * MPA_D1D #define MASS3DPA_8 \ - double u[Q1D]; \ + double u[MPA_Q1D]; \ RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < Q1D; ++qz) { \ + for (int qz = 0; qz < MPA_Q1D; ++qz) { \ u[qz] = 0; \ } \ RAJA_UNROLL(MQ1) \ - for (int qy = 0; qy < Q1D; ++qy) { \ + for (int qy = 0; qy < MPA_Q1D; ++qy) { \ RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < Q1D; ++qz) { \ + for (int qz = 0; qz < MPA_Q1D; ++qz) { \ u[qz] += QQD[qz][qy][dx] * Btsmem[dy][qy]; \ } \ } \ RAJA_UNROLL(MQ1) \ - for 
(int qz = 0; qz < Q1D; ++qz) { \ + for (int qz = 0; qz < MPA_Q1D; ++qz) { \ QDD[qz][dy][dx] = u[qz]; \ } -//2 * Q1D * D1D * D1D * D1D + D1D * D1D * D1D +//2 * MPA_Q1D * MPA_D1D * MPA_D1D * MPA_D1D + MPA_D1D * MPA_D1D * MPA_D1D #define MASS3DPA_9 \ - double u[D1D]; \ + double u[MPA_D1D]; \ RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < D1D; ++dz) { \ + for (int dz = 0; dz < MPA_D1D; ++dz) { \ u[dz] = 0; \ } \ RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < Q1D; ++qz) { \ + for (int qz = 0; qz < MPA_Q1D; ++qz) { \ RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < D1D; ++dz) { \ + for (int dz = 0; dz < MPA_D1D; ++dz) { \ u[dz] += QDD[qz][dy][dx] * Btsmem[dz][qz]; \ } \ } \ RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < D1D; ++dz) { \ + for (int dz = 0; dz < MPA_D1D; ++dz) { \ Y_(dx, dy, dz, e) += u[dz]; \ } From 7e88fda41238136d7747b839ead115032baedf2b Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Thu, 30 Sep 2021 13:35:46 -0700 Subject: [PATCH 127/392] fix macros for hipcc/clang builds --- src/apps/DIFFUSION3DPA-Cuda.cpp | 34 ++++++++++++++++----------------- src/apps/DIFFUSION3DPA-Hip.cpp | 34 ++++++++++++++++----------------- src/apps/DIFFUSION3DPA-OMP.cpp | 34 ++++++++++++++++----------------- src/apps/DIFFUSION3DPA-Seq.cpp | 34 ++++++++++++++++----------------- src/apps/FEM_MACROS.hpp | 12 ++++++++---- src/apps/MASS3DPA-Cuda.cpp | 34 ++++++++++++++++----------------- src/apps/MASS3DPA-Hip.cpp | 34 ++++++++++++++++----------------- src/apps/MASS3DPA-OMP.cpp | 34 ++++++++++++++++----------------- src/apps/MASS3DPA-Seq.cpp | 34 ++++++++++++++++----------------- 9 files changed, 144 insertions(+), 140 deletions(-) diff --git a/src/apps/DIFFUSION3DPA-Cuda.cpp b/src/apps/DIFFUSION3DPA-Cuda.cpp index 8aa12d5c4..2c7edbe23 100644 --- a/src/apps/DIFFUSION3DPA-Cuda.cpp +++ b/src/apps/DIFFUSION3DPA-Cuda.cpp @@ -44,54 +44,54 @@ __global__ void Diffusion3DPA(Index_type NE, const Real_ptr Basis, const Real_pt DIFFUSION3DPA_0_GPU; - FOREACH_THREAD(dy, y, DPA_D1D) { - FOREACH_THREAD(dx, x, DPA_D1D) { + GPU_FOREACH_THREAD(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD(dx, x, DPA_D1D) { DIFFUSION3DPA_1; } - FOREACH_THREAD(qx, x, DPA_Q1D) { + GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { DIFFUSION3DPA_2; } } __syncthreads(); - FOREACH_THREAD(dy, y, DPA_D1D) { - FOREACH_THREAD(qx, x, DPA_Q1D) { + GPU_FOREACH_THREAD(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { DIFFUSION3DPA_3; } } __syncthreads(); - FOREACH_THREAD(qy, y, DPA_Q1D) { - FOREACH_THREAD(qx, x, DPA_Q1D) { + GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { + GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { DIFFUSION3DPA_4; } } __syncthreads(); - FOREACH_THREAD(qy, y, DPA_Q1D) { - FOREACH_THREAD(qx, x, DPA_Q1D) { + GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { + GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { DIFFUSION3DPA_5; } } __syncthreads(); - FOREACH_THREAD(d, y, DPA_D1D) { - FOREACH_THREAD(q, x, DPA_Q1D) { + GPU_FOREACH_THREAD(d, y, DPA_D1D) { + GPU_FOREACH_THREAD(q, x, DPA_Q1D) { DIFFUSION3DPA_6; } } __syncthreads(); - FOREACH_THREAD(qy, y, DPA_Q1D) { - FOREACH_THREAD(dx, x, DPA_D1D) { + GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { + GPU_FOREACH_THREAD(dx, x, DPA_D1D) { DIFFUSION3DPA_7; } } __syncthreads(); - FOREACH_THREAD(dy, y, DPA_D1D) { - FOREACH_THREAD(dx, x, DPA_D1D) { + GPU_FOREACH_THREAD(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD(dx, x, DPA_D1D) { DIFFUSION3DPA_8; } } __syncthreads(); - FOREACH_THREAD(dy, y, DPA_D1D) { - FOREACH_THREAD(dx, x, DPA_D1D) { + GPU_FOREACH_THREAD(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD(dx, x, DPA_D1D) { DIFFUSION3DPA_9; } } diff --git a/src/apps/DIFFUSION3DPA-Hip.cpp 
b/src/apps/DIFFUSION3DPA-Hip.cpp index 1a3060ac0..945f571d6 100644 --- a/src/apps/DIFFUSION3DPA-Hip.cpp +++ b/src/apps/DIFFUSION3DPA-Hip.cpp @@ -44,54 +44,54 @@ __global__ void Diffusion3DPA(Index_type NE, const Real_ptr Basis, const Real_pt DIFFUSION3DPA_0_GPU; - FOREACH_THREAD(dy, y, DPA_D1D) { - FOREACH_THREAD(dx, x, DPA_D1D) { + GPU_FOREACH_THREAD(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD(dx, x, DPA_D1D) { DIFFUSION3DPA_1; } - FOREACH_THREAD(qx, x, DPA_Q1D) { + GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { DIFFUSION3DPA_2; } } __syncthreads(); - FOREACH_THREAD(dy, y, DPA_D1D) { - FOREACH_THREAD(qx, x, DPA_Q1D) { + GPU_FOREACH_THREAD(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { DIFFUSION3DPA_3; } } __syncthreads(); - FOREACH_THREAD(qy, y, DPA_Q1D) { - FOREACH_THREAD(qx, x, DPA_Q1D) { + GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { + GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { DIFFUSION3DPA_4; } } __syncthreads(); - FOREACH_THREAD(qy, y, DPA_Q1D) { - FOREACH_THREAD(qx, x, DPA_Q1D) { + GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { + GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { DIFFUSION3DPA_5; } } __syncthreads(); - FOREACH_THREAD(d, y, DPA_D1D) { - FOREACH_THREAD(q, x, DPA_Q1D) { + GPU_FOREACH_THREAD(d, y, DPA_D1D) { + GPU_FOREACH_THREAD(q, x, DPA_Q1D) { DIFFUSION3DPA_6; } } __syncthreads(); - FOREACH_THREAD(qy, y, DPA_Q1D) { - FOREACH_THREAD(dx, x, DPA_D1D) { + GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { + GPU_FOREACH_THREAD(dx, x, DPA_D1D) { DIFFUSION3DPA_7; } } __syncthreads(); - FOREACH_THREAD(dy, y, DPA_D1D) { - FOREACH_THREAD(dx, x, DPA_D1D) { + GPU_FOREACH_THREAD(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD(dx, x, DPA_D1D) { DIFFUSION3DPA_8; } } __syncthreads(); - FOREACH_THREAD(dy, y, DPA_D1D) { - FOREACH_THREAD(dx, x, DPA_D1D) { + GPU_FOREACH_THREAD(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD(dx, x, DPA_D1D) { DIFFUSION3DPA_9; } } diff --git a/src/apps/DIFFUSION3DPA-OMP.cpp b/src/apps/DIFFUSION3DPA-OMP.cpp index 02a10c450..6a5e5b59a 100644 --- a/src/apps/DIFFUSION3DPA-OMP.cpp +++ b/src/apps/DIFFUSION3DPA-OMP.cpp @@ -37,54 +37,54 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { DIFFUSION3DPA_0_CPU; - FOREACH_THREAD(dy, y, DPA_D1D) { - FOREACH_THREAD(dx, x, DPA_D1D) { + CPU_FOREACH(dy, y, DPA_D1D) { + CPU_FOREACH(dx, x, DPA_D1D) { DIFFUSION3DPA_1; } - FOREACH_THREAD(qx, x, DPA_Q1D) { + CPU_FOREACH(qx, x, DPA_Q1D) { DIFFUSION3DPA_2; } } - FOREACH_THREAD(dy, y, DPA_D1D) { - FOREACH_THREAD(qx, x, DPA_Q1D) { + CPU_FOREACH(dy, y, DPA_D1D) { + CPU_FOREACH(qx, x, DPA_Q1D) { DIFFUSION3DPA_3; } } - FOREACH_THREAD(qy, y, DPA_Q1D) { - FOREACH_THREAD(qx, x, DPA_Q1D) { + CPU_FOREACH(qy, y, DPA_Q1D) { + CPU_FOREACH(qx, x, DPA_Q1D) { DIFFUSION3DPA_4; } } - FOREACH_THREAD(qy, y, DPA_Q1D) { - FOREACH_THREAD(qx, x, DPA_Q1D) { + CPU_FOREACH(qy, y, DPA_Q1D) { + CPU_FOREACH(qx, x, DPA_Q1D) { DIFFUSION3DPA_5; } } - FOREACH_THREAD(d, y, DPA_D1D) { - FOREACH_THREAD(q, x, DPA_Q1D) { + CPU_FOREACH(d, y, DPA_D1D) { + CPU_FOREACH(q, x, DPA_Q1D) { DIFFUSION3DPA_6; } } - FOREACH_THREAD(qy, y, DPA_Q1D) { - FOREACH_THREAD(dx, x, DPA_D1D) { + CPU_FOREACH(qy, y, DPA_Q1D) { + CPU_FOREACH(dx, x, DPA_D1D) { DIFFUSION3DPA_7; } } - FOREACH_THREAD(dy, y, DPA_D1D) { - FOREACH_THREAD(dx, x, DPA_D1D) { + CPU_FOREACH(dy, y, DPA_D1D) { + CPU_FOREACH(dx, x, DPA_D1D) { DIFFUSION3DPA_8; } } - FOREACH_THREAD(dy, y, DPA_D1D) { - FOREACH_THREAD(dx, x, DPA_D1D) { + CPU_FOREACH(dy, y, DPA_D1D) { + CPU_FOREACH(dx, x, DPA_D1D) { DIFFUSION3DPA_9; } } diff --git a/src/apps/DIFFUSION3DPA-Seq.cpp b/src/apps/DIFFUSION3DPA-Seq.cpp index 35ecd74bc..8c3626ad9 100644 --- 
a/src/apps/DIFFUSION3DPA-Seq.cpp +++ b/src/apps/DIFFUSION3DPA-Seq.cpp @@ -34,54 +34,54 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { DIFFUSION3DPA_0_CPU; - FOREACH_THREAD(dy, y, DPA_D1D) { - FOREACH_THREAD(dx, x, DPA_D1D) { + CPU_FOREACH(dy, y, DPA_D1D) { + CPU_FOREACH(dx, x, DPA_D1D) { DIFFUSION3DPA_1; } - FOREACH_THREAD(qx, x, DPA_Q1D) { + CPU_FOREACH(qx, x, DPA_Q1D) { DIFFUSION3DPA_2; } } - FOREACH_THREAD(dy, y, DPA_D1D) { - FOREACH_THREAD(qx, x, DPA_Q1D) { + CPU_FOREACH(dy, y, DPA_D1D) { + CPU_FOREACH(qx, x, DPA_Q1D) { DIFFUSION3DPA_3; } } - FOREACH_THREAD(qy, y, DPA_Q1D) { - FOREACH_THREAD(qx, x, DPA_Q1D) { + CPU_FOREACH(qy, y, DPA_Q1D) { + CPU_FOREACH(qx, x, DPA_Q1D) { DIFFUSION3DPA_4; } } - FOREACH_THREAD(qy, y, DPA_Q1D) { - FOREACH_THREAD(qx, x, DPA_Q1D) { + CPU_FOREACH(qy, y, DPA_Q1D) { + CPU_FOREACH(qx, x, DPA_Q1D) { DIFFUSION3DPA_5; } } - FOREACH_THREAD(d, y, DPA_D1D) { - FOREACH_THREAD(q, x, DPA_Q1D) { + CPU_FOREACH(d, y, DPA_D1D) { + CPU_FOREACH(q, x, DPA_Q1D) { DIFFUSION3DPA_6; } } - FOREACH_THREAD(qy, y, DPA_Q1D) { - FOREACH_THREAD(dx, x, DPA_D1D) { + CPU_FOREACH(qy, y, DPA_Q1D) { + CPU_FOREACH(dx, x, DPA_D1D) { DIFFUSION3DPA_7; } } - FOREACH_THREAD(dy, y, DPA_D1D) { - FOREACH_THREAD(dx, x, DPA_D1D) { + CPU_FOREACH(dy, y, DPA_D1D) { + CPU_FOREACH(dx, x, DPA_D1D) { DIFFUSION3DPA_8; } } - FOREACH_THREAD(dy, y, DPA_D1D) { - FOREACH_THREAD(dx, x, DPA_D1D) { + CPU_FOREACH(dy, y, DPA_D1D) { + CPU_FOREACH(dx, x, DPA_D1D) { DIFFUSION3DPA_9; } } diff --git a/src/apps/FEM_MACROS.hpp b/src/apps/FEM_MACROS.hpp index bbf19b02a..8a0b1b400 100644 --- a/src/apps/FEM_MACROS.hpp +++ b/src/apps/FEM_MACROS.hpp @@ -15,11 +15,15 @@ #define RAJA_UNROLL(N) #endif -#if defined(RAJA_DEVICE_CODE) -#define FOREACH_THREAD(i, k, N) \ +// Need two different host/device macros due to +// how hipcc/clang works. +// See note in MAT_MAT_SHARED regarding hipcc/clang +// builds. 
+#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) +#define GPU_FOREACH_THREAD(i, k, N) \ for (int i = threadIdx.k; i < N; i += blockDim.k) -#else -#define FOREACH_THREAD(i, k, N) for (int i = 0; i < N; i++) #endif +#define CPU_FOREACH(i, k, N) for (int i = 0; i < N; i++) + #endif // closing endif for header file include guard diff --git a/src/apps/MASS3DPA-Cuda.cpp b/src/apps/MASS3DPA-Cuda.cpp index 7353ee1f1..448cbea18 100644 --- a/src/apps/MASS3DPA-Cuda.cpp +++ b/src/apps/MASS3DPA-Cuda.cpp @@ -44,57 +44,57 @@ __global__ void Mass3DPA(Index_type NE, const Real_ptr B, const Real_ptr Bt, MASS3DPA_0_GPU - FOREACH_THREAD(dy, y, MPA_D1D) { - FOREACH_THREAD(dx, x, MPA_D1D){ + GPU_FOREACH_THREAD(dy, y, MPA_D1D) { + GPU_FOREACH_THREAD(dx, x, MPA_D1D){ MASS3DPA_1 } - FOREACH_THREAD(dx, x, MPA_Q1D) { + GPU_FOREACH_THREAD(dx, x, MPA_Q1D) { MASS3DPA_2 } } __syncthreads(); - FOREACH_THREAD(dy, y, MPA_D1D) { - FOREACH_THREAD(qx, x, MPA_Q1D) { + GPU_FOREACH_THREAD(dy, y, MPA_D1D) { + GPU_FOREACH_THREAD(qx, x, MPA_Q1D) { MASS3DPA_3 } } __syncthreads(); - FOREACH_THREAD(qy, y, MPA_Q1D) { - FOREACH_THREAD(qx, x, MPA_Q1D) { + GPU_FOREACH_THREAD(qy, y, MPA_Q1D) { + GPU_FOREACH_THREAD(qx, x, MPA_Q1D) { MASS3DPA_4 } } __syncthreads(); - FOREACH_THREAD(qy, y, MPA_Q1D) { - FOREACH_THREAD(qx, x, MPA_Q1D) { + GPU_FOREACH_THREAD(qy, y, MPA_Q1D) { + GPU_FOREACH_THREAD(qx, x, MPA_Q1D) { MASS3DPA_5 } } __syncthreads(); - FOREACH_THREAD(d, y, MPA_D1D) { - FOREACH_THREAD(q, x, MPA_Q1D) { + GPU_FOREACH_THREAD(d, y, MPA_D1D) { + GPU_FOREACH_THREAD(q, x, MPA_Q1D) { MASS3DPA_6 } } __syncthreads(); - FOREACH_THREAD(qy, y, MPA_Q1D) { - FOREACH_THREAD(dx, x, MPA_D1D) { + GPU_FOREACH_THREAD(qy, y, MPA_Q1D) { + GPU_FOREACH_THREAD(dx, x, MPA_D1D) { MASS3DPA_7 } } __syncthreads(); - FOREACH_THREAD(dy, y, MPA_D1D) { - FOREACH_THREAD(dx, x, MPA_D1D) { + GPU_FOREACH_THREAD(dy, y, MPA_D1D) { + GPU_FOREACH_THREAD(dx, x, MPA_D1D) { MASS3DPA_8 } } __syncthreads(); - FOREACH_THREAD(dy, y, MPA_D1D) { - FOREACH_THREAD(dx, x, MPA_D1D) { + GPU_FOREACH_THREAD(dy, y, MPA_D1D) { + GPU_FOREACH_THREAD(dx, x, MPA_D1D) { MASS3DPA_9 } } diff --git a/src/apps/MASS3DPA-Hip.cpp b/src/apps/MASS3DPA-Hip.cpp index 07cbb29b9..396269433 100644 --- a/src/apps/MASS3DPA-Hip.cpp +++ b/src/apps/MASS3DPA-Hip.cpp @@ -44,57 +44,57 @@ __global__ void Mass3DPA(Index_type NE, const Real_ptr B, const Real_ptr Bt, MASS3DPA_0_GPU - FOREACH_THREAD(dy, y, MPA_D1D) { - FOREACH_THREAD(dx, x, MPA_D1D){ + GPU_FOREACH_THREAD(dy, y, MPA_D1D) { + GPU_FOREACH_THREAD(dx, x, MPA_D1D){ MASS3DPA_1 } - FOREACH_THREAD(dx, x, MPA_Q1D) { + GPU_FOREACH_THREAD(dx, x, MPA_Q1D) { MASS3DPA_2 } } __syncthreads(); - FOREACH_THREAD(dy, y, MPA_D1D) { - FOREACH_THREAD(qx, x, MPA_Q1D) { + GPU_FOREACH_THREAD(dy, y, MPA_D1D) { + GPU_FOREACH_THREAD(qx, x, MPA_Q1D) { MASS3DPA_3 } } __syncthreads(); - FOREACH_THREAD(qy, y, MPA_Q1D) { - FOREACH_THREAD(qx, x, MPA_Q1D) { + GPU_FOREACH_THREAD(qy, y, MPA_Q1D) { + GPU_FOREACH_THREAD(qx, x, MPA_Q1D) { MASS3DPA_4 } } __syncthreads(); - FOREACH_THREAD(qy, y, MPA_Q1D) { - FOREACH_THREAD(qx, x, MPA_Q1D) { + GPU_FOREACH_THREAD(qy, y, MPA_Q1D) { + GPU_FOREACH_THREAD(qx, x, MPA_Q1D) { MASS3DPA_5 } } __syncthreads(); - FOREACH_THREAD(d, y, MPA_D1D) { - FOREACH_THREAD(q, x, MPA_Q1D) { + GPU_FOREACH_THREAD(d, y, MPA_D1D) { + GPU_FOREACH_THREAD(q, x, MPA_Q1D) { MASS3DPA_6 } } __syncthreads(); - FOREACH_THREAD(qy, y, MPA_Q1D) { - FOREACH_THREAD(dx, x, MPA_D1D) { + GPU_FOREACH_THREAD(qy, y, MPA_Q1D) { + GPU_FOREACH_THREAD(dx, x, MPA_D1D) { MASS3DPA_7 } } 
__syncthreads(); - FOREACH_THREAD(dy, y, MPA_D1D) { - FOREACH_THREAD(dx, x, MPA_D1D) { + GPU_FOREACH_THREAD(dy, y, MPA_D1D) { + GPU_FOREACH_THREAD(dx, x, MPA_D1D) { MASS3DPA_8 } } __syncthreads(); - FOREACH_THREAD(dy, y, MPA_D1D) { - FOREACH_THREAD(dx, x, MPA_D1D) { + GPU_FOREACH_THREAD(dy, y, MPA_D1D) { + GPU_FOREACH_THREAD(dx, x, MPA_D1D) { MASS3DPA_9 } } diff --git a/src/apps/MASS3DPA-OMP.cpp b/src/apps/MASS3DPA-OMP.cpp index d69ef9427..bca35a662 100644 --- a/src/apps/MASS3DPA-OMP.cpp +++ b/src/apps/MASS3DPA-OMP.cpp @@ -37,53 +37,53 @@ void MASS3DPA::runOpenMPVariant(VariantID vid) { MASS3DPA_0_CPU - FOREACH_THREAD(dy, y, MPA_D1D) { - FOREACH_THREAD(dx, x, MPA_D1D){ + CPU_FOREACH(dy, y, MPA_D1D) { + CPU_FOREACH(dx, x, MPA_D1D){ MASS3DPA_1 } - FOREACH_THREAD(dx, x, MPA_Q1D) { + CPU_FOREACH(dx, x, MPA_Q1D) { MASS3DPA_2 } } - FOREACH_THREAD(dy, y, MPA_D1D) { - FOREACH_THREAD(qx, x, MPA_Q1D) { + CPU_FOREACH(dy, y, MPA_D1D) { + CPU_FOREACH(qx, x, MPA_Q1D) { MASS3DPA_3 } } - FOREACH_THREAD(qy, y, MPA_Q1D) { - FOREACH_THREAD(qx, x, MPA_Q1D) { + CPU_FOREACH(qy, y, MPA_Q1D) { + CPU_FOREACH(qx, x, MPA_Q1D) { MASS3DPA_4 } } - FOREACH_THREAD(qy, y, MPA_Q1D) { - FOREACH_THREAD(qx, x, MPA_Q1D) { + CPU_FOREACH(qy, y, MPA_Q1D) { + CPU_FOREACH(qx, x, MPA_Q1D) { MASS3DPA_5 } } - FOREACH_THREAD(d, y, MPA_D1D) { - FOREACH_THREAD(q, x, MPA_Q1D) { + CPU_FOREACH(d, y, MPA_D1D) { + CPU_FOREACH(q, x, MPA_Q1D) { MASS3DPA_6 } } - FOREACH_THREAD(qy, y, MPA_Q1D) { - FOREACH_THREAD(dx, x, MPA_D1D) { + CPU_FOREACH(qy, y, MPA_Q1D) { + CPU_FOREACH(dx, x, MPA_D1D) { MASS3DPA_7 } } - FOREACH_THREAD(dy, y, MPA_D1D) { - FOREACH_THREAD(dx, x, MPA_D1D) { + CPU_FOREACH(dy, y, MPA_D1D) { + CPU_FOREACH(dx, x, MPA_D1D) { MASS3DPA_8 } } - FOREACH_THREAD(dy, y, MPA_D1D) { - FOREACH_THREAD(dx, x, MPA_D1D) { + CPU_FOREACH(dy, y, MPA_D1D) { + CPU_FOREACH(dx, x, MPA_D1D) { MASS3DPA_9 } } diff --git a/src/apps/MASS3DPA-Seq.cpp b/src/apps/MASS3DPA-Seq.cpp index da54dafda..70f2fd846 100644 --- a/src/apps/MASS3DPA-Seq.cpp +++ b/src/apps/MASS3DPA-Seq.cpp @@ -34,53 +34,53 @@ void MASS3DPA::runSeqVariant(VariantID vid) { MASS3DPA_0_CPU - FOREACH_THREAD(dy, y, MPA_D1D) { - FOREACH_THREAD(dx, x, MPA_D1D){ + CPU_FOREACH(dy, y, MPA_D1D) { + CPU_FOREACH(dx, x, MPA_D1D){ MASS3DPA_1 } - FOREACH_THREAD(dx, x, MPA_Q1D) { + CPU_FOREACH(dx, x, MPA_Q1D) { MASS3DPA_2 } } - FOREACH_THREAD(dy, y, MPA_D1D) { - FOREACH_THREAD(qx, x, MPA_Q1D) { + CPU_FOREACH(dy, y, MPA_D1D) { + CPU_FOREACH(qx, x, MPA_Q1D) { MASS3DPA_3 } } - FOREACH_THREAD(qy, y, MPA_Q1D) { - FOREACH_THREAD(qx, x, MPA_Q1D) { + CPU_FOREACH(qy, y, MPA_Q1D) { + CPU_FOREACH(qx, x, MPA_Q1D) { MASS3DPA_4 } } - FOREACH_THREAD(qy, y, MPA_Q1D) { - FOREACH_THREAD(qx, x, MPA_Q1D) { + CPU_FOREACH(qy, y, MPA_Q1D) { + CPU_FOREACH(qx, x, MPA_Q1D) { MASS3DPA_5 } } - FOREACH_THREAD(d, y, MPA_D1D) { - FOREACH_THREAD(q, x, MPA_Q1D) { + CPU_FOREACH(d, y, MPA_D1D) { + CPU_FOREACH(q, x, MPA_Q1D) { MASS3DPA_6 } } - FOREACH_THREAD(qy, y, MPA_Q1D) { - FOREACH_THREAD(dx, x, MPA_D1D) { + CPU_FOREACH(qy, y, MPA_Q1D) { + CPU_FOREACH(dx, x, MPA_D1D) { MASS3DPA_7 } } - FOREACH_THREAD(dy, y, MPA_D1D) { - FOREACH_THREAD(dx, x, MPA_D1D) { + CPU_FOREACH(dy, y, MPA_D1D) { + CPU_FOREACH(dx, x, MPA_D1D) { MASS3DPA_8 } } - FOREACH_THREAD(dy, y, MPA_D1D) { - FOREACH_THREAD(dx, x, MPA_D1D) { + CPU_FOREACH(dy, y, MPA_D1D) { + CPU_FOREACH(dx, x, MPA_D1D) { MASS3DPA_9 } } From 4a3811d537157a35c5c3cb1841ef62f15714ec74 Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Thu, 30 Sep 2021 15:02:16 -0700 Subject: [PATCH 128/392] taking out 
unnecessary flag in lassen gitlab ci file --- .gitlab/lassen-jobs.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitlab/lassen-jobs.yml b/.gitlab/lassen-jobs.yml index 0a3d0ad0a..8554ff1ad 100644 --- a/.gitlab/lassen-jobs.yml +++ b/.gitlab/lassen-jobs.yml @@ -12,7 +12,6 @@ clang_11_0_0: variables: SPEC: "%clang@11.0.0" - MULTI_PROJECT: "On" extends: .build_and_test_on_lassen clang_11_gcc_8: From 81996080fd1102537ffe422f238f24659cf9732e Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Thu, 30 Sep 2021 16:45:19 -0700 Subject: [PATCH 129/392] move define before the include --- src/apps/DIFFUSION3DPA-Cuda.cpp | 6 +++--- src/apps/DIFFUSION3DPA-Hip.cpp | 6 +++--- src/apps/DIFFUSION3DPA-OMP.cpp | 5 +++-- src/apps/DIFFUSION3DPA-Seq.cpp | 5 +++-- src/apps/MASS3DPA-Cuda.cpp | 6 +++--- src/apps/MASS3DPA-Hip.cpp | 6 +++--- src/apps/MASS3DPA-OMP.cpp | 5 +++-- src/apps/MASS3DPA-Seq.cpp | 5 +++-- 8 files changed, 24 insertions(+), 20 deletions(-) diff --git a/src/apps/DIFFUSION3DPA-Cuda.cpp b/src/apps/DIFFUSION3DPA-Cuda.cpp index 2c7edbe23..61b0d0798 100644 --- a/src/apps/DIFFUSION3DPA-Cuda.cpp +++ b/src/apps/DIFFUSION3DPA-Cuda.cpp @@ -6,6 +6,9 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Uncomment to add compiler directives for loop unrolling +//#define USE_RAJA_UNROLL + #include "DIFFUSION3DPA.hpp" #include "RAJA/RAJA.hpp" @@ -34,9 +37,6 @@ namespace apps { deallocCudaDeviceData(X); \ deallocCudaDeviceData(Y); -// Uncomment to add compiler directives for loop unrolling -//#define USE_RAJA_UNROLL - __global__ void Diffusion3DPA(Index_type NE, const Real_ptr Basis, const Real_ptr dBasis, const Real_ptr D, const Real_ptr X, Real_ptr Y, bool symmetric) { diff --git a/src/apps/DIFFUSION3DPA-Hip.cpp b/src/apps/DIFFUSION3DPA-Hip.cpp index 945f571d6..81bc7e323 100644 --- a/src/apps/DIFFUSION3DPA-Hip.cpp +++ b/src/apps/DIFFUSION3DPA-Hip.cpp @@ -6,6 +6,9 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Uncomment to add compiler directives for loop unrolling +//#define USE_RAJA_UNROLL + #include "DIFFUSION3DPA.hpp" #include "RAJA/RAJA.hpp" @@ -34,9 +37,6 @@ namespace apps { deallocHipDeviceData(X); \ deallocHipDeviceData(Y); -// Uncomment to add compiler directives for loop unrolling -//#define USE_RAJA_UNROLL - __global__ void Diffusion3DPA(Index_type NE, const Real_ptr Basis, const Real_ptr dBasis, const Real_ptr D, const Real_ptr X, Real_ptr Y, bool symmetric) { diff --git a/src/apps/DIFFUSION3DPA-OMP.cpp b/src/apps/DIFFUSION3DPA-OMP.cpp index 6a5e5b59a..1d1b42cc2 100644 --- a/src/apps/DIFFUSION3DPA-OMP.cpp +++ b/src/apps/DIFFUSION3DPA-OMP.cpp @@ -6,6 +6,9 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Uncomment to add compiler directives for loop unrolling +//#define USE_RAJA_UNROLL + #include "DIFFUSION3DPA.hpp" #include "RAJA/RAJA.hpp" @@ -15,8 +18,6 @@ namespace rajaperf { namespace apps { -// Uncomment to add compiler directives for loop unrolling -//#define USE_RAJA_UNROLL void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { diff --git a/src/apps/DIFFUSION3DPA-Seq.cpp b/src/apps/DIFFUSION3DPA-Seq.cpp index 8c3626ad9..396c6bbc2 100644 --- a/src/apps/DIFFUSION3DPA-Seq.cpp +++ b/src/apps/DIFFUSION3DPA-Seq.cpp @@ -6,6 +6,9 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// 
Uncomment to add compiler directives for loop unrolling +//#define USE_RAJA_UNROLL + #include "DIFFUSION3DPA.hpp" #include "RAJA/RAJA.hpp" @@ -15,8 +18,6 @@ namespace rajaperf { namespace apps { -// Uncomment to add compiler directives for loop unrolling -//#define USE_RAJA_UNROLL void DIFFUSION3DPA::runSeqVariant(VariantID vid) { const Index_type run_reps = getRunReps(); diff --git a/src/apps/MASS3DPA-Cuda.cpp b/src/apps/MASS3DPA-Cuda.cpp index 448cbea18..533ee5dd8 100644 --- a/src/apps/MASS3DPA-Cuda.cpp +++ b/src/apps/MASS3DPA-Cuda.cpp @@ -6,6 +6,9 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Uncomment to add compiler directives loop unrolling +//#define USE_RAJA_UNROLL + #include "MASS3DPA.hpp" #include "RAJA/RAJA.hpp" @@ -34,9 +37,6 @@ namespace apps { deallocCudaDeviceData(X); \ deallocCudaDeviceData(Y); -// Uncomment to add compiler directives loop unrolling -//#define USE_RAJA_UNROLL - __global__ void Mass3DPA(Index_type NE, const Real_ptr B, const Real_ptr Bt, const Real_ptr D, const Real_ptr X, Real_ptr Y) { diff --git a/src/apps/MASS3DPA-Hip.cpp b/src/apps/MASS3DPA-Hip.cpp index 396269433..12caf28c1 100644 --- a/src/apps/MASS3DPA-Hip.cpp +++ b/src/apps/MASS3DPA-Hip.cpp @@ -6,6 +6,9 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Uncomment to add compiler directives loop unrolling +//#define USE_RAJA_UNROLL + #include "MASS3DPA.hpp" #include "RAJA/RAJA.hpp" @@ -34,9 +37,6 @@ namespace apps { deallocHipDeviceData(X); \ deallocHipDeviceData(Y); -// Uncomment to add compiler directives loop unrolling -//#define USE_RAJA_UNROLL - __global__ void Mass3DPA(Index_type NE, const Real_ptr B, const Real_ptr Bt, const Real_ptr D, const Real_ptr X, Real_ptr Y) { diff --git a/src/apps/MASS3DPA-OMP.cpp b/src/apps/MASS3DPA-OMP.cpp index bca35a662..3aa09157c 100644 --- a/src/apps/MASS3DPA-OMP.cpp +++ b/src/apps/MASS3DPA-OMP.cpp @@ -6,6 +6,9 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Uncomment to add compiler directives loop unrolling +//#define USE_RAJA_UNROLL + #include "MASS3DPA.hpp" #include "RAJA/RAJA.hpp" @@ -15,8 +18,6 @@ namespace rajaperf { namespace apps { -// Uncomment to add compiler directives for loop unrolling -//#define USE_RAJA_UNROLL void MASS3DPA::runOpenMPVariant(VariantID vid) { diff --git a/src/apps/MASS3DPA-Seq.cpp b/src/apps/MASS3DPA-Seq.cpp index 70f2fd846..66bd41db3 100644 --- a/src/apps/MASS3DPA-Seq.cpp +++ b/src/apps/MASS3DPA-Seq.cpp @@ -6,6 +6,9 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Uncomment to add compiler directives for loop unrolling +//#define USE_RAJA_UNROLL + #include "MASS3DPA.hpp" #include "RAJA/RAJA.hpp" @@ -15,8 +18,6 @@ namespace rajaperf { namespace apps { -// Uncomment to add compiler directives for loop unrolling -//#define USE_RAJA_UNROLL void MASS3DPA::runSeqVariant(VariantID vid) { const Index_type run_reps = getRunReps(); From 3d4658c622a0be50225c218967dc0e282d87723f Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Fri, 1 Oct 2021 08:20:45 -0700 Subject: [PATCH 130/392] adding comment to submodule update --- scripts/gitlab/build_and_test.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index 511244806..16655ac33 100755 --- 
a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -116,6 +116,9 @@ then if [[ -n ${raja_version} ]] then cd tpl/RAJA + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~~~~ Updating RAJA Submodule to develop ~~~" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" git pull origin develop cd - fi From df8a27483963ca226fa198aaf5135a23598aa15e Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 4 Oct 2021 15:37:49 -0700 Subject: [PATCH 131/392] Fix HEAT_3D index mapping Reorder the hip and cuda RAJA variant index ordering to get coalesced loads and stores --- src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp | 6 +++--- src/polybench/POLYBENCH_HEAT_3D-Hip.cpp | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp b/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp index a757a5e0e..aaad94dcf 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp @@ -159,11 +159,11 @@ void POLYBENCH_HEAT_3D::runCudaVariant(VariantID vid) RAJA::statement::CudaKernelFixedAsync, RAJA::cuda_block_y_direct, - RAJA::statement::Tile<0, RAJA::tile_fixed, + RAJA::statement::Tile<2, RAJA::tile_fixed, RAJA::cuda_block_x_direct, - RAJA::statement::For<2, RAJA::cuda_block_z_direct, // i + RAJA::statement::For<0, RAJA::cuda_block_z_direct, // i RAJA::statement::For<1, RAJA::cuda_thread_y_direct, // j - RAJA::statement::For<0, RAJA::cuda_thread_x_direct, // k + RAJA::statement::For<2, RAJA::cuda_thread_x_direct, // k RAJA::statement::Lambda<0> > > diff --git a/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp b/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp index 545b70368..c76e9cfe1 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp @@ -168,11 +168,11 @@ void POLYBENCH_HEAT_3D::runHipVariant(VariantID vid) RAJA::statement::HipKernelFixedAsync, RAJA::hip_block_y_direct, - RAJA::statement::Tile<0, RAJA::tile_fixed, + RAJA::statement::Tile<2, RAJA::tile_fixed, RAJA::hip_block_x_direct, - RAJA::statement::For<2, RAJA::hip_block_z_direct, // i + RAJA::statement::For<0, RAJA::hip_block_z_direct, // i RAJA::statement::For<1, RAJA::hip_thread_y_direct, // j - RAJA::statement::For<0, RAJA::hip_thread_x_direct, // k + RAJA::statement::For<2, RAJA::hip_thread_x_direct, // k RAJA::statement::Lambda<0> > > From f299c1e92737dffc09d2702e63695fa95118294a Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Wed, 6 Oct 2021 14:08:31 -0700 Subject: [PATCH 132/392] adding a submodule update --- scripts/gitlab/build_and_test.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index 16655ac33..36b080699 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -120,6 +120,10 @@ then echo "~~~~ Updating RAJA Submodule to develop ~~~" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" git pull origin develop + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~~~~ Updating Submodules within RAJA ~~~~~~" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + git submodule init && git submodule update --recursive cd - fi From fb2bcaee573c4ead8db3b89fa35358c029f94acd Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 19 Oct 2021 20:52:53 -0700 Subject: [PATCH 133/392] Add cmake variable for block sizes and update config Note that rajaperf_config is not actually used and broken. That is why some parts are if 0'd out. 
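The RAJA_PERFSUITE_GPU_BLOCKSIZES string is pasted verbatim into the generated rajaperf_config.hpp by configure_file, so the comma-separated values become the parameter pack of a compile-time integer sequence. A minimal sketch of what the configured header would contain, assuming the i_seq helper wraps camp::int_seq over size_t and assuming an example setting of RAJA_PERFSUITE_GPU_BLOCKSIZES=128,256 (illustrative sizes, not project defaults):

    #include "camp/number.hpp"

    // Sketch of the configured header for the example setting "128,256".
    // The helper alias exists to avoid a dangling comma when no sizes are given.
    template < size_t... Is >
    using i_seq = camp::int_seq<size_t, Is...>;
    using gpu_block_sizes = i_seq<128, 256>;   // expands to i_seq<> when the variable is left empty
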
--- CMakeLists.txt | 12 +++++++++++- src/rajaperf_config.hpp.in | 12 ++++++++++-- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4fc8c256a..c2223d108 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -42,10 +42,18 @@ set(ENABLE_TBB Off CACHE BOOL "") set(RAJA_USE_CHRONO On CACHE BOOL "") +set(RAJA_PERFSUITE_GPU_BLOCKSIZES "" CACHE STRING "Comma separated list of GPU block sizes, ex '256,1024'") + set(RAJA_RANGE_ALIGN 4) set(RAJA_RANGE_MIN_LENGTH 32) set(RAJA_DATA_ALIGN 64) +string(LENGTH "${RAJA_PERFSUITE_GPU_BLOCKSIZES}" BLOCKSIZES_LENGTH) +if (BLOCKSIZES_LENGTH GREATER 0) + message(STATUS "Using gpu block size(s): ${RAJA_PERFSUITE_GPU_BLOCKSIZES}") +else() + message(STATUS "Using default gpu block size(s)") +endif() # exclude RAJA make targets from top-level build... add_subdirectory(tpl/RAJA) @@ -109,7 +117,9 @@ endif() configure_file(${CMAKE_SOURCE_DIR}/src/rajaperf_config.hpp.in ${CMAKE_CURRENT_BINARY_DIR}/bin/rajaperf_config.hpp) -# Make sure RAJA flag propagate (we need to do some house cleaning to +include_directories($) + +# Make sure RAJA flag propagate (we need to do some house cleaning to # remove project-specific CMake variables that are no longer needed) set (CUDA_NVCC_FLAGS ${RAJA_NVCC_FLAGS}) diff --git a/src/rajaperf_config.hpp.in b/src/rajaperf_config.hpp.in index c34f9120c..d1516f984 100644 --- a/src/rajaperf_config.hpp.in +++ b/src/rajaperf_config.hpp.in @@ -21,13 +21,14 @@ #define RAJAPerf_config_HPP #include "RAJA/config.hpp" +#include "camp/number.hpp" #include namespace rajaperf { struct configuration { - +#if 0 // Version of RAJA Perf Suite (ex: 0.1.0) static const std::string perfsuite_version = "@RAJA_PERFSUITE_VERSION_MAJOR@" + std::string(".") + @@ -49,7 +50,14 @@ static const std::string compiler = "@RAJAPERF_COMPILER@"; // Command options used to build (ex: -Ofast -mavx) static const std::string compiler_options = "@RAJAPERF_COMPILER_OPTIONS@"; - +#endif + +// helper alias to void trailing comma in no-arg case +template < size_t... Is > +using i_seq = camp::int_seq; +// List of GPU block sizes +using gpu_block_sizes = i_seq<@RAJA_PERFSUITE_GPU_BLOCKSIZES@>; + // Name of user who ran code std::string user_run; From e4f5bda0257eff3349eb08aa35261f324bc2264b Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 19 Oct 2021 20:56:42 -0700 Subject: [PATCH 134/392] Add --gpu_block_size argument, kernels get as part of params My current thinking is that all kernels will have a default block_size that will be used if no gpu_block_size is set. If a block_size is set then any kernels that do not support that block size will not run. Kernels have a method to indicate if they support a certain block_size or not, by default kernels only support the default block size. 
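In outline, the gating this adds reduces to the KernelBase check shown further down in this patch; a condensed sketch (not a complete listing) of that logic:

    // A variant is registered only when it is available in this build and,
    // for CUDA/HIP variants, only when the kernel supports the chosen block size.
    void KernelBase::setVariantDefined(VariantID vid)
    {
      has_variant_defined[vid] = isVariantAvailable(vid) &&
                                 ( isVariantGPU(vid) ? isGPUBlockSizeSupported()
                                                     : true );
    }

So with no --gpu_block_size given, kernels run with their default block size as before; with a size given, a kernel that has not declared support for that size skips its GPU variants instead of running a misconfigured version.
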
--- src/common/Executor.cpp | 72 +++++++++++++++++++----------------- src/common/Executor.hpp | 2 + src/common/KernelBase.cpp | 63 ++++++++++++++++--------------- src/common/KernelBase.hpp | 19 ++++++++++ src/common/RAJAPerfSuite.cpp | 65 +++++++++++++++++++++++++++++--- src/common/RAJAPerfSuite.hpp | 26 +++++++++---- src/common/RunParams.cpp | 27 ++++++++++++++ src/common/RunParams.hpp | 4 ++ 8 files changed, 202 insertions(+), 76 deletions(-) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index cc32c6cf6..c2240967c 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -577,6 +577,11 @@ void Executor::reportRunSummary(ostream& str) const } else if (run_params.getSizeMeaning() == RunParams::SizeMeaning::Direct) { str << "\t Kernel size = " << run_params.getSize() << endl; } + if (run_params.getGPUBlockSize() > 0) { + str << "\t Kernel GPU block_size = " << run_params.getGPUBlockSize() << endl; + } else { + str << "\t Kernel GPU block_size = " << "default" << endl; + } str << "\t Kernel rep factor = " << run_params.getRepFactor() << endl; str << "\t Output files will be named " << ofiles << endl; @@ -613,6 +618,7 @@ void Executor::writeKernelInfoSummary(ostream& str, bool to_file) const Index_type itsrep_width = 0; Index_type bytesrep_width = 0; Index_type flopsrep_width = 0; + Index_type bsize_width = 0; Index_type dash_width = 0; for (size_t ik = 0; ik < kernels.size(); ++ik) { @@ -622,6 +628,7 @@ void Executor::writeKernelInfoSummary(ostream& str, bool to_file) const itsrep_width = max(reps_width, kernels[ik]->getItsPerRep()); bytesrep_width = max(bytesrep_width, kernels[ik]->getBytesPerRep()); flopsrep_width = max(bytesrep_width, kernels[ik]->getFLOPsPerRep()); + bsize_width = max(bsize_width, static_cast(kernels[ik]->getActualGPUBlockSize())); } const string sepchr(" , "); @@ -665,13 +672,20 @@ void Executor::writeKernelInfoSummary(ostream& str, bool to_file) const static_cast(frsize) ) + 3; dash_width += flopsrep_width + static_cast(sepchr.size()); + double bsize = log10( static_cast(bsize_width) ); + string bsize_head("GPU block size"); + bsize_width = max( static_cast(bsize_head.size()), + static_cast(bsize) ) + 3; + dash_width += bsize_width + static_cast(sepchr.size()); + str <getKernelsPerRep() << sepchr <getBytesPerRep() << sepchr <getFLOPsPerRep() + << sepchr <getActualGPUBlockSize() << endl; } @@ -715,21 +730,9 @@ void Executor::runSuite() for (size_t ik = 0; ik < warmup_kernels.size(); ++ik) { KernelBase* warmup_kernel = warmup_kernels[ik]; cout << "Kernel : " << warmup_kernel->getName() << endl; - for (size_t iv = 0; iv < variant_ids.size(); ++iv) { - VariantID vid = variant_ids[iv]; - if ( run_params.showProgress() ) { - if ( warmup_kernel->hasVariantDefined(vid) ) { - cout << " Running "; - } else { - cout << " No "; - } - cout << getVariantName(vid) << " variant" << endl; - } - if ( warmup_kernel->hasVariantDefined(vid) ) { - warmup_kernel->execute(vid); - } - } - delete warmup_kernels[ik]; + runKernel(warmup_kernel); + delete warmup_kernel; + warmup_kernels[ik] = nullptr; } @@ -746,29 +749,32 @@ void Executor::runSuite() if ( run_params.showProgress() ) { std::cout << "\nRun kernel -- " << kernel->getName() << "\n"; } - - for (size_t iv = 0; iv < variant_ids.size(); ++iv) { - VariantID vid = variant_ids[iv]; - KernelBase* kern = kernels[ik]; - if ( run_params.showProgress() ) { - if ( kern->hasVariantDefined(vid) ) { - cout << " Running "; - } else { - cout << " No "; - } - cout << getVariantName(vid) << " variant" << endl; - } - if ( 
kern->hasVariantDefined(vid) ) { - kernels[ik]->execute(vid); - } - } // loop over variants - + runKernel(kernel); } // loop over kernels } // loop over passes through suite } +void Executor::runKernel(KernelBase* kern) +{ + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + VariantID vid = variant_ids[iv]; + + if ( run_params.showProgress() ) { + if ( kern->hasVariantDefined(vid) ) { + cout << " Running "; + } else { + cout << " No "; + } + cout << kern->getVariantName(vid) << " variant" << endl; + } + if ( kern->hasVariantDefined(vid) ) { + kern->execute(vid); + } + } // loop over variants +} + void Executor::outputRunData() { RunParams::InputOpt in_state = run_params.getInputState(); diff --git a/src/common/Executor.hpp b/src/common/Executor.hpp index 32e978f9a..9f588c7a7 100644 --- a/src/common/Executor.hpp +++ b/src/common/Executor.hpp @@ -58,6 +58,8 @@ class Executor std::vector variants; }; + void runKernel(KernelBase* kern); + bool haveReferenceVariant() { return reference_vid < NumVariants; } void writeKernelInfoSummary(std::ostream& str, bool to_file) const; diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index 69d195700..abd040918 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -15,16 +15,18 @@ namespace rajaperf { KernelBase::KernelBase(KernelID kid, const RunParams& params) : - run_params(params) + run_params(params) { kernel_id = kid; name = getFullKernelName(kernel_id); default_prob_size = -1; default_reps = -1; + default_gpu_block_size = 0; actual_prob_size = -1; - + actual_gpu_block_size = 0; + for (size_t fid = 0; fid < NumFeatures; ++fid) { uses_feature[fid] = false; } @@ -51,17 +53,17 @@ KernelBase::KernelBase(KernelID kid, const RunParams& params) : } } - + KernelBase::~KernelBase() { } Index_type KernelBase::getTargetProblemSize() const -{ +{ Index_type target_size = static_cast(0); if (run_params.getSizeMeaning() == RunParams::SizeMeaning::Factor) { - target_size = + target_size = static_cast(default_prob_size*run_params.getSizeFactor()); } else if (run_params.getSizeMeaning() == RunParams::SizeMeaning::Direct) { target_size = static_cast(run_params.getSize()); @@ -70,23 +72,24 @@ Index_type KernelBase::getTargetProblemSize() const } Index_type KernelBase::getRunReps() const -{ +{ Index_type run_reps = static_cast(0); if (run_params.getInputState() == RunParams::CheckRun) { run_reps = static_cast(run_params.getCheckRunReps()); } else { - run_reps = static_cast(default_reps*run_params.getRepFactor()); + run_reps = static_cast(default_reps*run_params.getRepFactor()); } return run_reps; } -void KernelBase::setVariantDefined(VariantID vid) +void KernelBase::setVariantDefined(VariantID vid) { - has_variant_defined[vid] = isVariantAvailable(vid); + has_variant_defined[vid] = isVariantAvailable(vid) && + ( isVariantGPU(vid) ? 
isGPUBlockSizeSupported() + : true ); } - -void KernelBase::execute(VariantID vid) +void KernelBase::execute(VariantID vid) { running_variant = vid; @@ -94,14 +97,14 @@ void KernelBase::execute(VariantID vid) resetDataInitCount(); this->setUp(vid); - - this->runKernel(vid); - this->updateChecksum(vid); + this->runKernel(vid); + + this->updateChecksum(vid); this->tearDown(vid); - running_variant = NumVariants; + running_variant = NumVariants; } void KernelBase::recordExecTime() @@ -178,7 +181,7 @@ void KernelBase::runKernel(VariantID vid) default : { #if 0 - std::cout << "\n " << getName() + std::cout << "\n " << getName() << " : Unknown variant id = " << vid << std::endl; #endif } @@ -195,13 +198,13 @@ void KernelBase::print(std::ostream& os) const os << "\t\t\t actual_prob_size = " << actual_prob_size << std::endl; os << "\t\t\t uses_feature: " << std::endl; for (unsigned j = 0; j < NumFeatures; ++j) { - os << "\t\t\t\t" << getFeatureName(static_cast(j)) - << " : " << uses_feature[j] << std::endl; + os << "\t\t\t\t" << getFeatureName(static_cast(j)) + << " : " << uses_feature[j] << std::endl; } os << "\t\t\t has_variant_defined: " << std::endl; for (unsigned j = 0; j < NumVariants; ++j) { - os << "\t\t\t\t" << getVariantName(static_cast(j)) - << " : " << has_variant_defined[j] << std::endl; + os << "\t\t\t\t" << getVariantName(static_cast(j)) + << " : " << has_variant_defined[j] << std::endl; } os << "\t\t\t its_per_rep = " << its_per_rep << std::endl; os << "\t\t\t kernels_per_rep = " << kernels_per_rep << std::endl; @@ -209,28 +212,28 @@ void KernelBase::print(std::ostream& os) const os << "\t\t\t FLOPs_per_rep = " << FLOPs_per_rep << std::endl; os << "\t\t\t num_exec: " << std::endl; for (unsigned j = 0; j < NumVariants; ++j) { - os << "\t\t\t\t" << getVariantName(static_cast(j)) - << " : " << num_exec[j] << std::endl; + os << "\t\t\t\t" << getVariantName(static_cast(j)) + << " : " << num_exec[j] << std::endl; } os << "\t\t\t min_time: " << std::endl; for (unsigned j = 0; j < NumVariants; ++j) { - os << "\t\t\t\t" << getVariantName(static_cast(j)) - << " : " << min_time[j] << std::endl; + os << "\t\t\t\t" << getVariantName(static_cast(j)) + << " : " << min_time[j] << std::endl; } os << "\t\t\t max_time: " << std::endl; for (unsigned j = 0; j < NumVariants; ++j) { - os << "\t\t\t\t" << getVariantName(static_cast(j)) - << " : " << max_time[j] << std::endl; + os << "\t\t\t\t" << getVariantName(static_cast(j)) + << " : " << max_time[j] << std::endl; } os << "\t\t\t tot_time: " << std::endl; for (unsigned j = 0; j < NumVariants; ++j) { - os << "\t\t\t\t" << getVariantName(static_cast(j)) - << " : " << tot_time[j] << std::endl; + os << "\t\t\t\t" << getVariantName(static_cast(j)) + << " : " << tot_time[j] << std::endl; } os << "\t\t\t checksum: " << std::endl; for (unsigned j = 0; j < NumVariants; ++j) { - os << "\t\t\t\t" << getVariantName(static_cast(j)) - << " : " << checksum[j] << std::endl; + os << "\t\t\t\t" << getVariantName(static_cast(j)) + << " : " << checksum[j] << std::endl; } os << std::endl; } diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 8c2dfb799..bc9b0dbab 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -52,6 +52,8 @@ class KernelBase void setDefaultProblemSize(Index_type size) { default_prob_size = size; } void setActualProblemSize(Index_type size) { actual_prob_size = size; } + void setDefaultGPUBlockSize(size_t size) { default_gpu_block_size = size; } + void setActualGPUBlockSize(size_t size) { actual_gpu_block_size = 
size; } void setDefaultReps(Index_type reps) { default_reps = reps; } void setItsPerRep(Index_type its) { its_per_rep = its; }; void setKernelsPerRep(Index_type nkerns) { kernels_per_rep = nkerns; }; @@ -68,6 +70,8 @@ class KernelBase Index_type getDefaultProblemSize() const { return default_prob_size; } Index_type getActualProblemSize() const { return actual_prob_size; } + size_t getDefaultGPUBlockSize() const { return default_gpu_block_size; } + size_t getActualGPUBlockSize() const { return actual_gpu_block_size; } Index_type getDefaultReps() const { return default_reps; } Index_type getItsPerRep() const { return its_per_rep; }; Index_type getKernelsPerRep() const { return kernels_per_rep; }; @@ -82,6 +86,19 @@ class KernelBase bool hasVariantDefined(VariantID vid) const { return has_variant_defined[vid]; } + std::string getVariantName(VariantID vid) const + { + if (isVariantGPU(vid) && actual_gpu_block_size > 0) { + return rajaperf::getVariantName(vid) + std::string("_") + std::to_string(actual_gpu_block_size); + } else { + return rajaperf::getVariantName(vid); + } + } + + virtual bool isGPUBlockSizeSupported() const + { + return default_gpu_block_size == actual_gpu_block_size; + } // // Methods to get information about kernel execution for reports @@ -175,8 +192,10 @@ class KernelBase Index_type default_prob_size; Index_type default_reps; + size_t default_gpu_block_size; Index_type actual_prob_size; + size_t actual_gpu_block_size; bool uses_feature[NumFeatures]; diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 748fb1325..f80a497ed 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -348,7 +348,7 @@ const std::string& getVariantName(VariantID vid) /*! ******************************************************************************* * - * Return true if variant associated with VariantID enum value is available + * Return true if variant associated with VariantID enum value is available * to run; else false. * ******************************************************************************* @@ -361,22 +361,22 @@ bool isVariantAvailable(VariantID vid) ret_val = true; } #if defined(RUN_RAJA_SEQ) - if ( vid == Lambda_Seq || + if ( vid == Lambda_Seq || vid == RAJA_Seq ) { ret_val = true; } #endif #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - if ( vid == Base_OpenMP || - vid == Lambda_OpenMP || + if ( vid == Base_OpenMP || + vid == Lambda_OpenMP || vid == RAJA_OpenMP ) { ret_val = true; } #endif #if defined(RAJA_ENABLE_TARGET_OPENMP) - if ( vid == Base_OpenMPTarget || + if ( vid == Base_OpenMPTarget || vid == RAJA_OpenMPTarget ) { ret_val = true; } @@ -401,6 +401,61 @@ bool isVariantAvailable(VariantID vid) return ret_val; } +/*! + ******************************************************************************* + * + * Return true if variant associated with VariantID enum value runs on the GPU. 
+ * + ******************************************************************************* + */ +bool isVariantGPU(VariantID vid) +{ + bool ret_val = false; + + if ( vid == Base_Seq ) { + ret_val = false; + } +#if defined(RUN_RAJA_SEQ) + if ( vid == Lambda_Seq || + vid == RAJA_Seq ) { + ret_val = false; + } +#endif + +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + if ( vid == Base_OpenMP || + vid == Lambda_OpenMP || + vid == RAJA_OpenMP ) { + ret_val = false; + } +#endif + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + if ( vid == Base_OpenMPTarget || + vid == RAJA_OpenMPTarget ) { + ret_val = false; + } +#endif + +#if defined(RAJA_ENABLE_CUDA) + if ( vid == Base_CUDA || + vid == Lambda_CUDA || + vid == RAJA_CUDA ) { + ret_val = true; + } +#endif + +#if defined(RAJA_ENABLE_HIP) + if ( vid == Base_HIP || + vid == Lambda_HIP || + vid == RAJA_HIP ) { + ret_val = true; + } +#endif + + return ret_val; +} + /* ******************************************************************************* * diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index ca4f10f1d..c6d4484cf 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -31,8 +31,8 @@ class RunParams; * * IMPORTANT: This is only modified when a group is added or removed. * - * ENUM VALUES MUST BE KEPT CONSISTENT (CORRESPONDING ONE-TO-ONE) - * WITH ARRAY OF GROUP NAMES IN IMPLEMENTATION FILE!!! + * ENUM VALUES MUST BE KEPT CONSISTENT (CORRESPONDING ONE-TO-ONE) + * WITH ARRAY OF GROUP NAMES IN IMPLEMENTATION FILE!!! * ******************************************************************************* */ @@ -58,8 +58,8 @@ enum GroupID { * * IMPORTANT: This is only modified when a kernel is added or removed. * - * ENUM VALUES MUST BE KEPT CONSISTENT (CORRESPONDING ONE-TO-ONE) - * WITH ARRAY OF KERNEL NAMES IN IMPLEMENTATION FILE!!! + * ENUM VALUES MUST BE KEPT CONSISTENT (CORRESPONDING ONE-TO-ONE) + * WITH ARRAY OF KERNEL NAMES IN IMPLEMENTATION FILE!!! * ******************************************************************************* */ @@ -157,7 +157,7 @@ enum KernelID { * IMPORTANT: This is only modified when a new variant is added to the suite. * * IT MUST BE KEPT CONSISTENT (CORRESPONDING ONE-TO-ONE) WITH - * ARRAY OF VARIANT NAMES IN IMPLEMENTATION FILE!!! + * ARRAY OF VARIANT NAMES IN IMPLEMENTATION FILE!!! * ******************************************************************************* */ @@ -207,7 +207,7 @@ enum FeatureID { Sort, Scan, - Workgroup, + Workgroup, Reduction, Atomic, @@ -257,18 +257,28 @@ const std::string& getFullKernelName(KernelID kid); * ******************************************************************************* */ -const std::string& getVariantName(VariantID vid); +const std::string& getVariantName(VariantID vid); /*! ******************************************************************************* * - * \brief Return true if variant associated with VariantID enum value is + * \brief Return true if variant associated with VariantID enum value is * available * to run; else false. * ******************************************************************************* */ bool isVariantAvailable(VariantID vid); +/*! + ******************************************************************************* + * + * \brief Return true if variant associated with VariantID enum value runs + * on the gpu. + * + ******************************************************************************* + */ +bool isVariantGPU(VariantID vid); + /*! 
******************************************************************************* * diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index e038863c1..ce08514be 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -33,6 +33,7 @@ RunParams::RunParams(int argc, char** argv) size_meaning(SizeMeaning::Unset), size(0.0), size_factor(0.0), + gpu_block_size(0), pf_tol(0.1), checkrun_reps(1), reference_variant(), @@ -82,6 +83,7 @@ void RunParams::print(std::ostream& str) const str << "\n size_meaning = " << SizeMeaningToStr(getSizeMeaning()); str << "\n size = " << size; str << "\n size_factor = " << size_factor; + str << "\n gpu_block_size = " << gpu_block_size; str << "\n pf_tol = " << pf_tol; str << "\n checkrun_reps = " << checkrun_reps; str << "\n reference_variant = " << reference_variant; @@ -281,6 +283,24 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) input_state = BadInput; } + } else if ( opt == std::string("--gpu_block_size") ) { + + i++; + if ( i < argc ) { + gpu_block_size = ::atoi( argv[i] ); + if ( gpu_block_size <= 0 ) { + std::cout << "\nBad input:" + << " must give --gpu_block_size a POSITIVE value (int)" + << std::endl; + input_state = BadInput; + } + } else { + std::cout << "\nBad input:" + << " must give --gpu_block_size a value (int)" + << std::endl; + input_state = BadInput; + } + } else if ( opt == std::string("--pass-fail-tol") || opt == std::string("-pftol") ) { @@ -513,6 +533,13 @@ void RunParams::printHelpMessage(std::ostream& str) const str << "\t\t Example...\n" << "\t\t --size 1000000 (runs kernels with size ~1,000,000)\n\n"; + str << "\t --gpu_block_size [no default]\n" + << "\t (block size to run for all GPU kernels)\n" + << "\t (GPU kernels not supporting gpu_block_size will be skipped)\n" + << "\t (Support is determined by kernel implementation and cmake variable RAJA_PERFSUITE_GPU_BLOCKSIZES)\n"; + str << "\t\t Example...\n" + << "\t\t --gpu_block_size 256 (runs kernels with gpu_block_size 256)\n\n"; + str << "\t --pass-fail-tol, -pftol [default is 0.1; i.e., 10%]\n" << "\t (slowdown tolerance for RAJA vs. Base variants in FOM report)\n"; str << "\t\t Example...\n" diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp index c25e58342..aa4633b94 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -97,6 +97,8 @@ class RunParams { double getSizeFactor() const { return size_factor; } + size_t getGPUBlockSize() const { return gpu_block_size; } + double getPFTolerance() const { return pf_tol; } int getCheckRunReps() const { return checkrun_reps; } @@ -184,6 +186,8 @@ class RunParams { double size; /*!< kernel size to run (input option) */ double size_factor; /*!< default kernel size multipier (input option) */ + size_t gpu_block_size; /*!< GPU block size to run (input option) */ + double pf_tol; /*!< pct RAJA variant run time can exceed base for each PM case to pass/fail acceptance */ From 0bf354b2f5dc0e124a323fd030bfd7b389d54909 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 21 Oct 2021 14:10:17 -0700 Subject: [PATCH 135/392] Add utils to support block sizes in gpu variants These classes help instantiate the implementation once for each block size in the list. Also add versions of lambda cuda/hip launch functions supporting compile time block sizes. 
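As an aside before the diff: the core dispatch pattern these utilities implement is a fold over a compile-time list of block sizes, instantiating a templated call operator once per size and invoking only the one that matches the run-time choice. The sketch below is a simplified stand-in, using plain size_t parameter packs and a made-up RunIfMatches functor instead of camp::int_seq and the RunCudaBlockSize/RunHipBlockSize helpers added below.

// Simplified sketch of the "one instantiation per block size" dispatch.
#include <cstddef>
#include <iostream>

// Invoke f.template operator()<I>() for each I in the pack, stopping at the
// first call that returns true; return false if none matched.
template < typename F >
bool invoke_or(F) { return false; }

template < typename F, std::size_t I, std::size_t... Is >
bool invoke_or(F f)
{
  return f.template operator()<I>() || invoke_or<F, Is...>(f);
}

// "Runs" (here: prints) only when the compile-time size equals the
// run-time selection.
struct RunIfMatches
{
  std::size_t requested;

  template < std::size_t block_size >
  bool operator()()
  {
    if (block_size == requested) {
      std::cout << "launching instantiation with block_size = "
                << block_size << "\n";
      return true;
    }
    return false;
  }
};

int main()
{
  // One instantiation per listed size; only the 256 case actually runs.
  if ( !invoke_or<RunIfMatches, 128, 256, 512>(RunIfMatches{256}) ) {
    std::cout << "unsupported block size\n";
  }
  return 0;
}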
--- src/common/CudaDataUtils.hpp | 20 ++++- src/common/GPUUtils.hpp | 147 +++++++++++++++++++++++++++++++++++ src/common/HipDataUtils.hpp | 36 +++++++-- src/common/KernelBase.hpp | 1 + 4 files changed, 196 insertions(+), 8 deletions(-) create mode 100644 src/common/GPUUtils.hpp diff --git a/src/common/CudaDataUtils.hpp b/src/common/CudaDataUtils.hpp index 0467d0f19..6edce4298 100644 --- a/src/common/CudaDataUtils.hpp +++ b/src/common/CudaDataUtils.hpp @@ -18,6 +18,7 @@ #if defined(RAJA_ENABLE_CUDA) +#include "common/GPUUtils.hpp" #include "RAJA/policy/cuda/raja_cudaerrchk.hpp" @@ -36,6 +37,16 @@ __global__ void lambda_cuda_forall(Index_type ibegin, Index_type iend, Lambda bo body(i); } } +/// +template < size_t block_size, typename Lambda > +__launch_bounds__(block_size) +__global__ void lambda_cuda_forall(Index_type ibegin, Index_type iend, Lambda body) +{ + Index_type i = ibegin + blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + body(i); + } +} /*! * \brief Simple cuda kernel that runs a lambda. @@ -43,7 +54,14 @@ __global__ void lambda_cuda_forall(Index_type ibegin, Index_type iend, Lambda bo template < typename Lambda > __global__ void lambda_cuda(Lambda body) { - body(); + body(); +} +/// +template < size_t block_size, typename Lambda > +__launch_bounds__(block_size) +__global__ void lambda_cuda(Lambda body) +{ + body(); } /*! diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp new file mode 100644 index 000000000..69d933e5f --- /dev/null +++ b/src/common/GPUUtils.hpp @@ -0,0 +1,147 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Methods and classes for GPU kernel templates. +/// + + +#ifndef RAJAPerf_GPUUtils_HPP +#define RAJAPerf_GPUUtils_HPP + +#include "rajaperf_config.hpp" + + +namespace rajaperf +{ + +namespace gpu_block_size +{ + +namespace detail +{ + +// helpers to invoke f with each integer in the param pack +template < typename F > +bool invoke_or_helper(F) +{ + return false; +} +/// +template < typename F, size_t I, size_t... Is> +bool invoke_or_helper(F f) +{ + return f.template operator()() || invoke_or_helper(f); +} + +// class to get the size of a camp::int_seq +template < typename IntSeq > +struct SizeOfIntSeq; +/// +template < size_t... Is > +struct SizeOfIntSeq> +{ + static const size_t size = sizeof...(Is); +}; + +} // namespace detail + +// call f's call operator with each integer as the template param in turn +// stopping at the first integer that returns true. +// return true if any f() returns true, otherwise return false +template < typename F, size_t... 
Is > +bool invoke_or(F f, camp::int_seq) +{ + return detail::invoke_or_helper(f); +} + +// if the given integer is the same as the template param block_size +// returns true otherwise returns false +struct Equals +{ + Equals(size_t actual_gpu_block_size) + : m_actual_gpu_block_size(actual_gpu_block_size) + {} + + template < size_t block_size > + bool operator()() { return m_actual_gpu_block_size == block_size; } + +private: + size_t m_actual_gpu_block_size; +}; + +// if the kernel's actual block size is the same as the template param +// runs the cuda variant with the template param block_size and returns true +// otherwise returns false +template < typename Kernel > +struct RunCudaBlockSize +{ + RunCudaBlockSize(Kernel& kernel, VariantID vid) + : m_kernel(kernel), m_vid(vid) + {} + + template < size_t block_size > + bool operator()() { + if (block_size == m_kernel.getActualGPUBlockSize()) { + m_kernel.template runCudaVariantImpl(m_vid); + return true; + } + return false; + } + +private: + Kernel& m_kernel; + VariantID m_vid; +}; + +// if the kernel's actual block size is the same as the template param +// runs the hip variant with the template param block_size and returns true +// otherwise returns false +template < typename Kernel > +struct RunHipBlockSize +{ + RunHipBlockSize(Kernel& kernel, VariantID vid) + : m_kernel(kernel), m_vid(vid) + {} + + template < size_t block_size > + bool operator()() { + if (block_size == m_kernel.getActualGPUBlockSize()) { + m_kernel.template runHipVariantImpl(m_vid); + return true; + } + return false; + } + +private: + Kernel& m_kernel; + VariantID m_vid; +}; + +// return the first integer in the int_seq +template < size_t I, size_t... Is > +size_t get_first(camp::int_seq) +{ + return I; +} + +// A camp::int_seq of size_t's that is rajaperf::configuration::gpu_block_sizes +// if rajaperf::configuration::gpu_block_sizes is not empty +// and a camp::int_seq of default_block_size otherwise +template < size_t default_block_size > +using list_type = + typename std::conditional< (detail::SizeOfIntSeq::size > 0), + rajaperf::configuration::gpu_block_sizes, + camp::int_seq + >::type; + +} // closing brace for gpu_block_size namespace + +} // closing brace for rajaperf namespace + +#endif // closing endif for header file include guard diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index 550563d2f..9b5f6e2e3 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -18,6 +18,7 @@ #if defined(RAJA_ENABLE_HIP) +#include "common/GPUUtils.hpp" #include "RAJA/policy/hip/raja_hiperrchk.hpp" @@ -31,16 +32,37 @@ namespace rajaperf template < typename Lambda > __global__ void lambda_hip_forall(Index_type ibegin, Index_type iend, Lambda body) { - Index_type i = ibegin + blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - body(i); - } + Index_type i = ibegin + blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + body(i); + } +} +/// +template < size_t block_size, typename Lambda > +__launch_bounds__(block_size) +__global__ void lambda_hip_forall(Index_type ibegin, Index_type iend, Lambda body) +{ + Index_type i = ibegin + blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + body(i); + } } /*! -* \brief Simple hip kernel that runs a lambda. -*/ -template __global__ void lambda_hip(Lambda body) { body(); } + * \brief Simple hip kernel that runs a lambda. 
+ */ +template < typename Lambda > +__global__ void lambda_hip(Lambda body) +{ + body(); +} +/// +template < size_t block_size, typename Lambda > +__launch_bounds__(block_size) +__global__ void lambda_hip(Lambda body) +{ + body(); +} /*! * \brief Getters for hip kernel indices. diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index bc9b0dbab..af7fa9721 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -13,6 +13,7 @@ #include "common/RPTypes.hpp" #include "common/DataUtils.hpp" #include "common/RunParams.hpp" +#include "common/GPUUtils.hpp" #include "RAJA/util/Timer.hpp" #if defined(RAJA_ENABLE_CUDA) From 99acce315256d5bffe84bc1450be7cc2ddb73c6c Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 22 Oct 2021 08:57:06 -0700 Subject: [PATCH 136/392] Add HALOEXCHANGE_FUSED to warmup kernels This captures the overhead of initializing the RAJA pinned memory pool. --- src/common/Executor.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index c2240967c..f5a2a7653 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -16,6 +16,7 @@ #include "basic/DAXPY.hpp" #include "basic/REDUCE3_INT.hpp" #include "algorithm/SORT.hpp" +#include "apps/HALOEXCHANGE_FUSED.hpp" #include #include @@ -726,6 +727,7 @@ void Executor::runSuite() warmup_kernels.push_back(new basic::DAXPY(run_params)); warmup_kernels.push_back(new basic::REDUCE3_INT(run_params)); warmup_kernels.push_back(new algorithm::SORT(run_params)); + warmup_kernels.push_back(new apps::HALOEXCHANGE_FUSED(run_params)); for (size_t ik = 0; ik < warmup_kernels.size(); ++ik) { KernelBase* warmup_kernel = warmup_kernels[ik]; From 2c85699d14efb55c5379b9dcf9dd81dba7f6e657 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sat, 23 Oct 2021 14:21:39 -0700 Subject: [PATCH 137/392] Fix GEN_LIN_RECUR gpu kernel2 It was inadvertently using kernel 1. --- src/lcals/GEN_LIN_RECUR-Cuda.cpp | 2 +- src/lcals/GEN_LIN_RECUR-Hip.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lcals/GEN_LIN_RECUR-Cuda.cpp b/src/lcals/GEN_LIN_RECUR-Cuda.cpp index 53793af75..3a8daa793 100644 --- a/src/lcals/GEN_LIN_RECUR-Cuda.cpp +++ b/src/lcals/GEN_LIN_RECUR-Cuda.cpp @@ -83,7 +83,7 @@ void GEN_LIN_RECUR::runCudaVariant(VariantID vid) cudaErrchk( cudaGetLastError() ); const size_t grid_size2 = RAJA_DIVIDE_CEILING_INT(N+1, block_size); - genlinrecur1<<>>( b5, stb5, sa, sb, + genlinrecur2<<>>( b5, stb5, sa, sb, kb5i, N ); cudaErrchk( cudaGetLastError() ); diff --git a/src/lcals/GEN_LIN_RECUR-Hip.cpp b/src/lcals/GEN_LIN_RECUR-Hip.cpp index 7d96b27f4..c11326881 100644 --- a/src/lcals/GEN_LIN_RECUR-Hip.cpp +++ b/src/lcals/GEN_LIN_RECUR-Hip.cpp @@ -84,7 +84,7 @@ void GEN_LIN_RECUR::runHipVariant(VariantID vid) hipErrchk( hipGetLastError() ); const size_t grid_size2 = RAJA_DIVIDE_CEILING_INT(N+1, block_size); - hipLaunchKernelGGL(genlinrecur1, grid_size2, block_size, 0, 0, + hipLaunchKernelGGL(genlinrecur2, grid_size2, block_size, 0, 0, b5, stb5, sa, sb, kb5i, N ); From 2b4d431457c32764624ee27c12511fa42e8bcc15 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 21 Oct 2021 15:11:04 -0700 Subject: [PATCH 138/392] Support block size in DOT and HEAT_3D Support multiple block sizes by recursive template instantiation using gpu utils.
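For context on the diffs below: each kernel declares a gpu_block_sizes_type list whose elements come from the CMake-configured RAJA_PERFSUITE_GPU_BLOCKSIZES, falling back to the kernel's own default when that list is empty. The following standalone sketch reproduces that fallback with std::index_sequence standing in for camp::int_seq; configured_sizes is a placeholder (assumed empty here) for rajaperf::configuration::gpu_block_sizes, not the suite's actual type.

// Rough sketch of the list_type fallback, under the assumptions above.
#include <cstddef>
#include <type_traits>
#include <utility>

// Placeholder for the configure-time list; empty means nothing was passed
// to RAJA_PERFSUITE_GPU_BLOCKSIZES.
using configured_sizes = std::index_sequence<>;

template < typename Seq >
struct SizeOfSeq;

template < std::size_t... Is >
struct SizeOfSeq< std::index_sequence<Is...> >
{
  static const std::size_t size = sizeof...(Is);
};

// Use the configured list when non-empty, otherwise a one-element list
// holding the kernel's default block size.
template < std::size_t default_block_size >
using list_type = typename std::conditional<
    (SizeOfSeq<configured_sizes>::size > 0),
    configured_sizes,
    std::index_sequence<default_block_size> >::type;

// A kernel declaring `using gpu_block_sizes_type = list_type<256>;` thus
// sees only its default size when no sizes were configured.
static_assert(std::is_same< list_type<256>,
                            std::index_sequence<256> >::value,
              "fallback picks the default block size");

int main() { return 0; }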
--- src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp | 22 ++++++++++++++++------ src/polybench/POLYBENCH_HEAT_3D-Hip.cpp | 22 ++++++++++++++++------ src/polybench/POLYBENCH_HEAT_3D.cpp | 12 +++++++++++- src/polybench/POLYBENCH_HEAT_3D.hpp | 9 +++++++++ src/stream/DOT-Cuda.cpp | 18 +++++++++++------- src/stream/DOT-Hip.cpp | 18 +++++++++++------- src/stream/DOT.cpp | 12 +++++++++++- src/stream/DOT.hpp | 9 +++++++++ 8 files changed, 94 insertions(+), 28 deletions(-) diff --git a/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp b/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp index aaad94dcf..1e30f098e 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp @@ -22,11 +22,11 @@ namespace polybench { // - // Define thread block size for CUDA execution + // Define thread block shape for CUDA execution // - constexpr size_t i_block_sz = 1; - constexpr size_t j_block_sz = 8; - constexpr size_t k_block_sz = 32; +#define k_block_sz (32) +#define j_block_sz (block_size / k_block_sz) +#define i_block_sz (1) #define HEAT_3D_THREADS_PER_BLOCK_CUDA \ dim3 nthreads_per_block(k_block_sz, j_block_sz, i_block_sz); @@ -39,7 +39,8 @@ namespace polybench #define POLYBENCH_HEAT_3D_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(A, m_Ainit, m_N*m_N*m_N); \ - allocAndInitCudaDeviceData(B, m_Binit, m_N*m_N*m_N); + allocAndInitCudaDeviceData(B, m_Binit, m_N*m_N*m_N); \ + static_assert(k_block_sz*j_block_sz*i_block_sz == block_size, "Invalid block_size"); #define POLYBENCH_HEAT_3D_TEARDOWN_CUDA \ @@ -84,7 +85,8 @@ __global__ void poly_heat_3D_lam(Index_type N, Lambda body) } -void POLYBENCH_HEAT_3D::runCudaVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_HEAT_3D::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -205,7 +207,15 @@ void POLYBENCH_HEAT_3D::runCudaVariant(VariantID vid) } else { std::cout << "\n POLYBENCH_HEAT_3D : Unknown Cuda variant id = " << vid << std::endl; } +} +void POLYBENCH_HEAT_3D::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n POLYBENCH_HEAT_3D : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp b/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp index c76e9cfe1..2bee7df49 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp @@ -22,11 +22,11 @@ namespace polybench { // - // Define thread block size for Hip execution + // Define thread block shape for Hip execution // - constexpr size_t i_block_sz = 1; - constexpr size_t j_block_sz = 8; - constexpr size_t k_block_sz = 32; +#define k_block_sz (32) +#define j_block_sz (block_size / k_block_sz) +#define i_block_sz (1) #define HEAT_3D_THREADS_PER_BLOCK_HIP \ dim3 nthreads_per_block(k_block_sz, j_block_sz, i_block_sz); @@ -39,7 +39,8 @@ namespace polybench #define POLYBENCH_HEAT_3D_DATA_SETUP_HIP \ allocAndInitHipDeviceData(A, m_Ainit, m_N*m_N*m_N); \ - allocAndInitHipDeviceData(B, m_Binit, m_N*m_N*m_N); + allocAndInitHipDeviceData(B, m_Binit, m_N*m_N*m_N); \ + static_assert(k_block_sz*j_block_sz*i_block_sz == block_size, "Invalid block_size"); #define POLYBENCH_HEAT_3D_TEARDOWN_HIP \ @@ -84,7 +85,8 @@ __global__ void poly_heat_3D_lam(Index_type N, Lambda body) } -void POLYBENCH_HEAT_3D::runHipVariant(VariantID vid) +template < size_t block_size > +void 
POLYBENCH_HEAT_3D::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -213,7 +215,15 @@ void POLYBENCH_HEAT_3D::runHipVariant(VariantID vid) } else { std::cout << "\n POLYBENCH_HEAT_3D : Unknown Hip variant id = " << vid << std::endl; } +} +void POLYBENCH_HEAT_3D::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n POLYBENCH_HEAT_3D : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_HEAT_3D.cpp b/src/polybench/POLYBENCH_HEAT_3D.cpp index 85fd0ce38..126ec4ef7 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D.cpp @@ -22,6 +22,10 @@ namespace polybench POLYBENCH_HEAT_3D::POLYBENCH_HEAT_3D(const RunParams& params) : KernelBase(rajaperf::Polybench_HEAT_3D, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + Index_type N_default = 100; setDefaultProblemSize( (N_default-2)*(N_default-2)*(N_default-2) ); @@ -36,7 +40,7 @@ POLYBENCH_HEAT_3D::POLYBENCH_HEAT_3D(const RunParams& params) setItsPerRep( m_tsteps * ( 2 * getActualProblemSize() ) ); setKernelsPerRep( m_tsteps * 2 ); setBytesPerRep( m_tsteps * ( (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * - (m_N-2) * (m_N-2) * (m_N-2) + + (m_N-2) * (m_N-2) * (m_N-2) + (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * (m_N * m_N * m_N - 12*(m_N-2) - 8) + (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * @@ -100,5 +104,11 @@ void POLYBENCH_HEAT_3D::tearDown(VariantID vid) deallocData(m_Binit); } +bool POLYBENCH_HEAT_3D::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_HEAT_3D.hpp b/src/polybench/POLYBENCH_HEAT_3D.hpp index b21b56576..bb3189f7d 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.hpp +++ b/src/polybench/POLYBENCH_HEAT_3D.hpp @@ -125,7 +125,16 @@ class POLYBENCH_HEAT_3D : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Index_type m_N; Index_type m_tsteps; diff --git a/src/stream/DOT-Cuda.cpp b/src/stream/DOT-Cuda.cpp index ebeb2ca3a..5d0b071bc 100644 --- a/src/stream/DOT-Cuda.cpp +++ b/src/stream/DOT-Cuda.cpp @@ -22,12 +22,6 @@ namespace rajaperf namespace stream { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define DOT_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(a, m_a, iend); \ allocAndInitCudaDeviceData(b, m_b, iend); @@ -70,7 +64,8 @@ __global__ void dot(Real_ptr a, Real_ptr b, } -void DOT::runCudaVariant(VariantID vid) +template < size_t block_size > +void DOT::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -135,6 +130,15 @@ void DOT::runCudaVariant(VariantID vid) } } +void DOT::runCudaVariant(VariantID vid) +{ + if ( 
!gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n DOT : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/DOT-Hip.cpp b/src/stream/DOT-Hip.cpp index 47d4ad9b5..ec2827a05 100644 --- a/src/stream/DOT-Hip.cpp +++ b/src/stream/DOT-Hip.cpp @@ -22,12 +22,6 @@ namespace rajaperf namespace stream { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define DOT_DATA_SETUP_HIP \ allocAndInitHipDeviceData(a, m_a, iend); \ allocAndInitHipDeviceData(b, m_b, iend); @@ -71,7 +65,8 @@ __global__ void dot(Real_ptr a, Real_ptr b, } -void DOT::runHipVariant(VariantID vid) +template < size_t block_size > +void DOT::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -135,6 +130,15 @@ void DOT::runHipVariant(VariantID vid) } } +void DOT::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n DOT : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/DOT.cpp b/src/stream/DOT.cpp index cca4aae4a..874e16477 100644 --- a/src/stream/DOT.cpp +++ b/src/stream/DOT.cpp @@ -21,6 +21,10 @@ namespace stream DOT::DOT(const RunParams& params) : KernelBase(rajaperf::Stream_DOT, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + setDefaultProblemSize(1000000); setDefaultReps(2000); @@ -29,7 +33,7 @@ DOT::DOT(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) + - (0*sizeof(Real_type) + 2*sizeof(Real_type)) * + (0*sizeof(Real_type) + 2*sizeof(Real_type)) * getActualProblemSize() ); setFLOPsPerRep(2 * getActualProblemSize()); @@ -79,5 +83,11 @@ void DOT::tearDown(VariantID vid) deallocData(m_b); } +bool DOT::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/DOT.hpp b/src/stream/DOT.hpp index adb9309c4..98a1f3c37 100644 --- a/src/stream/DOT.hpp +++ b/src/stream/DOT.hpp @@ -52,7 +52,16 @@ class DOT : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Real_ptr m_a; Real_ptr m_b; Real_type m_dot; From 8f44c4df73bdf6ebbbb0bfd828ae999e62c3941c Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 21 Oct 2021 14:22:31 -0700 Subject: [PATCH 139/392] Template DOT and HEAT_3D kernels on block_size This allows us to insert constants into the code and use launch bounds --- src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp | 36 ++++++++++++++++-------- src/polybench/POLYBENCH_HEAT_3D-Hip.cpp | 36 +++++++++++++++--------- 
src/stream/DOT-Cuda.cpp | 14 ++++----- src/stream/DOT-Hip.cpp | 14 +++++---- 4 files changed, 62 insertions(+), 38 deletions(-) diff --git a/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp b/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp index 1e30f098e..a5954a04e 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp @@ -28,8 +28,11 @@ namespace polybench #define j_block_sz (block_size / k_block_sz) #define i_block_sz (1) +#define HEAT_3D_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \ + k_block_sz, j_block_sz, i_block_sz + #define HEAT_3D_THREADS_PER_BLOCK_CUDA \ - dim3 nthreads_per_block(k_block_sz, j_block_sz, i_block_sz); + dim3 nthreads_per_block(HEAT_3D_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA); #define HEAT_3D_NBLOCKS_CUDA \ dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(N-2, k_block_sz)), \ @@ -50,34 +53,39 @@ namespace polybench deallocCudaDeviceData(B); +template < size_t k_block_size, size_t j_block_size, size_t i_block_size > +__launch_bounds__(k_block_size*j_block_size*i_block_size) __global__ void poly_heat_3D_1(Real_ptr A, Real_ptr B, Index_type N) { Index_type i = 1 + blockIdx.z; - Index_type j = 1 + blockIdx.y * blockDim.y + threadIdx.y; - Index_type k = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type j = 1 + blockIdx.y * j_block_size + threadIdx.y; + Index_type k = 1 + blockIdx.x * k_block_size + threadIdx.x; if (i < N-1 && j < N-1 && k < N-1) { POLYBENCH_HEAT_3D_BODY1; } } +template < size_t k_block_size, size_t j_block_size, size_t i_block_size > +__launch_bounds__(k_block_size*j_block_size*i_block_size) __global__ void poly_heat_3D_2(Real_ptr A, Real_ptr B, Index_type N) { Index_type i = 1 + blockIdx.z; - Index_type j = 1 + blockIdx.y * blockDim.y + threadIdx.y; - Index_type k = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type j = 1 + blockIdx.y * j_block_size + threadIdx.y; + Index_type k = 1 + blockIdx.x * k_block_size + threadIdx.x; if (i < N-1 && j < N-1 && k < N-1) { POLYBENCH_HEAT_3D_BODY2; } } -template< typename Lambda > +template< size_t k_block_size, size_t j_block_size, size_t i_block_size, typename Lambda > +__launch_bounds__(k_block_size*j_block_size*i_block_size) __global__ void poly_heat_3D_lam(Index_type N, Lambda body) { Index_type i = 1 + blockIdx.z; - Index_type j = 1 + blockIdx.y * blockDim.y + threadIdx.y; - Index_type k = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type j = 1 + blockIdx.y * j_block_size + threadIdx.y; + Index_type k = 1 + blockIdx.x * k_block_size + threadIdx.x; if (i < N-1 && j < N-1 && k < N-1) { body(i, j, k); @@ -104,10 +112,12 @@ void POLYBENCH_HEAT_3D::runCudaVariantImpl(VariantID vid) HEAT_3D_THREADS_PER_BLOCK_CUDA; HEAT_3D_NBLOCKS_CUDA; - poly_heat_3D_1<<>>(A, B, N); + poly_heat_3D_1 + <<>>(A, B, N); cudaErrchk( cudaGetLastError() ); - poly_heat_3D_2<<>>(A, B, N); + poly_heat_3D_2 + <<>>(A, B, N); cudaErrchk( cudaGetLastError() ); } @@ -129,14 +139,16 @@ void POLYBENCH_HEAT_3D::runCudaVariantImpl(VariantID vid) HEAT_3D_THREADS_PER_BLOCK_CUDA; HEAT_3D_NBLOCKS_CUDA; - poly_heat_3D_lam<<>>(N, + poly_heat_3D_lam + <<>>(N, [=] __device__ (Index_type i, Index_type j, Index_type k) { POLYBENCH_HEAT_3D_BODY1; } ); cudaErrchk( cudaGetLastError() ); - poly_heat_3D_lam<<>>(N, + poly_heat_3D_lam + <<>>(N, [=] __device__ (Index_type i, Index_type j, Index_type k) { POLYBENCH_HEAT_3D_BODY2; } diff --git a/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp b/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp index 2bee7df49..804ced7b2 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp +++ 
b/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp @@ -28,8 +28,11 @@ namespace polybench #define j_block_sz (block_size / k_block_sz) #define i_block_sz (1) +#define HEAT_3D_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \ + k_block_sz, j_block_sz, i_block_sz + #define HEAT_3D_THREADS_PER_BLOCK_HIP \ - dim3 nthreads_per_block(k_block_sz, j_block_sz, i_block_sz); + dim3 nthreads_per_block(HEAT_3D_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP); #define HEAT_3D_NBLOCKS_HIP \ dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(N-2, k_block_sz)), \ @@ -50,34 +53,39 @@ namespace polybench deallocHipDeviceData(B); +template < size_t k_block_size, size_t j_block_size, size_t i_block_size > +__launch_bounds__(k_block_size*j_block_size*i_block_size) __global__ void poly_heat_3D_1(Real_ptr A, Real_ptr B, Index_type N) { Index_type i = 1 + blockIdx.z; - Index_type j = 1 + blockIdx.y * blockDim.y + threadIdx.y; - Index_type k = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type j = 1 + blockIdx.y * j_block_size + threadIdx.y; + Index_type k = 1 + blockIdx.x * k_block_size + threadIdx.x; if (i < N-1 && j < N-1 && k < N-1) { POLYBENCH_HEAT_3D_BODY1; } } +template < size_t k_block_size, size_t j_block_size, size_t i_block_size > +__launch_bounds__(k_block_size*j_block_size*i_block_size) __global__ void poly_heat_3D_2(Real_ptr A, Real_ptr B, Index_type N) { Index_type i = 1 + blockIdx.z; - Index_type j = 1 + blockIdx.y * blockDim.y + threadIdx.y; - Index_type k = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type j = 1 + blockIdx.y * j_block_size + threadIdx.y; + Index_type k = 1 + blockIdx.x * k_block_size + threadIdx.x; if (i < N-1 && j < N-1 && k < N-1) { POLYBENCH_HEAT_3D_BODY2; } } -template< typename Lambda > +template< size_t k_block_size, size_t j_block_size, size_t i_block_size, typename Lambda > +__launch_bounds__(k_block_size*j_block_size*i_block_size) __global__ void poly_heat_3D_lam(Index_type N, Lambda body) { Index_type i = 1 + blockIdx.z; - Index_type j = 1 + blockIdx.y * blockDim.y + threadIdx.y; - Index_type k = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type j = 1 + blockIdx.y * j_block_size + threadIdx.y; + Index_type k = 1 + blockIdx.x * k_block_size + threadIdx.x; if (i < N-1 && j < N-1 && k < N-1) { body(i, j, k); @@ -104,13 +112,13 @@ void POLYBENCH_HEAT_3D::runHipVariantImpl(VariantID vid) HEAT_3D_THREADS_PER_BLOCK_HIP; HEAT_3D_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_heat_3D_1), + hipLaunchKernelGGL((poly_heat_3D_1), dim3(nblocks), dim3(nthreads_per_block), 0, 0, A, B, N); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_heat_3D_2), - dim3(nblocks), dim3(nthreads_per_block), 0, 0, + hipLaunchKernelGGL((poly_heat_3D_2), + dim3(nblocks), dim3(nthreads_per_block), 0, 0, A, B, N); hipErrchk( hipGetLastError() ); @@ -142,12 +150,14 @@ void POLYBENCH_HEAT_3D::runHipVariantImpl(VariantID vid) POLYBENCH_HEAT_3D_BODY2; }; - hipLaunchKernelGGL((poly_heat_3D_lam), + hipLaunchKernelGGL((poly_heat_3D_lam), dim3(nblocks), dim3(nthreads_per_block), 0, 0, N, poly_heat_3D_1_lambda); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_heat_3D_lam), + hipLaunchKernelGGL((poly_heat_3D_lam), dim3(nblocks), dim3(nthreads_per_block), 0, 0, N, poly_heat_3D_2_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/stream/DOT-Cuda.cpp b/src/stream/DOT-Cuda.cpp index 5d0b071bc..bdf27aa81 100644 --- a/src/stream/DOT-Cuda.cpp +++ b/src/stream/DOT-Cuda.cpp @@ -30,21 +30,23 @@ namespace stream deallocCudaDeviceData(a); \ deallocCudaDeviceData(b); +template < size_t block_size > 
+__launch_bounds__(block_size) __global__ void dot(Real_ptr a, Real_ptr b, Real_ptr dprod, Real_type dprod_init, Index_type iend) { extern __shared__ Real_type pdot[ ]; - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; pdot[ threadIdx.x ] = dprod_init; - for ( ; i < iend ; i += gridDim.x * blockDim.x ) { + for ( ; i < iend ; i += gridDim.x * block_size ) { pdot[ threadIdx.x ] += a[ i ] * b[i]; } __syncthreads(); - for ( i = blockDim.x / 2; i > 0; i /= 2 ) { + for ( i = block_size / 2; i > 0; i /= 2 ) { if ( threadIdx.x < i ) { pdot[ threadIdx.x ] += pdot[ threadIdx.x + i ]; } @@ -86,10 +88,8 @@ void DOT::runCudaVariantImpl(VariantID vid) initCudaDeviceData(dprod, &m_dot_init, 1); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - dot<<>>( a, b, - dprod, m_dot_init, - iend ); + dot<<>>( + a, b, dprod, m_dot_init, iend ); cudaErrchk( cudaGetLastError() ); Real_type lprod; diff --git a/src/stream/DOT-Hip.cpp b/src/stream/DOT-Hip.cpp index ec2827a05..7d5621ef6 100644 --- a/src/stream/DOT-Hip.cpp +++ b/src/stream/DOT-Hip.cpp @@ -30,21 +30,23 @@ namespace stream deallocHipDeviceData(a); \ deallocHipDeviceData(b); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void dot(Real_ptr a, Real_ptr b, Real_ptr dprod, Real_type dprod_init, Index_type iend) { HIP_DYNAMIC_SHARED( Real_type, pdot) - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; pdot[ threadIdx.x ] = dprod_init; - for ( ; i < iend ; i += gridDim.x * blockDim.x ) { + for ( ; i < iend ; i += gridDim.x * block_size ) { pdot[ threadIdx.x ] += a[ i ] * b[i]; } __syncthreads(); - for ( i = blockDim.x / 2; i > 0; i /= 2 ) { + for ( i = block_size / 2; i > 0; i /= 2 ) { if ( threadIdx.x < i ) { pdot[ threadIdx.x ] += pdot[ threadIdx.x + i ]; } @@ -87,9 +89,9 @@ void DOT::runHipVariantImpl(VariantID vid) initHipDeviceData(dprod, &m_dot_init, 1); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((dot), dim3(grid_size), dim3(block_size), sizeof(Real_type)*block_size, 0, a, b, - dprod, m_dot_init, - iend ); + hipLaunchKernelGGL((dot), dim3(grid_size), dim3(block_size), + sizeof(Real_type)*block_size, 0, + a, b, dprod, m_dot_init, iend ); hipErrchk( hipGetLastError() ); Real_type lprod; From df8475c66886b6ef853b628857c166052fe325c1 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 21 Oct 2021 15:41:29 -0700 Subject: [PATCH 140/392] Add block_size support to apps kernels --- src/apps/DEL_DOT_VEC_2D-Cuda.cpp | 18 ++++++++----- src/apps/DEL_DOT_VEC_2D-Hip.cpp | 18 ++++++++----- src/apps/DEL_DOT_VEC_2D.cpp | 10 +++++++ src/apps/DEL_DOT_VEC_2D.hpp | 9 +++++++ src/apps/ENERGY-Cuda.cpp | 18 ++++++++----- src/apps/ENERGY-Hip.cpp | 18 ++++++++----- src/apps/ENERGY.cpp | 10 +++++++ src/apps/ENERGY.hpp | 9 +++++++ src/apps/FIR-Cuda.cpp | 18 ++++++++----- src/apps/FIR-Hip.cpp | 18 ++++++++----- src/apps/FIR.cpp | 12 ++++++++- src/apps/FIR.hpp | 9 +++++++ src/apps/HALOEXCHANGE-Cuda.cpp | 18 ++++++++----- src/apps/HALOEXCHANGE-Hip.cpp | 18 ++++++++----- src/apps/HALOEXCHANGE.cpp | 10 +++++++ src/apps/HALOEXCHANGE.hpp | 9 +++++++ src/apps/HALOEXCHANGE_FUSED-Cuda.cpp | 18 ++++++++----- src/apps/HALOEXCHANGE_FUSED-Hip.cpp | 18 ++++++++----- src/apps/HALOEXCHANGE_FUSED.cpp | 10 +++++++ src/apps/HALOEXCHANGE_FUSED.hpp | 9 +++++++ src/apps/LTIMES-Cuda.cpp | 39 ++++++++++++++++++++-------- src/apps/LTIMES-Hip.cpp | 32 +++++++++++++++++------ src/apps/LTIMES.cpp | 
14 ++++++++-- src/apps/LTIMES.hpp | 9 +++++++ src/apps/LTIMES_NOVIEW-Cuda.cpp | 26 +++++++++++++++---- src/apps/LTIMES_NOVIEW-Hip.cpp | 38 +++++++++++++++++++-------- src/apps/LTIMES_NOVIEW.cpp | 12 ++++++++- src/apps/LTIMES_NOVIEW.hpp | 9 +++++++ src/apps/PRESSURE-Cuda.cpp | 18 ++++++++----- src/apps/PRESSURE-Hip.cpp | 18 ++++++++----- src/apps/PRESSURE.cpp | 10 +++++++ src/apps/PRESSURE.hpp | 9 +++++++ src/apps/VOL3D-Cuda.cpp | 18 ++++++++----- src/apps/VOL3D-Hip.cpp | 18 ++++++++----- src/apps/VOL3D.cpp | 10 +++++++ src/apps/VOL3D.hpp | 9 +++++++ 36 files changed, 429 insertions(+), 137 deletions(-) diff --git a/src/apps/DEL_DOT_VEC_2D-Cuda.cpp b/src/apps/DEL_DOT_VEC_2D-Cuda.cpp index 82973db44..10eb6a50c 100644 --- a/src/apps/DEL_DOT_VEC_2D-Cuda.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Cuda.cpp @@ -25,12 +25,6 @@ namespace rajaperf namespace apps { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define DEL_DOT_VEC_2D_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(x, m_x, m_array_length); \ allocAndInitCudaDeviceData(y, m_y, m_array_length); \ @@ -69,7 +63,8 @@ __global__ void deldotvec2d(Real_ptr div, } -void DEL_DOT_VEC_2D::runCudaVariant(VariantID vid) +template < size_t block_size > +void DEL_DOT_VEC_2D::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type iend = m_domain->n_real_zones; @@ -165,6 +160,15 @@ void DEL_DOT_VEC_2D::runCudaVariant(VariantID vid) } } +void DEL_DOT_VEC_2D::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n DEL_DOT_VEC_2D : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/DEL_DOT_VEC_2D-Hip.cpp b/src/apps/DEL_DOT_VEC_2D-Hip.cpp index ba97858c8..98f19e771 100644 --- a/src/apps/DEL_DOT_VEC_2D-Hip.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Hip.cpp @@ -25,12 +25,6 @@ namespace rajaperf namespace apps { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define DEL_DOT_VEC_2D_DATA_SETUP_HIP \ allocAndInitHipDeviceData(x, m_x, m_array_length); \ allocAndInitHipDeviceData(y, m_y, m_array_length); \ @@ -69,7 +63,8 @@ __global__ void deldotvec2d(Real_ptr div, } -void DEL_DOT_VEC_2D::runHipVariant(VariantID vid) +template < size_t block_size > +void DEL_DOT_VEC_2D::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type iend = m_domain->n_real_zones; @@ -167,6 +162,15 @@ void DEL_DOT_VEC_2D::runHipVariant(VariantID vid) } } +void DEL_DOT_VEC_2D::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n DEL_DOT_VEC_2D : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/DEL_DOT_VEC_2D.cpp b/src/apps/DEL_DOT_VEC_2D.cpp index cec0af410..992482b06 100644 --- a/src/apps/DEL_DOT_VEC_2D.cpp +++ b/src/apps/DEL_DOT_VEC_2D.cpp @@ -25,6 +25,10 @@ namespace apps DEL_DOT_VEC_2D::DEL_DOT_VEC_2D(const RunParams& params) : KernelBase(rajaperf::Apps_DEL_DOT_VEC_2D, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + setDefaultProblemSize(1000*1000); // See rzmax in ADomain struct setDefaultReps(100); @@ -103,5 +107,11 @@ void DEL_DOT_VEC_2D::tearDown(VariantID vid) deallocData(m_div); } +bool DEL_DOT_VEC_2D::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/DEL_DOT_VEC_2D.hpp b/src/apps/DEL_DOT_VEC_2D.hpp index 1a4d7670b..5b155de54 100644 --- a/src/apps/DEL_DOT_VEC_2D.hpp +++ b/src/apps/DEL_DOT_VEC_2D.hpp @@ -114,7 +114,16 @@ class DEL_DOT_VEC_2D : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Real_ptr m_x; Real_ptr m_y; Real_ptr m_xdot; diff --git a/src/apps/ENERGY-Cuda.cpp b/src/apps/ENERGY-Cuda.cpp index e8a97b99c..f1fce56d6 100644 --- a/src/apps/ENERGY-Cuda.cpp +++ b/src/apps/ENERGY-Cuda.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace apps { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define ENERGY_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(e_new, m_e_new, iend); \ allocAndInitCudaDeviceData(e_old, m_e_old, iend); \ @@ -137,7 +131,8 @@ __global__ void energycalc6(Real_ptr delvc, } -void ENERGY::runCudaVariant(VariantID vid) +template < size_t block_size > +void ENERGY::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -261,6 +256,15 @@ void ENERGY::runCudaVariant(VariantID vid) } } +void ENERGY::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n ENERGY : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/ENERGY-Hip.cpp b/src/apps/ENERGY-Hip.cpp index 96a1c759a..db658f844 100644 --- a/src/apps/ENERGY-Hip.cpp +++ b/src/apps/ENERGY-Hip.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace apps { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define ENERGY_DATA_SETUP_HIP \ allocAndInitHipDeviceData(e_new, m_e_new, iend); \ allocAndInitHipDeviceData(e_old, m_e_old, iend); \ @@ -137,7 +131,8 @@ __global__ void energycalc6(Real_ptr delvc, } -void ENERGY::runHipVariant(VariantID vid) +template < size_t block_size > +void ENERGY::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -255,6 +250,15 @@ void ENERGY::runHipVariant(VariantID vid) } } +void ENERGY::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n ENERGY : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/ENERGY.cpp b/src/apps/ENERGY.cpp index a6a779f8c..82ba4918e 100644 --- a/src/apps/ENERGY.cpp +++ b/src/apps/ENERGY.cpp @@ -21,6 +21,10 @@ namespace apps 
ENERGY::ENERGY(const RunParams& params) : KernelBase(rajaperf::Apps_ENERGY, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + setDefaultProblemSize(1000000); setDefaultReps(130); @@ -119,5 +123,11 @@ void ENERGY::tearDown(VariantID vid) deallocData(m_vnewc); } +bool ENERGY::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/ENERGY.hpp b/src/apps/ENERGY.hpp index 00a45de1d..d9443e4a1 100644 --- a/src/apps/ENERGY.hpp +++ b/src/apps/ENERGY.hpp @@ -204,7 +204,16 @@ class ENERGY : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Real_ptr m_e_new; Real_ptr m_e_old; Real_ptr m_delvc; diff --git a/src/apps/FIR-Cuda.cpp b/src/apps/FIR-Cuda.cpp index baff493ac..c455dc76d 100644 --- a/src/apps/FIR-Cuda.cpp +++ b/src/apps/FIR-Cuda.cpp @@ -25,12 +25,6 @@ namespace apps #define USE_CUDA_CONSTANT_MEMORY //#undef USE_CUDA_CONSTANT_MEMORY - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #if defined(USE_CUDA_CONSTANT_MEMORY) __constant__ Real_type coeff[FIR_COEFFLEN]; @@ -87,7 +81,8 @@ __global__ void fir(Real_ptr out, Real_ptr in, #endif -void FIR::runCudaVariant(VariantID vid) +template < size_t block_size > +void FIR::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -148,6 +143,15 @@ void FIR::runCudaVariant(VariantID vid) } } +void FIR::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n FIR : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/FIR-Hip.cpp b/src/apps/FIR-Hip.cpp index 4408a714c..f92cc47c2 100644 --- a/src/apps/FIR-Hip.cpp +++ b/src/apps/FIR-Hip.cpp @@ -25,12 +25,6 @@ namespace apps #define USE_HIP_CONSTANT_MEMORY // #undef USE_HIP_CONSTANT_MEMORY - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #if defined(USE_HIP_CONSTANT_MEMORY) __constant__ Real_type coeff[FIR_COEFFLEN]; @@ -87,7 +81,8 @@ __global__ void fir(Real_ptr out, Real_ptr in, #endif -void FIR::runHipVariant(VariantID vid) +template < size_t block_size > +void FIR::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -148,6 +143,15 @@ void FIR::runHipVariant(VariantID vid) } } +void FIR::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n FIR : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/FIR.cpp b/src/apps/FIR.cpp index fe3993cd9..f7976b0f0 100644 --- 
a/src/apps/FIR.cpp +++ b/src/apps/FIR.cpp @@ -21,6 +21,10 @@ namespace apps FIR::FIR(const RunParams& params) : KernelBase(rajaperf::Apps_FIR, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + setDefaultProblemSize(1000000); setDefaultReps(160); @@ -33,7 +37,7 @@ FIR::FIR(const RunParams& params) setBytesPerRep( (1*sizeof(Real_type) + 0*sizeof(Real_type)) * getItsPerRep() + (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() ); setFLOPsPerRep((2 * m_coefflen) * (getActualProblemSize() - m_coefflen)); - + checksum_scale_factor = 0.0001 * ( static_cast(getDefaultProblemSize()) / getActualProblemSize() ); @@ -81,5 +85,11 @@ void FIR::tearDown(VariantID vid) deallocData(m_out); } +bool FIR::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/FIR.hpp b/src/apps/FIR.hpp index e9b49edcb..3b798c6a9 100644 --- a/src/apps/FIR.hpp +++ b/src/apps/FIR.hpp @@ -79,7 +79,16 @@ class FIR : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Real_ptr m_in; Real_ptr m_out; diff --git a/src/apps/HALOEXCHANGE-Cuda.cpp b/src/apps/HALOEXCHANGE-Cuda.cpp index 4633b4a7f..d8eb1f28d 100644 --- a/src/apps/HALOEXCHANGE-Cuda.cpp +++ b/src/apps/HALOEXCHANGE-Cuda.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace apps { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define HALOEXCHANGE_DATA_SETUP_CUDA \ for (Index_type v = 0; v < m_num_vars; ++v) { \ allocAndInitCudaDeviceData(vars[v], m_vars[v], m_var_size); \ @@ -69,7 +63,8 @@ __global__ void haloexchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, } -void HALOEXCHANGE::runCudaVariant(VariantID vid) +template < size_t block_size > +void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -170,6 +165,15 @@ void HALOEXCHANGE::runCudaVariant(VariantID vid) } } +void HALOEXCHANGE::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n HALOEXCHANGE : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE-Hip.cpp b/src/apps/HALOEXCHANGE-Hip.cpp index fd6fac040..d2f2b8214 100644 --- a/src/apps/HALOEXCHANGE-Hip.cpp +++ b/src/apps/HALOEXCHANGE-Hip.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace apps { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define HALOEXCHANGE_DATA_SETUP_HIP \ for (Index_type v = 0; v < m_num_vars; ++v) { \ allocAndInitHipDeviceData(vars[v], m_vars[v], m_var_size); \ @@ -69,7 +63,8 @@ __global__ void haloexchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, } -void HALOEXCHANGE::runHipVariant(VariantID vid) +template < size_t block_size > +void 
HALOEXCHANGE::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -172,6 +167,15 @@ void HALOEXCHANGE::runHipVariant(VariantID vid) } } +void HALOEXCHANGE::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n HALOEXCHANGE : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE.cpp b/src/apps/HALOEXCHANGE.cpp index db7c7bb90..55109ad65 100644 --- a/src/apps/HALOEXCHANGE.cpp +++ b/src/apps/HALOEXCHANGE.cpp @@ -44,6 +44,10 @@ void destroy_unpack_lists(std::vector& unpack_index_lists, HALOEXCHANGE::HALOEXCHANGE(const RunParams& params) : KernelBase(rajaperf::Apps_HALOEXCHANGE, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + m_grid_dims_default[0] = 100; m_grid_dims_default[1] = 100; m_grid_dims_default[2] = 100; @@ -160,6 +164,12 @@ void HALOEXCHANGE::tearDown(VariantID vid) m_vars.clear(); } +bool HALOEXCHANGE::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + namespace { struct Extent diff --git a/src/apps/HALOEXCHANGE.hpp b/src/apps/HALOEXCHANGE.hpp index d10bd4790..8d833344c 100644 --- a/src/apps/HALOEXCHANGE.hpp +++ b/src/apps/HALOEXCHANGE.hpp @@ -94,7 +94,16 @@ class HALOEXCHANGE : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + static const int s_num_neighbors = 26; Index_type m_grid_dims[3]; diff --git a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp index 114ed61ba..3311c04bc 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace apps { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 1024; - - #define HALOEXCHANGE_FUSED_DATA_SETUP_CUDA \ for (Index_type v = 0; v < m_num_vars; ++v) { \ allocAndInitCudaDeviceData(vars[v], m_vars[v], m_var_size); \ @@ -112,7 +106,8 @@ __global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* } -void HALOEXCHANGE_FUSED::runCudaVariant(VariantID vid) +template < size_t block_size > +void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -271,6 +266,15 @@ void HALOEXCHANGE_FUSED::runCudaVariant(VariantID vid) } } +void HALOEXCHANGE_FUSED::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n HALOEXCHANGE_FUSED : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp index 1288f9429..bf9bb469a 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp +++ 
b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace apps { - // - // Define thread block size for HIP execution - // - const size_t block_size = 1024; - - #define HALOEXCHANGE_FUSED_DATA_SETUP_HIP \ for (Index_type v = 0; v < m_num_vars; ++v) { \ allocAndInitHipDeviceData(vars[v], m_vars[v], m_var_size); \ @@ -111,7 +105,8 @@ __global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* } -void HALOEXCHANGE_FUSED::runHipVariant(VariantID vid) +template < size_t block_size > +void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -274,6 +269,15 @@ void HALOEXCHANGE_FUSED::runHipVariant(VariantID vid) } } +void HALOEXCHANGE_FUSED::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n HALOEXCHANGE_FUSED : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE_FUSED.cpp b/src/apps/HALOEXCHANGE_FUSED.cpp index 6882cf51e..71ad45b49 100644 --- a/src/apps/HALOEXCHANGE_FUSED.cpp +++ b/src/apps/HALOEXCHANGE_FUSED.cpp @@ -44,6 +44,10 @@ void destroy_unpack_lists(std::vector& unpack_index_lists, HALOEXCHANGE_FUSED::HALOEXCHANGE_FUSED(const RunParams& params) : KernelBase(rajaperf::Apps_HALOEXCHANGE_FUSED, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + m_grid_dims_default[0] = 100; m_grid_dims_default[1] = 100; m_grid_dims_default[2] = 100; @@ -160,6 +164,12 @@ void HALOEXCHANGE_FUSED::tearDown(VariantID vid) m_vars.clear(); } +bool HALOEXCHANGE_FUSED::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + namespace { struct Extent diff --git a/src/apps/HALOEXCHANGE_FUSED.hpp b/src/apps/HALOEXCHANGE_FUSED.hpp index 68e81da59..5eba35edf 100644 --- a/src/apps/HALOEXCHANGE_FUSED.hpp +++ b/src/apps/HALOEXCHANGE_FUSED.hpp @@ -138,7 +138,16 @@ class HALOEXCHANGE_FUSED : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 1024; + using gpu_block_sizes_type = gpu_block_size::list_type; + static const int s_num_neighbors = 26; Index_type m_grid_dims[3]; diff --git a/src/apps/LTIMES-Cuda.cpp b/src/apps/LTIMES-Cuda.cpp index f1f47b5a2..feb78865a 100644 --- a/src/apps/LTIMES-Cuda.cpp +++ b/src/apps/LTIMES-Cuda.cpp @@ -22,14 +22,21 @@ namespace apps { // -// Define thread block size for CUDA execution +// Define thread block shape for CUDA execution // -constexpr size_t z_block_sz = 2; -constexpr size_t g_block_sz = 4; -constexpr size_t m_block_sz = 32; +#define m_block_sz (32) +// Note that z_block_sz = 2 is done for expedience, but +// ideally we would find g_block_sz, z_block_sz +// whole number factors of block_size / m_block_sz where +// g_block_sz * z_block_sz == block_size / m_block_sz, +// g_block_sz >= z_block_sz, and +// g_block_sz - z_block_sz is minimized +#define z_block_sz (2) +#define g_block_sz (block_size / 
m_block_sz / z_block_sz) #define LTIMES_THREADS_PER_BLOCK_CUDA \ - dim3 nthreads_per_block(m_block_sz, g_block_sz, z_block_sz); + dim3 nthreads_per_block(m_block_sz, g_block_sz, z_block_sz); \ + static_assert(m_block_sz*g_block_sz*z_block_sz == block_size, "Invalid block_size"); #define LTIMES_NBLOCKS_CUDA \ dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(num_m, m_block_sz)), \ @@ -49,14 +56,14 @@ constexpr size_t m_block_sz = 32; deallocCudaDeviceData(psidat); __global__ void ltimes(Real_ptr phidat, Real_ptr elldat, Real_ptr psidat, - Index_type num_d, + Index_type num_d, Index_type num_m, Index_type num_g, Index_type num_z) { Index_type m = blockIdx.x * blockDim.x + threadIdx.x; Index_type g = blockIdx.y * blockDim.y + threadIdx.y; Index_type z = blockIdx.z * blockDim.z + threadIdx.z; - if (m < num_m && g < num_g && z < num_z) { + if (m < num_m && g < num_g && z < num_z) { for (Index_type d = 0; d < num_d; ++d ) { LTIMES_BODY; } @@ -77,7 +84,8 @@ __global__ void ltimes_lam(Index_type num_m, Index_type num_g, Index_type num_z, } -void LTIMES::runCudaVariant(VariantID vid) +template < size_t block_size > +void LTIMES::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -94,7 +102,7 @@ void LTIMES::runCudaVariant(VariantID vid) LTIMES_NBLOCKS_CUDA; ltimes<<>>(phidat, elldat, psidat, - num_d, + num_d, num_m, num_g, num_z); cudaErrchk( cudaGetLastError() ); @@ -139,9 +147,9 @@ void LTIMES::runCudaVariant(VariantID vid) RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_z_direct, RAJA::statement::Tile<2, RAJA::tile_fixed, - RAJA::cuda_block_y_direct, + RAJA::cuda_block_y_direct, RAJA::statement::Tile<3, RAJA::tile_fixed, - RAJA::cuda_block_x_direct, + RAJA::cuda_block_x_direct, RAJA::statement::For<1, RAJA::cuda_thread_z_direct, //z RAJA::statement::For<2, RAJA::cuda_thread_y_direct, //g RAJA::statement::For<3, RAJA::cuda_thread_x_direct, //m @@ -178,6 +186,15 @@ void LTIMES::runCudaVariant(VariantID vid) } } +void LTIMES::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n LTIMES : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/LTIMES-Hip.cpp b/src/apps/LTIMES-Hip.cpp index 4d28aa028..be1fe60e1 100644 --- a/src/apps/LTIMES-Hip.cpp +++ b/src/apps/LTIMES-Hip.cpp @@ -22,11 +22,17 @@ namespace apps { // -// Define thread block size for Hip execution +// Define thread block shape for Hip execution // -constexpr size_t z_block_sz = 2; -constexpr size_t g_block_sz = 4; -constexpr size_t m_block_sz = 32; +#define m_block_sz (32) +// Note that z_block_sz = 2 is done for expedience, but +// ideally we would find g_block_sz, z_block_sz +// whole number factors of block_size / m_block_sz where +// g_block_sz * z_block_sz == block_size / m_block_sz, +// g_block_sz >= z_block_sz, and +// g_block_sz - z_block_sz is minimized +#define z_block_sz (2) +#define g_block_sz (block_size / m_block_sz / z_block_sz) #define LTIMES_THREADS_PER_BLOCK_HIP \ dim3 nthreads_per_block(m_block_sz, g_block_sz, z_block_sz); @@ -77,7 +83,8 @@ __global__ void ltimes_lam(Index_type num_m, Index_type num_g, Index_type num_z, } -void LTIMES::runHipVariant(VariantID vid) +template < size_t block_size > +void LTIMES::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -93,10 +100,10 @@ void LTIMES::runHipVariant(VariantID 
vid) LTIMES_THREADS_PER_BLOCK_HIP; LTIMES_NBLOCKS_HIP; - hipLaunchKernelGGL((ltimes), + hipLaunchKernelGGL((ltimes), dim3(nblocks), dim3(nthreads_per_block), 0, 0, phidat, elldat, psidat, - num_d, + num_d, num_m, num_g, num_z); hipErrchk( hipGetLastError() ); @@ -115,7 +122,7 @@ void LTIMES::runHipVariant(VariantID vid) LTIMES_THREADS_PER_BLOCK_HIP; LTIMES_NBLOCKS_HIP; - auto ltimes_lambda = + auto ltimes_lambda = [=] __device__ (Index_type z, Index_type g, Index_type m) { for (Index_type d = 0; d < num_d; ++d ) { LTIMES_BODY; @@ -183,6 +190,15 @@ void LTIMES::runHipVariant(VariantID vid) } } +void LTIMES::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n LTIMES : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/LTIMES.cpp b/src/apps/LTIMES.cpp index c69a2b300..886c7f24e 100644 --- a/src/apps/LTIMES.cpp +++ b/src/apps/LTIMES.cpp @@ -23,6 +23,10 @@ namespace apps LTIMES::LTIMES(const RunParams& params) : KernelBase(rajaperf::Apps_LTIMES, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + m_num_d_default = 64; m_num_z_default = 488; m_num_g_default = 32; @@ -31,7 +35,7 @@ LTIMES::LTIMES(const RunParams& params) setDefaultProblemSize(m_num_d_default * m_num_g_default * m_num_z_default); setDefaultReps(50); - m_num_z = std::max( getTargetProblemSize() / + m_num_z = std::max( getTargetProblemSize() / (m_num_d_default * m_num_g_default), Index_type(1) ); m_num_g = m_num_g_default; @@ -54,7 +58,7 @@ LTIMES::LTIMES(const RunParams& params) checksum_scale_factor = 0.001 * ( static_cast(getDefaultProblemSize()) / - getActualProblemSize() ); + getActualProblemSize() ); setUsesFeature(Kernel); setUsesFeature(View); @@ -104,5 +108,11 @@ void LTIMES::tearDown(VariantID vid) deallocData(m_psidat); } +bool LTIMES::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/LTIMES.hpp b/src/apps/LTIMES.hpp index 6177873be..c640dfe2d 100644 --- a/src/apps/LTIMES.hpp +++ b/src/apps/LTIMES.hpp @@ -117,7 +117,16 @@ class LTIMES : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Real_ptr m_phidat; Real_ptr m_elldat; Real_ptr m_psidat; diff --git a/src/apps/LTIMES_NOVIEW-Cuda.cpp b/src/apps/LTIMES_NOVIEW-Cuda.cpp index 1a7403ece..2760e10cf 100644 --- a/src/apps/LTIMES_NOVIEW-Cuda.cpp +++ b/src/apps/LTIMES_NOVIEW-Cuda.cpp @@ -22,11 +22,17 @@ namespace apps { // -// Define thread block size for CUDA execution +// Define thread block shape for CUDA execution // -constexpr size_t z_block_sz = 2; -constexpr size_t g_block_sz = 4; -constexpr size_t m_block_sz = 32; +#define m_block_sz (32) +// Note that z_block_sz = 2 is done for expedience, but +// ideally we would find g_block_sz, z_block_sz +// whole 
number factors of block_size / m_block_sz where +// g_block_sz * z_block_sz == block_size / m_block_sz, +// g_block_sz >= z_block_sz, and +// g_block_sz - z_block_sz is minimized +#define z_block_sz (2) +#define g_block_sz (block_size / m_block_sz / z_block_sz) #define LTIMES_NOVIEW_THREADS_PER_BLOCK_CUDA \ dim3 nthreads_per_block(m_block_sz, g_block_sz, z_block_sz); @@ -78,7 +84,8 @@ __global__ void ltimes_noview_lam(Index_type num_m, Index_type num_g, Index_type } -void LTIMES_NOVIEW::runCudaVariant(VariantID vid) +template < size_t block_size > +void LTIMES_NOVIEW::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -177,6 +184,15 @@ void LTIMES_NOVIEW::runCudaVariant(VariantID vid) } } +void LTIMES_NOVIEW::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n LTIMES_NOVIEW : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/LTIMES_NOVIEW-Hip.cpp b/src/apps/LTIMES_NOVIEW-Hip.cpp index 8f36737d8..f934af3b3 100644 --- a/src/apps/LTIMES_NOVIEW-Hip.cpp +++ b/src/apps/LTIMES_NOVIEW-Hip.cpp @@ -22,11 +22,17 @@ namespace apps { // -// Define thread block size for Hip execution +// Define thread block shape for Hip execution // -constexpr size_t z_block_sz = 2; -constexpr size_t g_block_sz = 4; -constexpr size_t m_block_sz = 32; +#define m_block_sz (32) +// Note that z_block_sz = 2 is done for expedience, but +// ideally we would find g_block_sz, z_block_sz +// whole number factors of block_size / m_block_sz where +// g_block_sz * z_block_sz == block_size / m_block_sz, +// g_block_sz >= z_block_sz, and +// g_block_sz - z_block_sz is minimized +#define z_block_sz (2) +#define g_block_sz (block_size / m_block_sz / z_block_sz) #define LTIMES_NOVIEW_THREADS_PER_BLOCK_HIP \ dim3 nthreads_per_block(m_block_sz, g_block_sz, z_block_sz); @@ -77,7 +83,8 @@ __global__ void ltimes_noview_lam(Index_type num_m, Index_type num_g, Index_type } -void LTIMES_NOVIEW::runHipVariant(VariantID vid) +template < size_t block_size > +void LTIMES_NOVIEW::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -93,10 +100,10 @@ void LTIMES_NOVIEW::runHipVariant(VariantID vid) LTIMES_NOVIEW_THREADS_PER_BLOCK_HIP; LTIMES_NOVIEW_NBLOCKS_HIP; - hipLaunchKernelGGL((ltimes_noview), - dim3(nblocks), dim3(nthreads_per_block), 0, 0, + hipLaunchKernelGGL((ltimes_noview), + dim3(nblocks), dim3(nthreads_per_block), 0, 0, phidat, elldat, psidat, - num_d, + num_d, num_m, num_g, num_z); hipErrchk( hipGetLastError() ); @@ -115,15 +122,15 @@ void LTIMES_NOVIEW::runHipVariant(VariantID vid) LTIMES_NOVIEW_THREADS_PER_BLOCK_HIP; LTIMES_NOVIEW_NBLOCKS_HIP; - auto ltimes_noview_lambda = + auto ltimes_noview_lambda = [=] __device__ (Index_type z, Index_type g, Index_type m) { for (Index_type d = 0; d < num_d; ++d ) { LTIMES_NOVIEW_BODY; } }; - hipLaunchKernelGGL((ltimes_noview_lam), - dim3(nblocks), dim3(nthreads_per_block), 0, 0, + hipLaunchKernelGGL((ltimes_noview_lam), + dim3(nblocks), dim3(nthreads_per_block), 0, 0, num_m, num_g, num_z, ltimes_noview_lambda); hipErrchk( hipGetLastError() ); @@ -182,6 +189,15 @@ void LTIMES_NOVIEW::runHipVariant(VariantID vid) } } +void LTIMES_NOVIEW::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n 
LTIMES_NOVIEW : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/LTIMES_NOVIEW.cpp b/src/apps/LTIMES_NOVIEW.cpp index 5d341cea3..da8d7e9e6 100644 --- a/src/apps/LTIMES_NOVIEW.cpp +++ b/src/apps/LTIMES_NOVIEW.cpp @@ -23,6 +23,10 @@ namespace apps LTIMES_NOVIEW::LTIMES_NOVIEW(const RunParams& params) : KernelBase(rajaperf::Apps_LTIMES_NOVIEW, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + m_num_d_default = 64; m_num_z_default = 488; m_num_g_default = 32; @@ -31,7 +35,7 @@ LTIMES_NOVIEW::LTIMES_NOVIEW(const RunParams& params) setDefaultProblemSize(m_num_d_default * m_num_g_default * m_num_z_default); setDefaultReps(50); - m_num_z = std::max( getTargetProblemSize() / + m_num_z = std::max( getTargetProblemSize() / (m_num_d_default * m_num_g_default), Index_type(1) ); m_num_g = m_num_g_default; @@ -103,5 +107,11 @@ void LTIMES_NOVIEW::tearDown(VariantID vid) deallocData(m_psidat); } +bool LTIMES_NOVIEW::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/LTIMES_NOVIEW.hpp b/src/apps/LTIMES_NOVIEW.hpp index 24c524ecc..7565a2640 100644 --- a/src/apps/LTIMES_NOVIEW.hpp +++ b/src/apps/LTIMES_NOVIEW.hpp @@ -67,7 +67,16 @@ class LTIMES_NOVIEW : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Real_ptr m_phidat; Real_ptr m_elldat; Real_ptr m_psidat; diff --git a/src/apps/PRESSURE-Cuda.cpp b/src/apps/PRESSURE-Cuda.cpp index b0d5ab615..c22f07f00 100644 --- a/src/apps/PRESSURE-Cuda.cpp +++ b/src/apps/PRESSURE-Cuda.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace apps { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define PRESSURE_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(compression, m_compression, iend); \ allocAndInitCudaDeviceData(bvc, m_bvc, iend); \ @@ -65,7 +59,8 @@ __global__ void pressurecalc2(Real_ptr p_new, Real_ptr bvc, Real_ptr e_old, } -void PRESSURE::runCudaVariant(VariantID vid) +template < size_t block_size > +void PRESSURE::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -137,6 +132,15 @@ void PRESSURE::runCudaVariant(VariantID vid) } } +void PRESSURE::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n PRESSURE : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/PRESSURE-Hip.cpp b/src/apps/PRESSURE-Hip.cpp index 646fbc703..cdf62422f 100644 --- a/src/apps/PRESSURE-Hip.cpp +++ b/src/apps/PRESSURE-Hip.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace apps { - // - // Define thread block size for HIP execution - // - 
const size_t block_size = 256; - - #define PRESSURE_DATA_SETUP_HIP \ allocAndInitHipDeviceData(compression, m_compression, iend); \ allocAndInitHipDeviceData(bvc, m_bvc, iend); \ @@ -65,7 +59,8 @@ __global__ void pressurecalc2(Real_ptr p_new, Real_ptr bvc, Real_ptr e_old, } -void PRESSURE::runHipVariant(VariantID vid) +template < size_t block_size > +void PRESSURE::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -130,6 +125,15 @@ void PRESSURE::runHipVariant(VariantID vid) } } +void PRESSURE::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n PRESSURE : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/PRESSURE.cpp b/src/apps/PRESSURE.cpp index b4ef1d72c..6793fb7fb 100644 --- a/src/apps/PRESSURE.cpp +++ b/src/apps/PRESSURE.cpp @@ -21,6 +21,10 @@ namespace apps PRESSURE::PRESSURE(const RunParams& params) : KernelBase(rajaperf::Apps_PRESSURE, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + setDefaultProblemSize(1000000); setDefaultReps(700); @@ -88,5 +92,11 @@ void PRESSURE::tearDown(VariantID vid) deallocData(m_vnewc); } +bool PRESSURE::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/PRESSURE.hpp b/src/apps/PRESSURE.hpp index 44c6602fa..032ec70ab 100644 --- a/src/apps/PRESSURE.hpp +++ b/src/apps/PRESSURE.hpp @@ -73,7 +73,16 @@ class PRESSURE : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Real_ptr m_compression; Real_ptr m_bvc; Real_ptr m_p_new; diff --git a/src/apps/VOL3D-Cuda.cpp b/src/apps/VOL3D-Cuda.cpp index 79db31282..b6e83ff53 100644 --- a/src/apps/VOL3D-Cuda.cpp +++ b/src/apps/VOL3D-Cuda.cpp @@ -23,12 +23,6 @@ namespace rajaperf namespace apps { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define VOL3D_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(x, m_x, m_array_length); \ allocAndInitCudaDeviceData(y, m_y, m_array_length); \ @@ -66,7 +60,8 @@ __global__ void vol3d(Real_ptr vol, } -void VOL3D::runCudaVariant(VariantID vid) +template < size_t block_size > +void VOL3D::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = m_domain->fpz; @@ -126,6 +121,15 @@ void VOL3D::runCudaVariant(VariantID vid) } } +void VOL3D::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n VOL3D : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/VOL3D-Hip.cpp 
b/src/apps/VOL3D-Hip.cpp index 978c794ce..47c9495a7 100644 --- a/src/apps/VOL3D-Hip.cpp +++ b/src/apps/VOL3D-Hip.cpp @@ -23,12 +23,6 @@ namespace rajaperf namespace apps { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define VOL3D_DATA_SETUP_HIP \ allocAndInitHipDeviceData(x, m_x, m_array_length); \ allocAndInitHipDeviceData(y, m_y, m_array_length); \ @@ -66,7 +60,8 @@ __global__ void vol3d(Real_ptr vol, } -void VOL3D::runHipVariant(VariantID vid) +template < size_t block_size > +void VOL3D::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = m_domain->fpz; @@ -126,6 +121,15 @@ void VOL3D::runHipVariant(VariantID vid) } } +void VOL3D::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n VOL3D : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/VOL3D.cpp b/src/apps/VOL3D.cpp index a8ac3bbc6..df532ef70 100644 --- a/src/apps/VOL3D.cpp +++ b/src/apps/VOL3D.cpp @@ -25,6 +25,10 @@ namespace apps VOL3D::VOL3D(const RunParams& params) : KernelBase(rajaperf::Apps_VOL3D, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + setDefaultProblemSize(100*100*100); // See rzmax in ADomain struct setDefaultReps(100); @@ -102,5 +106,11 @@ void VOL3D::tearDown(VariantID vid) deallocData(m_vol); } +bool VOL3D::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/VOL3D.hpp b/src/apps/VOL3D.hpp index 6faf02523..dae03eb77 100644 --- a/src/apps/VOL3D.hpp +++ b/src/apps/VOL3D.hpp @@ -170,7 +170,16 @@ class VOL3D : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Real_ptr m_x; Real_ptr m_y; Real_ptr m_z; From afb54bbe29d52f1574523796ad6faa6bf5e5fc0d Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 21 Oct 2021 15:41:50 -0700 Subject: [PATCH 141/392] Template apps gpu kernels on block_size --- src/apps/DEL_DOT_VEC_2D-Cuda.cpp | 8 ++++--- src/apps/DEL_DOT_VEC_2D-Hip.cpp | 8 ++++--- src/apps/ENERGY-Cuda.cpp | 36 ++++++++++++++++++---------- src/apps/ENERGY-Hip.cpp | 36 ++++++++++++++++++---------- src/apps/FIR-Cuda.cpp | 12 ++++++---- src/apps/FIR-Hip.cpp | 12 ++++++---- src/apps/HALOEXCHANGE-Cuda.cpp | 12 ++++++---- src/apps/HALOEXCHANGE-Hip.cpp | 12 ++++++---- src/apps/HALOEXCHANGE_FUSED-Cuda.cpp | 16 ++++++++----- src/apps/HALOEXCHANGE_FUSED-Hip.cpp | 16 ++++++++----- src/apps/LTIMES-Cuda.cpp | 28 ++++++++++++++-------- src/apps/LTIMES-Hip.cpp | 26 ++++++++++++-------- src/apps/LTIMES_NOVIEW-Cuda.cpp | 28 ++++++++++++++-------- src/apps/LTIMES_NOVIEW-Hip.cpp | 26 ++++++++++++-------- src/apps/PRESSURE-Cuda.cpp | 12 ++++++---- src/apps/PRESSURE-Hip.cpp | 12 ++++++---- src/apps/VOL3D-Cuda.cpp | 6 +++-- 
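The pattern applied throughout this patch is to make block_size a template parameter of each kernel and annotate the kernel with __launch_bounds__(block_size), so the launch configuration and the compiler's occupancy assumptions agree. A minimal CUDA sketch of that pattern follows; axpy and launch_axpy are made-up names used only for illustration, not kernels from the suite.

    #include <cstddef>

    // Kernel templated on its block size; __launch_bounds__ then matches the
    // launch configuration exactly, letting the compiler tune register usage.
    template < size_t block_size >
    __launch_bounds__(block_size)
    __global__ void axpy(double a, const double* x, double* y, size_t n)
    {
      // block_size is a compile-time constant, so no read of blockDim.x is needed.
      size_t i = blockIdx.x * block_size + threadIdx.x;
      if (i < n) {
        y[i] = a * x[i] + y[i];
      }
    }

    template < size_t block_size >
    void launch_axpy(double a, const double* x, double* y, size_t n)
    {
      const size_t grid_size = (n + block_size - 1) / block_size;
      axpy<block_size><<<grid_size, block_size>>>(a, x, y, n);
    }

Because the index computation uses the template parameter rather than blockDim.x, replacing blockDim.x with block_size in the hunks below is a behavior-preserving change whenever the kernel is launched with that same block size.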
src/apps/VOL3D-Hip.cpp | 6 +++-- 18 files changed, 202 insertions(+), 110 deletions(-) diff --git a/src/apps/DEL_DOT_VEC_2D-Cuda.cpp b/src/apps/DEL_DOT_VEC_2D-Cuda.cpp index 10eb6a50c..36db2a9b8 100644 --- a/src/apps/DEL_DOT_VEC_2D-Cuda.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Cuda.cpp @@ -42,6 +42,8 @@ namespace apps deallocCudaDeviceData(div); \ deallocCudaDeviceData(real_zones); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void deldotvec2d(Real_ptr div, const Real_ptr x1, const Real_ptr x2, const Real_ptr x3, const Real_ptr x4, @@ -55,7 +57,7 @@ __global__ void deldotvec2d(Real_ptr div, const Real_type half, const Real_type ptiny, Index_type iend) { - Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; + Index_type ii = blockIdx.x * block_size + threadIdx.x; if (ii < iend) { DEL_DOT_VEC_2D_BODY_INDEX; DEL_DOT_VEC_2D_BODY; @@ -85,7 +87,7 @@ void DEL_DOT_VEC_2D::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - deldotvec2d<<>>(div, + deldotvec2d<<>>(div, x1, x2, x3, x4, y1, y2, y3, y4, fx1, fx2, fx3, fx4, @@ -114,7 +116,7 @@ void DEL_DOT_VEC_2D::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - lambda_cuda_forall<<>>( + lambda_cuda_forall<<>>( 0, iend, [=] __device__ (Index_type ii) { diff --git a/src/apps/DEL_DOT_VEC_2D-Hip.cpp b/src/apps/DEL_DOT_VEC_2D-Hip.cpp index 98f19e771..af36cbd43 100644 --- a/src/apps/DEL_DOT_VEC_2D-Hip.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Hip.cpp @@ -42,6 +42,8 @@ namespace apps deallocHipDeviceData(div); \ deallocHipDeviceData(real_zones); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void deldotvec2d(Real_ptr div, const Real_ptr x1, const Real_ptr x2, const Real_ptr x3, const Real_ptr x4, @@ -55,7 +57,7 @@ __global__ void deldotvec2d(Real_ptr div, const Real_type half, const Real_type ptiny, Index_type iend) { - Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; + Index_type ii = blockIdx.x * block_size + threadIdx.x; if (ii < iend) { DEL_DOT_VEC_2D_BODY_INDEX; DEL_DOT_VEC_2D_BODY; @@ -85,7 +87,7 @@ void DEL_DOT_VEC_2D::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((deldotvec2d), dim3(grid_size), dim3(block_size), 0, 0, div, + hipLaunchKernelGGL((deldotvec2d), dim3(grid_size), dim3(block_size), 0, 0, div, x1, x2, x3, x4, y1, y2, y3, y4, fx1, fx2, fx3, fx4, @@ -120,7 +122,7 @@ void DEL_DOT_VEC_2D::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL(lambda_hip_forall, + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, 0, iend, deldotvec2d_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/apps/ENERGY-Cuda.cpp b/src/apps/ENERGY-Cuda.cpp index f1fce56d6..de83860e4 100644 --- a/src/apps/ENERGY-Cuda.cpp +++ b/src/apps/ENERGY-Cuda.cpp @@ -57,16 +57,20 @@ namespace apps deallocCudaDeviceData(qq_old); \ deallocCudaDeviceData(vnewc); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void energycalc1(Real_ptr e_new, Real_ptr e_old, Real_ptr delvc, Real_ptr p_old, Real_ptr q_old, Real_ptr work, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { ENERGY_BODY1; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void energycalc2(Real_ptr delvc, Real_ptr q_new, Real_ptr compHalfStep, Real_ptr pHalfStep, 
Real_ptr e_new, Real_ptr bvc, Real_ptr pbvc, @@ -74,33 +78,39 @@ __global__ void energycalc2(Real_ptr delvc, Real_ptr q_new, Real_type rho0, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { ENERGY_BODY2; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void energycalc3(Real_ptr e_new, Real_ptr delvc, Real_ptr p_old, Real_ptr q_old, Real_ptr pHalfStep, Real_ptr q_new, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { ENERGY_BODY3; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void energycalc4(Real_ptr e_new, Real_ptr work, Real_type e_cut, Real_type emin, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { ENERGY_BODY4; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void energycalc5(Real_ptr delvc, Real_ptr pbvc, Real_ptr e_new, Real_ptr vnewc, Real_ptr bvc, Real_ptr p_new, @@ -110,12 +120,14 @@ __global__ void energycalc5(Real_ptr delvc, Real_type rho0, Real_type e_cut, Real_type emin, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { ENERGY_BODY5; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void energycalc6(Real_ptr delvc, Real_ptr pbvc, Real_ptr e_new, Real_ptr vnewc, Real_ptr bvc, Real_ptr p_new, @@ -124,7 +136,7 @@ __global__ void energycalc6(Real_ptr delvc, Real_type rho0, Real_type q_cut, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { ENERGY_BODY6; } @@ -149,12 +161,12 @@ void ENERGY::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - energycalc1<<>>( e_new, e_old, delvc, + energycalc1<<>>( e_new, e_old, delvc, p_old, q_old, work, iend ); cudaErrchk( cudaGetLastError() ); - energycalc2<<>>( delvc, q_new, + energycalc2<<>>( delvc, q_new, compHalfStep, pHalfStep, e_new, bvc, pbvc, ql_old, qq_old, @@ -162,18 +174,18 @@ void ENERGY::runCudaVariantImpl(VariantID vid) iend ); cudaErrchk( cudaGetLastError() ); - energycalc3<<>>( e_new, delvc, + energycalc3<<>>( e_new, delvc, p_old, q_old, pHalfStep, q_new, iend ); cudaErrchk( cudaGetLastError() ); - energycalc4<<>>( e_new, work, + energycalc4<<>>( e_new, work, e_cut, emin, iend ); cudaErrchk( cudaGetLastError() ); - energycalc5<<>>( delvc, + energycalc5<<>>( delvc, pbvc, e_new, vnewc, bvc, p_new, ql_old, qq_old, @@ -183,7 +195,7 @@ void ENERGY::runCudaVariantImpl(VariantID vid) iend ); cudaErrchk( cudaGetLastError() ); - energycalc6<<>>( delvc, + energycalc6<<>>( delvc, pbvc, e_new, vnewc, bvc, p_new, q_new, diff --git a/src/apps/ENERGY-Hip.cpp b/src/apps/ENERGY-Hip.cpp index db658f844..9cd95310b 100644 --- a/src/apps/ENERGY-Hip.cpp +++ b/src/apps/ENERGY-Hip.cpp @@ -57,16 +57,20 @@ namespace apps deallocHipDeviceData(qq_old); \ deallocHipDeviceData(vnewc); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void energycalc1(Real_ptr e_new, Real_ptr e_old, Real_ptr delvc, Real_ptr p_old, Real_ptr q_old, Real_ptr work, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { 
ENERGY_BODY1; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void energycalc2(Real_ptr delvc, Real_ptr q_new, Real_ptr compHalfStep, Real_ptr pHalfStep, Real_ptr e_new, Real_ptr bvc, Real_ptr pbvc, @@ -74,33 +78,39 @@ __global__ void energycalc2(Real_ptr delvc, Real_ptr q_new, Real_type rho0, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { ENERGY_BODY2; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void energycalc3(Real_ptr e_new, Real_ptr delvc, Real_ptr p_old, Real_ptr q_old, Real_ptr pHalfStep, Real_ptr q_new, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { ENERGY_BODY3; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void energycalc4(Real_ptr e_new, Real_ptr work, Real_type e_cut, Real_type emin, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { ENERGY_BODY4; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void energycalc5(Real_ptr delvc, Real_ptr pbvc, Real_ptr e_new, Real_ptr vnewc, Real_ptr bvc, Real_ptr p_new, @@ -110,12 +120,14 @@ __global__ void energycalc5(Real_ptr delvc, Real_type rho0, Real_type e_cut, Real_type emin, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { ENERGY_BODY5; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void energycalc6(Real_ptr delvc, Real_ptr pbvc, Real_ptr e_new, Real_ptr vnewc, Real_ptr bvc, Real_ptr p_new, @@ -124,7 +136,7 @@ __global__ void energycalc6(Real_ptr delvc, Real_type rho0, Real_type q_cut, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { ENERGY_BODY6; } @@ -149,12 +161,12 @@ void ENERGY::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((energycalc1), dim3(grid_size), dim3(block_size), 0, 0, e_new, e_old, delvc, + hipLaunchKernelGGL((energycalc1), dim3(grid_size), dim3(block_size), 0, 0, e_new, e_old, delvc, p_old, q_old, work, iend ); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((energycalc2), dim3(grid_size), dim3(block_size), 0, 0, delvc, q_new, + hipLaunchKernelGGL((energycalc2), dim3(grid_size), dim3(block_size), 0, 0, delvc, q_new, compHalfStep, pHalfStep, e_new, bvc, pbvc, ql_old, qq_old, @@ -162,18 +174,18 @@ void ENERGY::runHipVariantImpl(VariantID vid) iend ); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((energycalc3), dim3(grid_size), dim3(block_size), 0, 0, e_new, delvc, + hipLaunchKernelGGL((energycalc3), dim3(grid_size), dim3(block_size), 0, 0, e_new, delvc, p_old, q_old, pHalfStep, q_new, iend ); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((energycalc4), dim3(grid_size), dim3(block_size), 0, 0, e_new, work, + hipLaunchKernelGGL((energycalc4), dim3(grid_size), dim3(block_size), 0, 0, e_new, work, e_cut, emin, iend ); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((energycalc5), dim3(grid_size), dim3(block_size), 0, 0, delvc, + hipLaunchKernelGGL((energycalc5), dim3(grid_size), dim3(block_size), 0, 0, delvc, pbvc, e_new, vnewc, bvc, p_new, ql_old, qq_old, @@ -183,7 +195,7 @@ void 
ENERGY::runHipVariantImpl(VariantID vid) iend ); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((energycalc6), dim3(grid_size), dim3(block_size), 0, 0, delvc, + hipLaunchKernelGGL((energycalc6), dim3(grid_size), dim3(block_size), 0, 0, delvc, pbvc, e_new, vnewc, bvc, p_new, q_new, diff --git a/src/apps/FIR-Cuda.cpp b/src/apps/FIR-Cuda.cpp index c455dc76d..615504c22 100644 --- a/src/apps/FIR-Cuda.cpp +++ b/src/apps/FIR-Cuda.cpp @@ -40,11 +40,13 @@ __constant__ Real_type coeff[FIR_COEFFLEN]; deallocCudaDeviceData(in); \ deallocCudaDeviceData(out); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void fir(Real_ptr out, Real_ptr in, const Index_type coefflen, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { FIR_BODY; } @@ -67,12 +69,14 @@ __global__ void fir(Real_ptr out, Real_ptr in, deallocCudaDeviceData(out); \ deallocCudaDeviceData(coeff); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void fir(Real_ptr out, Real_ptr in, Real_ptr coeff, const Index_type coefflen, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { FIR_BODY; } @@ -102,12 +106,12 @@ void FIR::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); #if defined(USE_CUDA_CONSTANT_MEMORY) - fir<<>>( out, in, + fir<<>>( out, in, coefflen, iend ); cudaErrchk( cudaGetLastError() ); #else - fir<<>>( out, in, + fir<<>>( out, in, coeff, coefflen, iend ); diff --git a/src/apps/FIR-Hip.cpp b/src/apps/FIR-Hip.cpp index f92cc47c2..672fde092 100644 --- a/src/apps/FIR-Hip.cpp +++ b/src/apps/FIR-Hip.cpp @@ -40,11 +40,13 @@ __constant__ Real_type coeff[FIR_COEFFLEN]; deallocHipDeviceData(in); \ deallocHipDeviceData(out); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void fir(Real_ptr out, Real_ptr in, const Index_type coefflen, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { FIR_BODY; } @@ -67,12 +69,14 @@ __global__ void fir(Real_ptr out, Real_ptr in, deallocHipDeviceData(out); \ deallocHipDeviceData(coeff); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void fir(Real_ptr out, Real_ptr in, Real_ptr coeff, const Index_type coefflen, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { FIR_BODY; } @@ -102,12 +106,12 @@ void FIR::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); #if defined(USE_HIP_CONSTANT_MEMORY) - hipLaunchKernelGGL((fir), dim3(grid_size), dim3(block_size), 0, 0, out, in, + hipLaunchKernelGGL((fir), dim3(grid_size), dim3(block_size), 0, 0, out, in, coefflen, iend ); hipErrchk( hipGetLastError() ); #else - hipLaunchKernelGGL((fir), dim3(grid_size), dim3(block_size), 0, 0, out, in, + hipLaunchKernelGGL((fir), dim3(grid_size), dim3(block_size), 0, 0, out, in, coeff, coefflen, iend ); diff --git a/src/apps/HALOEXCHANGE-Cuda.cpp b/src/apps/HALOEXCHANGE-Cuda.cpp index d8eb1f28d..28ae8397f 100644 --- a/src/apps/HALOEXCHANGE-Cuda.cpp +++ b/src/apps/HALOEXCHANGE-Cuda.cpp @@ -42,20 +42,24 @@ namespace apps deallocCudaDeviceData(vars[v]); \ } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void haloexchange_pack(Real_ptr 
buffer, Int_ptr list, Real_ptr var, Index_type len) { - Index_type i = threadIdx.x + blockIdx.x * blockDim.x; + Index_type i = threadIdx.x + blockIdx.x * block_size; if (i < len) { HALOEXCHANGE_PACK_BODY; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void haloexchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type len) { - Index_type i = threadIdx.x + blockIdx.x * blockDim.x; + Index_type i = threadIdx.x + blockIdx.x * block_size; if (i < len) { HALOEXCHANGE_UNPACK_BODY; @@ -85,7 +89,7 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) Real_ptr var = vars[v]; dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); - haloexchange_pack<<>>(buffer, list, var, len); + haloexchange_pack<<>>(buffer, list, var, len); cudaErrchk( cudaGetLastError() ); buffer += len; } @@ -100,7 +104,7 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) Real_ptr var = vars[v]; dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); - haloexchange_unpack<<>>(buffer, list, var, len); + haloexchange_unpack<<>>(buffer, list, var, len); cudaErrchk( cudaGetLastError() ); buffer += len; } diff --git a/src/apps/HALOEXCHANGE-Hip.cpp b/src/apps/HALOEXCHANGE-Hip.cpp index d2f2b8214..f644021f1 100644 --- a/src/apps/HALOEXCHANGE-Hip.cpp +++ b/src/apps/HALOEXCHANGE-Hip.cpp @@ -42,20 +42,24 @@ namespace apps deallocHipDeviceData(vars[v]); \ } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void haloexchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type len) { - Index_type i = threadIdx.x + blockIdx.x * blockDim.x; + Index_type i = threadIdx.x + blockIdx.x * block_size; if (i < len) { HALOEXCHANGE_PACK_BODY; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void haloexchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type len) { - Index_type i = threadIdx.x + blockIdx.x * blockDim.x; + Index_type i = threadIdx.x + blockIdx.x * block_size; if (i < len) { HALOEXCHANGE_UNPACK_BODY; @@ -85,7 +89,7 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid) Real_ptr var = vars[v]; dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); - hipLaunchKernelGGL((haloexchange_pack), nblocks, nthreads_per_block, 0, 0, + hipLaunchKernelGGL((haloexchange_pack), nblocks, nthreads_per_block, 0, 0, buffer, list, var, len); hipErrchk( hipGetLastError() ); buffer += len; @@ -101,7 +105,7 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid) Real_ptr var = vars[v]; dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); - hipLaunchKernelGGL((haloexchange_unpack), nblocks, nthreads_per_block, 0, 0, + hipLaunchKernelGGL((haloexchange_unpack), nblocks, nthreads_per_block, 0, 0, buffer, list, var, len); hipErrchk( hipGetLastError() ); buffer += len; diff --git a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp index 3311c04bc..a802640c5 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp @@ -71,6 +71,8 @@ namespace apps deallocCudaPinnedData(unpack_var_ptrs); \ deallocCudaPinnedData(unpack_len_ptrs); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void haloexchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pack_list_ptrs, Real_ptr* pack_var_ptrs, Index_type* pack_len_ptrs) { @@ -81,13 +83,15 @@ __global__ void haloexchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pac Real_ptr var = 
pack_var_ptrs[j]; Index_type len = pack_len_ptrs[j]; - for (Index_type i = threadIdx.x + blockIdx.x * blockDim.x; + for (Index_type i = threadIdx.x + blockIdx.x * block_size; i < len; - i += blockDim.x * gridDim.x) { + i += block_size * gridDim.x) { HALOEXCHANGE_FUSED_PACK_BODY; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* unpack_list_ptrs, Real_ptr* unpack_var_ptrs, Index_type* unpack_len_ptrs) { @@ -98,9 +102,9 @@ __global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* Real_ptr var = unpack_var_ptrs[j]; Index_type len = unpack_len_ptrs[j]; - for (Index_type i = threadIdx.x + blockIdx.x * blockDim.x; + for (Index_type i = threadIdx.x + blockIdx.x * block_size; i < len; - i += blockDim.x * gridDim.x) { + i += block_size * gridDim.x) { HALOEXCHANGE_FUSED_UNPACK_BODY; } } @@ -143,7 +147,7 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; dim3 pack_nthreads_per_block(block_size); dim3 pack_nblocks((pack_len_ave + block_size-1) / block_size, pack_index); - haloexchange_fused_pack<<>>( + haloexchange_fused_pack<<>>( pack_buffer_ptrs, pack_list_ptrs, pack_var_ptrs, pack_len_ptrs); cudaErrchk( cudaGetLastError() ); synchronize(); @@ -169,7 +173,7 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / unpack_index; dim3 unpack_nthreads_per_block(block_size); dim3 unpack_nblocks((unpack_len_ave + block_size-1) / block_size, unpack_index); - haloexchange_fused_unpack<<>>( + haloexchange_fused_unpack<<>>( unpack_buffer_ptrs, unpack_list_ptrs, unpack_var_ptrs, unpack_len_ptrs); cudaErrchk( cudaGetLastError() ); synchronize(); diff --git a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp index bf9bb469a..c8c57f844 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp @@ -70,6 +70,8 @@ namespace apps deallocHipPinnedData(unpack_var_ptrs); \ deallocHipPinnedData(unpack_len_ptrs); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void haloexchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pack_list_ptrs, Real_ptr* pack_var_ptrs, Index_type* pack_len_ptrs) { @@ -80,13 +82,15 @@ __global__ void haloexchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pac Real_ptr var = pack_var_ptrs[j]; Index_type len = pack_len_ptrs[j]; - for (Index_type i = threadIdx.x + blockIdx.x * blockDim.x; + for (Index_type i = threadIdx.x + blockIdx.x * block_size; i < len; - i += blockDim.x * gridDim.x) { + i += block_size * gridDim.x) { HALOEXCHANGE_FUSED_PACK_BODY; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* unpack_list_ptrs, Real_ptr* unpack_var_ptrs, Index_type* unpack_len_ptrs) { @@ -97,9 +101,9 @@ __global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* Real_ptr var = unpack_var_ptrs[j]; Index_type len = unpack_len_ptrs[j]; - for (Index_type i = threadIdx.x + blockIdx.x * blockDim.x; + for (Index_type i = threadIdx.x + blockIdx.x * block_size; i < len; - i += blockDim.x * gridDim.x) { + i += block_size * gridDim.x) { HALOEXCHANGE_FUSED_UNPACK_BODY; } } @@ -142,7 +146,7 @@ void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; dim3 
pack_nthreads_per_block(block_size); dim3 pack_nblocks((pack_len_ave + block_size-1) / block_size, pack_index); - hipLaunchKernelGGL((haloexchange_fused_pack), pack_nblocks, pack_nthreads_per_block, 0, 0, + hipLaunchKernelGGL((haloexchange_fused_pack), pack_nblocks, pack_nthreads_per_block, 0, 0, pack_buffer_ptrs, pack_list_ptrs, pack_var_ptrs, pack_len_ptrs); hipErrchk( hipGetLastError() ); synchronize(); @@ -168,7 +172,7 @@ void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / unpack_index; dim3 unpack_nthreads_per_block(block_size); dim3 unpack_nblocks((unpack_len_ave + block_size-1) / block_size, unpack_index); - hipLaunchKernelGGL((haloexchange_fused_unpack), unpack_nblocks, unpack_nthreads_per_block, 0, 0, + hipLaunchKernelGGL((haloexchange_fused_unpack), unpack_nblocks, unpack_nthreads_per_block, 0, 0, unpack_buffer_ptrs, unpack_list_ptrs, unpack_var_ptrs, unpack_len_ptrs); hipErrchk( hipGetLastError() ); synchronize(); diff --git a/src/apps/LTIMES-Cuda.cpp b/src/apps/LTIMES-Cuda.cpp index feb78865a..1feedb880 100644 --- a/src/apps/LTIMES-Cuda.cpp +++ b/src/apps/LTIMES-Cuda.cpp @@ -34,8 +34,11 @@ namespace apps #define z_block_sz (2) #define g_block_sz (block_size / m_block_sz / z_block_sz) +#define LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \ + m_block_sz, g_block_sz, z_block_sz + #define LTIMES_THREADS_PER_BLOCK_CUDA \ - dim3 nthreads_per_block(m_block_sz, g_block_sz, z_block_sz); \ + dim3 nthreads_per_block(LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA); \ static_assert(m_block_sz*g_block_sz*z_block_sz == block_size, "Invalid block_size"); #define LTIMES_NBLOCKS_CUDA \ @@ -55,13 +58,15 @@ namespace apps deallocCudaDeviceData(elldat); \ deallocCudaDeviceData(psidat); +template < size_t m_block_size, size_t g_block_size, size_t z_block_size > +__launch_bounds__(m_block_size*g_block_size*z_block_size) __global__ void ltimes(Real_ptr phidat, Real_ptr elldat, Real_ptr psidat, Index_type num_d, Index_type num_m, Index_type num_g, Index_type num_z) { - Index_type m = blockIdx.x * blockDim.x + threadIdx.x; - Index_type g = blockIdx.y * blockDim.y + threadIdx.y; - Index_type z = blockIdx.z * blockDim.z + threadIdx.z; + Index_type m = blockIdx.x * m_block_size + threadIdx.x; + Index_type g = blockIdx.y * g_block_size + threadIdx.y; + Index_type z = blockIdx.z * z_block_size + threadIdx.z; if (m < num_m && g < num_g && z < num_z) { for (Index_type d = 0; d < num_d; ++d ) { @@ -70,13 +75,14 @@ __global__ void ltimes(Real_ptr phidat, Real_ptr elldat, Real_ptr psidat, } } -template< typename Lambda > +template < size_t m_block_size, size_t g_block_size, size_t z_block_size, typename Lambda > +__launch_bounds__(m_block_size*g_block_size*z_block_size) __global__ void ltimes_lam(Index_type num_m, Index_type num_g, Index_type num_z, Lambda body) { - Index_type m = blockIdx.x * blockDim.x + threadIdx.x; - Index_type g = blockIdx.y * blockDim.y + threadIdx.y; - Index_type z = blockIdx.z * blockDim.z + threadIdx.z; + Index_type m = blockIdx.x * m_block_size + threadIdx.x; + Index_type g = blockIdx.y * g_block_size + threadIdx.y; + Index_type z = blockIdx.z * z_block_size + threadIdx.z; if (m < num_m && g < num_g && z < num_z) { body(z, g, m); @@ -101,7 +107,8 @@ void LTIMES::runCudaVariantImpl(VariantID vid) LTIMES_THREADS_PER_BLOCK_CUDA; LTIMES_NBLOCKS_CUDA; - ltimes<<>>(phidat, elldat, psidat, + ltimes + <<>>(phidat, elldat, psidat, num_d, num_m, num_g, num_z); cudaErrchk( cudaGetLastError() ); @@ -121,7 +128,8 @@ void 
LTIMES::runCudaVariantImpl(VariantID vid) LTIMES_THREADS_PER_BLOCK_CUDA; LTIMES_NBLOCKS_CUDA; - ltimes_lam<<>>(num_m, num_g, num_z, + ltimes_lam + <<>>(num_m, num_g, num_z, [=] __device__ (Index_type z, Index_type g, Index_type m) { for (Index_type d = 0; d < num_d; ++d ) { LTIMES_BODY; diff --git a/src/apps/LTIMES-Hip.cpp b/src/apps/LTIMES-Hip.cpp index be1fe60e1..fe2e184fd 100644 --- a/src/apps/LTIMES-Hip.cpp +++ b/src/apps/LTIMES-Hip.cpp @@ -34,8 +34,11 @@ namespace apps #define z_block_sz (2) #define g_block_sz (block_size / m_block_sz / z_block_sz) +#define LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \ + m_block_sz, g_block_sz, z_block_sz + #define LTIMES_THREADS_PER_BLOCK_HIP \ - dim3 nthreads_per_block(m_block_sz, g_block_sz, z_block_sz); + dim3 nthreads_per_block(LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP); #define LTIMES_NBLOCKS_HIP \ dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(num_m, m_block_sz)), \ @@ -54,13 +57,15 @@ namespace apps deallocHipDeviceData(elldat); \ deallocHipDeviceData(psidat); +template < size_t m_block_size, size_t g_block_size, size_t z_block_size > +__launch_bounds__(m_block_size*g_block_size*z_block_size) __global__ void ltimes(Real_ptr phidat, Real_ptr elldat, Real_ptr psidat, Index_type num_d, Index_type num_m, Index_type num_g, Index_type num_z) { - Index_type m = blockIdx.x * blockDim.x + threadIdx.x; - Index_type g = blockIdx.y * blockDim.y + threadIdx.y; - Index_type z = blockIdx.z * blockDim.z + threadIdx.z; + Index_type m = blockIdx.x * m_block_size + threadIdx.x; + Index_type g = blockIdx.y * g_block_size + threadIdx.y; + Index_type z = blockIdx.z * z_block_size + threadIdx.z; if (m < num_m && g < num_g && z < num_z) { for (Index_type d = 0; d < num_d; ++d ) { @@ -69,13 +74,14 @@ __global__ void ltimes(Real_ptr phidat, Real_ptr elldat, Real_ptr psidat, } } -template< typename Lambda > +template < size_t m_block_size, size_t g_block_size, size_t z_block_size, typename Lambda > +__launch_bounds__(m_block_size*g_block_size*z_block_size) __global__ void ltimes_lam(Index_type num_m, Index_type num_g, Index_type num_z, Lambda body) { - Index_type m = blockIdx.x * blockDim.x + threadIdx.x; - Index_type g = blockIdx.y * blockDim.y + threadIdx.y; - Index_type z = blockIdx.z * blockDim.z + threadIdx.z; + Index_type m = blockIdx.x * m_block_size + threadIdx.x; + Index_type g = blockIdx.y * g_block_size + threadIdx.y; + Index_type z = blockIdx.z * z_block_size + threadIdx.z; if (m < num_m && g < num_g && z < num_z) { body(z, g, m); @@ -100,7 +106,7 @@ void LTIMES::runHipVariantImpl(VariantID vid) LTIMES_THREADS_PER_BLOCK_HIP; LTIMES_NBLOCKS_HIP; - hipLaunchKernelGGL((ltimes), + hipLaunchKernelGGL((ltimes), dim3(nblocks), dim3(nthreads_per_block), 0, 0, phidat, elldat, psidat, num_d, @@ -129,7 +135,7 @@ void LTIMES::runHipVariantImpl(VariantID vid) } }; - hipLaunchKernelGGL((ltimes_lam), + hipLaunchKernelGGL((ltimes_lam), dim3(nblocks), dim3(nthreads_per_block), 0, 0, num_m, num_g, num_z, ltimes_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/apps/LTIMES_NOVIEW-Cuda.cpp b/src/apps/LTIMES_NOVIEW-Cuda.cpp index 2760e10cf..0253c58a8 100644 --- a/src/apps/LTIMES_NOVIEW-Cuda.cpp +++ b/src/apps/LTIMES_NOVIEW-Cuda.cpp @@ -34,8 +34,11 @@ namespace apps #define z_block_sz (2) #define g_block_sz (block_size / m_block_sz / z_block_sz) +#define LTIMES_NOVIEW_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \ + m_block_sz, g_block_sz, z_block_sz + #define LTIMES_NOVIEW_THREADS_PER_BLOCK_CUDA \ - dim3 nthreads_per_block(m_block_sz, g_block_sz, z_block_sz); + dim3 
nthreads_per_block(LTIMES_NOVIEW_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA); #define LTIMES_NOVIEW_NBLOCKS_CUDA \ dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(num_m, m_block_sz)), \ @@ -55,13 +58,15 @@ namespace apps deallocCudaDeviceData(elldat); \ deallocCudaDeviceData(psidat); +template < size_t m_block_size, size_t g_block_size, size_t z_block_size > +__launch_bounds__(m_block_size*g_block_size*z_block_size) __global__ void ltimes_noview(Real_ptr phidat, Real_ptr elldat, Real_ptr psidat, Index_type num_d, Index_type num_m, Index_type num_g, Index_type num_z) { - Index_type m = blockIdx.x * blockDim.x + threadIdx.x; - Index_type g = blockIdx.y * blockDim.y + threadIdx.y; - Index_type z = blockIdx.z * blockDim.z + threadIdx.z; + Index_type m = blockIdx.x * m_block_size + threadIdx.x; + Index_type g = blockIdx.y * g_block_size + threadIdx.y; + Index_type z = blockIdx.z * z_block_size + threadIdx.z; if (m < num_m && g < num_g && z < num_z) { for (Index_type d = 0; d < num_d; ++d ) { @@ -70,13 +75,14 @@ __global__ void ltimes_noview(Real_ptr phidat, Real_ptr elldat, Real_ptr psidat, } } -template< typename Lambda > +template < size_t m_block_size, size_t g_block_size, size_t z_block_size, typename Lambda > +__launch_bounds__(m_block_size*g_block_size*z_block_size) __global__ void ltimes_noview_lam(Index_type num_m, Index_type num_g, Index_type num_z, Lambda body) { - Index_type m = blockIdx.x * blockDim.x + threadIdx.x; - Index_type g = blockIdx.y * blockDim.y + threadIdx.y; - Index_type z = blockIdx.z * blockDim.z + threadIdx.z; + Index_type m = blockIdx.x * m_block_size + threadIdx.x; + Index_type g = blockIdx.y * g_block_size + threadIdx.y; + Index_type z = blockIdx.z * z_block_size + threadIdx.z; if (m < num_m && g < num_g && z < num_z) { body(z, g, m); @@ -101,7 +107,8 @@ void LTIMES_NOVIEW::runCudaVariantImpl(VariantID vid) LTIMES_NOVIEW_THREADS_PER_BLOCK_CUDA; LTIMES_NOVIEW_NBLOCKS_CUDA; - ltimes_noview<<>>(phidat, elldat, psidat, + ltimes_noview + <<>>(phidat, elldat, psidat, num_d, num_m, num_g, num_z); cudaErrchk( cudaGetLastError() ); @@ -121,7 +128,8 @@ void LTIMES_NOVIEW::runCudaVariantImpl(VariantID vid) LTIMES_NOVIEW_THREADS_PER_BLOCK_CUDA; LTIMES_NOVIEW_NBLOCKS_CUDA; - ltimes_noview_lam<<>>(num_m, num_g, num_z, + ltimes_noview_lam + <<>>(num_m, num_g, num_z, [=] __device__ (Index_type z, Index_type g, Index_type m) { for (Index_type d = 0; d < num_d; ++d ) { LTIMES_NOVIEW_BODY; diff --git a/src/apps/LTIMES_NOVIEW-Hip.cpp b/src/apps/LTIMES_NOVIEW-Hip.cpp index f934af3b3..7061ff224 100644 --- a/src/apps/LTIMES_NOVIEW-Hip.cpp +++ b/src/apps/LTIMES_NOVIEW-Hip.cpp @@ -34,8 +34,11 @@ namespace apps #define z_block_sz (2) #define g_block_sz (block_size / m_block_sz / z_block_sz) +#define LTIMES_NOVIEW_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \ + m_block_sz, g_block_sz, z_block_sz + #define LTIMES_NOVIEW_THREADS_PER_BLOCK_HIP \ - dim3 nthreads_per_block(m_block_sz, g_block_sz, z_block_sz); + dim3 nthreads_per_block(LTIMES_NOVIEW_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP); #define LTIMES_NOVIEW_NBLOCKS_HIP \ dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(num_m, m_block_sz)), \ @@ -54,13 +57,15 @@ namespace apps deallocHipDeviceData(elldat); \ deallocHipDeviceData(psidat); +template < size_t m_block_size, size_t g_block_size, size_t z_block_size > +__launch_bounds__(m_block_size*g_block_size*z_block_size) __global__ void ltimes_noview(Real_ptr phidat, Real_ptr elldat, Real_ptr psidat, Index_type num_d, Index_type num_m, Index_type num_g, Index_type num_z) { - Index_type m = blockIdx.x * 
blockDim.x + threadIdx.x; - Index_type g = blockIdx.y * blockDim.y + threadIdx.y; - Index_type z = blockIdx.z * blockDim.z + threadIdx.z; + Index_type m = blockIdx.x * m_block_size + threadIdx.x; + Index_type g = blockIdx.y * g_block_size + threadIdx.y; + Index_type z = blockIdx.z * z_block_size + threadIdx.z; if (m < num_m && g < num_g && z < num_z) { for (Index_type d = 0; d < num_d; ++d ) { @@ -69,13 +74,14 @@ __global__ void ltimes_noview(Real_ptr phidat, Real_ptr elldat, Real_ptr psidat, } } -template< typename Lambda > +template < size_t m_block_size, size_t g_block_size, size_t z_block_size, typename Lambda > +__launch_bounds__(m_block_size*g_block_size*z_block_size) __global__ void ltimes_noview_lam(Index_type num_m, Index_type num_g, Index_type num_z, Lambda body) { - Index_type m = blockIdx.x * blockDim.x + threadIdx.x; - Index_type g = blockIdx.y * blockDim.y + threadIdx.y; - Index_type z = blockIdx.z * blockDim.z + threadIdx.z; + Index_type m = blockIdx.x * m_block_size + threadIdx.x; + Index_type g = blockIdx.y * g_block_size + threadIdx.y; + Index_type z = blockIdx.z * z_block_size + threadIdx.z; if (m < num_m && g < num_g && z < num_z) { body(z, g, m); @@ -100,7 +106,7 @@ void LTIMES_NOVIEW::runHipVariantImpl(VariantID vid) LTIMES_NOVIEW_THREADS_PER_BLOCK_HIP; LTIMES_NOVIEW_NBLOCKS_HIP; - hipLaunchKernelGGL((ltimes_noview), + hipLaunchKernelGGL((ltimes_noview), dim3(nblocks), dim3(nthreads_per_block), 0, 0, phidat, elldat, psidat, num_d, @@ -129,7 +135,7 @@ void LTIMES_NOVIEW::runHipVariantImpl(VariantID vid) } }; - hipLaunchKernelGGL((ltimes_noview_lam), + hipLaunchKernelGGL((ltimes_noview_lam), dim3(nblocks), dim3(nthreads_per_block), 0, 0, num_m, num_g, num_z, ltimes_noview_lambda); diff --git a/src/apps/PRESSURE-Cuda.cpp b/src/apps/PRESSURE-Cuda.cpp index c22f07f00..152c5cd40 100644 --- a/src/apps/PRESSURE-Cuda.cpp +++ b/src/apps/PRESSURE-Cuda.cpp @@ -36,23 +36,27 @@ namespace apps deallocCudaDeviceData(e_old); \ deallocCudaDeviceData(vnewc); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void pressurecalc1(Real_ptr bvc, Real_ptr compression, const Real_type cls, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { PRESSURE_BODY1; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void pressurecalc2(Real_ptr p_new, Real_ptr bvc, Real_ptr e_old, Real_ptr vnewc, const Real_type p_cut, const Real_type eosvmax, const Real_type pmin, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { PRESSURE_BODY2; } @@ -77,12 +81,12 @@ void PRESSURE::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - pressurecalc1<<>>( bvc, compression, + pressurecalc1<<>>( bvc, compression, cls, iend ); cudaErrchk( cudaGetLastError() ); - pressurecalc2<<>>( p_new, bvc, e_old, + pressurecalc2<<>>( p_new, bvc, e_old, vnewc, p_cut, eosvmax, pmin, iend ); diff --git a/src/apps/PRESSURE-Hip.cpp b/src/apps/PRESSURE-Hip.cpp index cdf62422f..2f5b498a7 100644 --- a/src/apps/PRESSURE-Hip.cpp +++ b/src/apps/PRESSURE-Hip.cpp @@ -36,23 +36,27 @@ namespace apps deallocHipDeviceData(e_old); \ deallocHipDeviceData(vnewc); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void pressurecalc1(Real_ptr bvc, Real_ptr compression, const Real_type cls, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + 
threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { PRESSURE_BODY1; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void pressurecalc2(Real_ptr p_new, Real_ptr bvc, Real_ptr e_old, Real_ptr vnewc, const Real_type p_cut, const Real_type eosvmax, const Real_type pmin, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { PRESSURE_BODY2; } @@ -77,12 +81,12 @@ void PRESSURE::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((pressurecalc1), dim3(grid_size), dim3(block_size), 0, 0, bvc, compression, + hipLaunchKernelGGL((pressurecalc1), dim3(grid_size), dim3(block_size), 0, 0, bvc, compression, cls, iend ); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((pressurecalc2), dim3(grid_size), dim3(block_size), 0, 0, p_new, bvc, e_old, + hipLaunchKernelGGL((pressurecalc2), dim3(grid_size), dim3(block_size), 0, 0, p_new, bvc, e_old, vnewc, p_cut, eosvmax, pmin, iend ); diff --git a/src/apps/VOL3D-Cuda.cpp b/src/apps/VOL3D-Cuda.cpp index b6e83ff53..d15725087 100644 --- a/src/apps/VOL3D-Cuda.cpp +++ b/src/apps/VOL3D-Cuda.cpp @@ -36,6 +36,8 @@ namespace apps deallocCudaDeviceData(z); \ deallocCudaDeviceData(vol); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void vol3d(Real_ptr vol, const Real_ptr x0, const Real_ptr x1, const Real_ptr x2, const Real_ptr x3, @@ -52,7 +54,7 @@ __global__ void vol3d(Real_ptr vol, const Real_type vnormq, Index_type ibegin, Index_type iend) { - Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; + Index_type ii = blockIdx.x * block_size + threadIdx.x; Index_type i = ii + ibegin; if (i < iend) { VOL3D_BODY; @@ -82,7 +84,7 @@ void VOL3D::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - vol3d<<>>(vol, + vol3d<<>>(vol, x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7, z0, z1, z2, z3, z4, z5, z6, z7, diff --git a/src/apps/VOL3D-Hip.cpp b/src/apps/VOL3D-Hip.cpp index 47c9495a7..d46e1aaef 100644 --- a/src/apps/VOL3D-Hip.cpp +++ b/src/apps/VOL3D-Hip.cpp @@ -36,6 +36,8 @@ namespace apps deallocHipDeviceData(z); \ deallocHipDeviceData(vol); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void vol3d(Real_ptr vol, const Real_ptr x0, const Real_ptr x1, const Real_ptr x2, const Real_ptr x3, @@ -52,7 +54,7 @@ __global__ void vol3d(Real_ptr vol, const Real_type vnormq, Index_type ibegin, Index_type iend) { - Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; + Index_type ii = blockIdx.x * block_size + threadIdx.x; Index_type i = ii + ibegin; if (i < iend) { VOL3D_BODY; @@ -82,7 +84,7 @@ void VOL3D::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((vol3d), dim3(grid_size), dim3(block_size), 0, 0, vol, + hipLaunchKernelGGL((vol3d), dim3(grid_size), dim3(block_size), 0, 0, vol, x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7, z0, z1, z2, z3, z4, z5, z6, z7, From 35c5b30617c8aa0d87d96ccd5cea093afe2349d2 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 22 Oct 2021 11:34:01 -0700 Subject: [PATCH 142/392] Support block_size in basic kernels --- src/basic/DAXPY-Cuda.cpp | 19 ++++++++------ src/basic/DAXPY-Hip.cpp | 19 ++++++++------ src/basic/DAXPY.cpp | 10 ++++++++ src/basic/DAXPY.hpp | 9 +++++++ src/basic/IF_QUAD-Cuda.cpp | 19 ++++++++------ 
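// The change applied to PRESSURE, VOL3D, and the basic kernels in this commit
// follows one pattern: the thread-block size becomes a compile-time template
// parameter, __launch_bounds__ advertises it to the compiler, and the index
// arithmetic uses the constant instead of reading blockDim.x at run time.
// A minimal stand-alone sketch of that pattern (not a file in this patch;
// the kernel name and launch math below are illustrative assumptions):

template < size_t block_size >
__launch_bounds__(block_size)
__global__ void axpy_like(double* y, const double* x, double a, long n)
{
  long i = blockIdx.x * block_size + threadIdx.x;  // block_size is a compile-time constant
  if (i < n) {
    y[i] += a * x[i];
  }
}

// Launch site: the same constant feeds the template argument and the launch
// configuration, so the two cannot drift apart. The grid math is the same
// ceiling division RAJA_DIVIDE_CEILING_INT performs for positive sizes:
//
//   constexpr size_t block_size = 256;
//   const size_t grid_size = (n + block_size - 1) / block_size;
//   axpy_like<block_size><<<grid_size, block_size>>>(y, x, a, n);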
src/basic/IF_QUAD-Hip.cpp | 19 ++++++++------ src/basic/IF_QUAD.cpp | 12 ++++++++- src/basic/IF_QUAD.hpp | 9 +++++++ src/basic/INIT3-Cuda.cpp | 19 ++++++++------ src/basic/INIT3-Hip.cpp | 19 ++++++++------ src/basic/INIT3.cpp | 10 ++++++++ src/basic/INIT3.hpp | 9 +++++++ src/basic/INIT_VIEW1D-Cuda.cpp | 19 ++++++++------ src/basic/INIT_VIEW1D-Hip.cpp | 19 ++++++++------ src/basic/INIT_VIEW1D.cpp | 10 ++++++++ src/basic/INIT_VIEW1D.hpp | 9 +++++++ src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp | 19 ++++++++------ src/basic/INIT_VIEW1D_OFFSET-Hip.cpp | 19 ++++++++------ src/basic/INIT_VIEW1D_OFFSET.cpp | 10 ++++++++ src/basic/INIT_VIEW1D_OFFSET.hpp | 9 +++++++ src/basic/MULADDSUB-Cuda.cpp | 19 ++++++++------ src/basic/MULADDSUB-Hip.cpp | 19 ++++++++------ src/basic/MULADDSUB.cpp | 10 ++++++++ src/basic/MULADDSUB.hpp | 9 +++++++ src/basic/NESTED_INIT-Cuda.cpp | 36 ++++++++++++++++++--------- src/basic/NESTED_INIT-Hip.cpp | 34 +++++++++++++++++-------- src/basic/NESTED_INIT.cpp | 10 ++++++++ src/basic/NESTED_INIT.hpp | 9 +++++++ src/basic/PI_ATOMIC-Cuda.cpp | 19 ++++++++------ src/basic/PI_ATOMIC-Hip.cpp | 19 ++++++++------ src/basic/PI_ATOMIC.cpp | 10 ++++++++ src/basic/PI_ATOMIC.hpp | 9 +++++++ src/basic/PI_REDUCE-Cuda.cpp | 27 ++++++++++++-------- src/basic/PI_REDUCE-Hip.cpp | 25 +++++++++++-------- src/basic/PI_REDUCE.cpp | 10 ++++++++ src/basic/PI_REDUCE.hpp | 9 +++++++ src/basic/REDUCE3_INT-Cuda.cpp | 19 ++++++++------ src/basic/REDUCE3_INT-Hip.cpp | 19 ++++++++------ src/basic/REDUCE3_INT.cpp | 10 ++++++++ src/basic/REDUCE3_INT.hpp | 9 +++++++ src/basic/TRAP_INT-Cuda.cpp | 19 ++++++++------ src/basic/TRAP_INT-Hip.cpp | 19 ++++++++------ src/basic/TRAP_INT.cpp | 10 ++++++++ src/basic/TRAP_INT.hpp | 9 +++++++ 44 files changed, 504 insertions(+), 171 deletions(-) diff --git a/src/basic/DAXPY-Cuda.cpp b/src/basic/DAXPY-Cuda.cpp index 7e4f52fed..26a28fb55 100644 --- a/src/basic/DAXPY-Cuda.cpp +++ b/src/basic/DAXPY-Cuda.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define DAXPY_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(x, m_x, iend); \ allocAndInitCudaDeviceData(y, m_y, iend); @@ -46,7 +40,9 @@ __global__ void daxpy(Real_ptr y, Real_ptr x, } } -void DAXPY::runCudaVariant(VariantID vid) + +template < size_t block_size > +void DAXPY::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -112,6 +108,15 @@ void DAXPY::runCudaVariant(VariantID vid) } } +void DAXPY::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n DAXPY : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/DAXPY-Hip.cpp b/src/basic/DAXPY-Hip.cpp index 1ed22ef76..184820bb3 100644 --- a/src/basic/DAXPY-Hip.cpp +++ b/src/basic/DAXPY-Hip.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define DAXPY_DATA_SETUP_HIP \ allocAndInitHipDeviceData(x, m_x, iend); \ allocAndInitHipDeviceData(y, m_y, iend); @@ -47,7 +41,9 @@ __global__ void daxpy(Real_ptr y, Real_ptr x, } -void DAXPY::runHipVariant(VariantID vid) + +template < size_t block_size > +void DAXPY::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); 
const Index_type ibegin = 0; @@ -115,6 +111,15 @@ void DAXPY::runHipVariant(VariantID vid) } } +void DAXPY::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n DAXPY : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/DAXPY.cpp b/src/basic/DAXPY.cpp index 16782df2a..06ad393eb 100644 --- a/src/basic/DAXPY.cpp +++ b/src/basic/DAXPY.cpp @@ -21,6 +21,10 @@ namespace basic DAXPY::DAXPY(const RunParams& params) : KernelBase(rajaperf::Basic_DAXPY, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + setDefaultProblemSize(1000000); setDefaultReps(500); @@ -76,5 +80,11 @@ void DAXPY::tearDown(VariantID vid) deallocData(m_y); } +bool DAXPY::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/DAXPY.hpp b/src/basic/DAXPY.hpp index 9f0688d8a..1e6861fbe 100644 --- a/src/basic/DAXPY.hpp +++ b/src/basic/DAXPY.hpp @@ -53,7 +53,16 @@ class DAXPY : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Real_ptr m_x; Real_ptr m_y; Real_type m_a; diff --git a/src/basic/IF_QUAD-Cuda.cpp b/src/basic/IF_QUAD-Cuda.cpp index 8c7f9fa11..ee6a70cf6 100644 --- a/src/basic/IF_QUAD-Cuda.cpp +++ b/src/basic/IF_QUAD-Cuda.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define IF_QUAD_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(a, m_a, iend); \ allocAndInitCudaDeviceData(b, m_b, iend); \ @@ -54,7 +48,9 @@ __global__ void ifquad(Real_ptr x1, Real_ptr x2, } -void IF_QUAD::runCudaVariant(VariantID vid) + +template < size_t block_size > +void IF_QUAD::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -119,6 +115,15 @@ void IF_QUAD::runCudaVariant(VariantID vid) } } +void IF_QUAD::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n IF_QUAD : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/IF_QUAD-Hip.cpp b/src/basic/IF_QUAD-Hip.cpp index 49557e3e8..c976fd1db 100644 --- a/src/basic/IF_QUAD-Hip.cpp +++ b/src/basic/IF_QUAD-Hip.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define IF_QUAD_DATA_SETUP_HIP \ allocAndInitHipDeviceData(a, m_a, iend); \ allocAndInitHipDeviceData(b, m_b, iend); \ @@ -54,7 +48,9 @@ __global__ void ifquad(Real_ptr x1, Real_ptr x2, } -void IF_QUAD::runHipVariant(VariantID vid) + 
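// The new runCudaVariant / runHipVariant bodies above translate a run-time
// block size into a call to the matching runCudaVariantImpl<block_size>
// instantiation. The gpu_block_size helpers themselves are not part of this
// patch; the sketch below is an assumption about their shape, consistent with
// the call sites (list_type, invoke_or, and RunCudaBlockSize are stand-in
// names, and VariantID and the kernel classes come from the suite's headers).

#include <cstddef>
#include <utility>

namespace gpu_block_size_sketch {

template < size_t... Sizes >
struct list_type {};

// Base case: no candidate block size matched.
template < typename F >
bool invoke_or(F&&, list_type<>) { return false; }

// Try the first candidate, then recurse over the rest of the list.
template < typename F, size_t First, size_t... Rest >
bool invoke_or(F&& f, list_type<First, Rest...>)
{
  if (f.template operator()<First>()) { return true; }
  return invoke_or(std::forward<F>(f), list_type<Rest...>{});
}

// Runs KERNEL::runCudaVariantImpl<block_size>(vid) for the candidate that
// equals the block size chosen at run time.
template < typename KERNEL >
struct RunCudaBlockSize
{
  KERNEL& kernel;
  VariantID vid;

  RunCudaBlockSize(KERNEL& k, VariantID v) : kernel(k), vid(v) {}

  template < size_t block_size >
  bool operator()() const
  {
    if (block_size == kernel.getActualGPUBlockSize()) {
      kernel.template runCudaVariantImpl<block_size>(vid);
      return true;
    }
    return false;
  }
};

} // namespace gpu_block_size_sketch

// With C++17 class template argument deduction the call sites read like the
// diffs above: invoke_or(RunCudaBlockSize(*this, vid), gpu_block_sizes_type()).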
+template < size_t block_size > +void IF_QUAD::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -122,6 +118,15 @@ void IF_QUAD::runHipVariant(VariantID vid) } } +void IF_QUAD::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n IF_QUAD : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/IF_QUAD.cpp b/src/basic/IF_QUAD.cpp index 2baff8244..d8a016375 100644 --- a/src/basic/IF_QUAD.cpp +++ b/src/basic/IF_QUAD.cpp @@ -21,12 +21,16 @@ namespace basic IF_QUAD::IF_QUAD(const RunParams& params) : KernelBase(rajaperf::Basic_IF_QUAD, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + setDefaultProblemSize(1000000); setDefaultReps(180); setActualProblemSize( getTargetProblemSize() ); - setItsPerRep( getActualProblemSize() ); + setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); setBytesPerRep( (2*sizeof(Real_type) + 3*sizeof(Real_type)) * getActualProblemSize() ); setFLOPsPerRep(11 * getActualProblemSize()); // 1 sqrt @@ -86,5 +90,11 @@ void IF_QUAD::tearDown(VariantID vid) deallocData(m_x2); } +bool IF_QUAD::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/IF_QUAD.hpp b/src/basic/IF_QUAD.hpp index dad204ce3..4bcb84b9c 100644 --- a/src/basic/IF_QUAD.hpp +++ b/src/basic/IF_QUAD.hpp @@ -70,7 +70,16 @@ class IF_QUAD : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Real_ptr m_a; Real_ptr m_b; Real_ptr m_c; diff --git a/src/basic/INIT3-Cuda.cpp b/src/basic/INIT3-Cuda.cpp index cee3e46af..b555b5387 100644 --- a/src/basic/INIT3-Cuda.cpp +++ b/src/basic/INIT3-Cuda.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define INIT3_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(out1, m_out1, iend); \ allocAndInitCudaDeviceData(out2, m_out2, iend); \ @@ -55,7 +49,9 @@ __global__ void init3(Real_ptr out1, Real_ptr out2, Real_ptr out3, } -void INIT3::runCudaVariant(VariantID vid) + +template < size_t block_size > +void INIT3::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -121,6 +117,15 @@ void INIT3::runCudaVariant(VariantID vid) } } +void INIT3::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n INIT3 : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INIT3-Hip.cpp b/src/basic/INIT3-Hip.cpp index 
51e0f2b54..6f1d07696 100644 --- a/src/basic/INIT3-Hip.cpp +++ b/src/basic/INIT3-Hip.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define INIT3_DATA_SETUP_HIP \ allocAndInitHipDeviceData(out1, m_out1, iend); \ allocAndInitHipDeviceData(out2, m_out2, iend); \ @@ -55,7 +49,9 @@ __global__ void init3(Real_ptr out1, Real_ptr out2, Real_ptr out3, } -void INIT3::runHipVariant(VariantID vid) + +template < size_t block_size > +void INIT3::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -123,6 +119,15 @@ void INIT3::runHipVariant(VariantID vid) } } +void INIT3::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n INIT3 : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INIT3.cpp b/src/basic/INIT3.cpp index cb3c14132..94be0f796 100644 --- a/src/basic/INIT3.cpp +++ b/src/basic/INIT3.cpp @@ -21,6 +21,10 @@ namespace basic INIT3::INIT3(const RunParams& params) : KernelBase(rajaperf::Basic_INIT3, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + setDefaultProblemSize(1000000); setDefaultReps(500); @@ -83,5 +87,11 @@ void INIT3::tearDown(VariantID vid) deallocData(m_in2); } +bool INIT3::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INIT3.hpp b/src/basic/INIT3.hpp index 9d9de78da..0db0bbb17 100644 --- a/src/basic/INIT3.hpp +++ b/src/basic/INIT3.hpp @@ -56,7 +56,16 @@ class INIT3 : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Real_ptr m_out1; Real_ptr m_out2; Real_ptr m_out3; diff --git a/src/basic/INIT_VIEW1D-Cuda.cpp b/src/basic/INIT_VIEW1D-Cuda.cpp index af70a9980..9d6fb30b2 100644 --- a/src/basic/INIT_VIEW1D-Cuda.cpp +++ b/src/basic/INIT_VIEW1D-Cuda.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define INIT_VIEW1D_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(a, m_a, getActualProblemSize()); @@ -45,7 +39,9 @@ __global__ void initview1d(Real_ptr a, } -void INIT_VIEW1D::runCudaVariant(VariantID vid) + +template < size_t block_size > +void INIT_VIEW1D::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -112,6 +108,15 @@ void INIT_VIEW1D::runCudaVariant(VariantID vid) } } +void INIT_VIEW1D::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n INIT_VIEW1D : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " 
<< vid << std::endl; + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INIT_VIEW1D-Hip.cpp b/src/basic/INIT_VIEW1D-Hip.cpp index 1136b0d93..a497a82ed 100644 --- a/src/basic/INIT_VIEW1D-Hip.cpp +++ b/src/basic/INIT_VIEW1D-Hip.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define INIT_VIEW1D_DATA_SETUP_HIP \ allocAndInitHipDeviceData(a, m_a, iend); @@ -45,7 +39,9 @@ __global__ void initview1d(Real_ptr a, } -void INIT_VIEW1D::runHipVariant(VariantID vid) + +template < size_t block_size > +void INIT_VIEW1D::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -115,6 +111,15 @@ void INIT_VIEW1D::runHipVariant(VariantID vid) } } +void INIT_VIEW1D::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n INIT_VIEW1D : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INIT_VIEW1D.cpp b/src/basic/INIT_VIEW1D.cpp index bad47eae8..ab59d0c8b 100644 --- a/src/basic/INIT_VIEW1D.cpp +++ b/src/basic/INIT_VIEW1D.cpp @@ -21,6 +21,10 @@ namespace basic INIT_VIEW1D::INIT_VIEW1D(const RunParams& params) : KernelBase(rajaperf::Basic_INIT_VIEW1D, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + setDefaultProblemSize(1000000); setDefaultReps(2500); @@ -75,5 +79,11 @@ void INIT_VIEW1D::tearDown(VariantID vid) deallocData(m_a); } +bool INIT_VIEW1D::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INIT_VIEW1D.hpp b/src/basic/INIT_VIEW1D.hpp index b215439dc..23f6ede7b 100644 --- a/src/basic/INIT_VIEW1D.hpp +++ b/src/basic/INIT_VIEW1D.hpp @@ -67,7 +67,16 @@ class INIT_VIEW1D : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Real_ptr m_a; Real_type m_val; }; diff --git a/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp b/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp index 95c9e175e..a7cdc36b5 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define INIT_VIEW1D_OFFSET_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(a, m_a, getActualProblemSize()); @@ -46,7 +40,9 @@ __global__ void initview1d_offset(Real_ptr a, } -void INIT_VIEW1D_OFFSET::runCudaVariant(VariantID vid) + +template < size_t block_size > +void INIT_VIEW1D_OFFSET::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; @@ -115,6 +111,15 @@ void INIT_VIEW1D_OFFSET::runCudaVariant(VariantID vid) } } +void 
INIT_VIEW1D_OFFSET::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n INIT_VIEW1D_OFFSET : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp index b2e24a703..ed58b8116 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define INIT_VIEW1D_OFFSET_DATA_SETUP_HIP \ allocAndInitHipDeviceData(a, m_a, getActualProblemSize()); @@ -46,7 +40,9 @@ __global__ void initview1d_offset(Real_ptr a, } -void INIT_VIEW1D_OFFSET::runHipVariant(VariantID vid) + +template < size_t block_size > +void INIT_VIEW1D_OFFSET::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; @@ -116,6 +112,15 @@ void INIT_VIEW1D_OFFSET::runHipVariant(VariantID vid) } } +void INIT_VIEW1D_OFFSET::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n INIT_VIEW1D_OFFSET : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INIT_VIEW1D_OFFSET.cpp b/src/basic/INIT_VIEW1D_OFFSET.cpp index 06519f61b..fb53cbcdb 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET.cpp @@ -21,6 +21,10 @@ namespace basic INIT_VIEW1D_OFFSET::INIT_VIEW1D_OFFSET(const RunParams& params) : KernelBase(rajaperf::Basic_INIT_VIEW1D_OFFSET, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + setDefaultProblemSize(1000000); setDefaultReps(2500); @@ -75,5 +79,11 @@ void INIT_VIEW1D_OFFSET::tearDown(VariantID vid) deallocData(m_a); } +bool INIT_VIEW1D_OFFSET::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INIT_VIEW1D_OFFSET.hpp b/src/basic/INIT_VIEW1D_OFFSET.hpp index 333139909..b1b8691fc 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.hpp +++ b/src/basic/INIT_VIEW1D_OFFSET.hpp @@ -66,7 +66,16 @@ class INIT_VIEW1D_OFFSET : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Real_ptr m_a; Real_type m_val; }; diff --git a/src/basic/MULADDSUB-Cuda.cpp b/src/basic/MULADDSUB-Cuda.cpp index 106d11865..dd041c87f 100644 --- a/src/basic/MULADDSUB-Cuda.cpp +++ b/src/basic/MULADDSUB-Cuda.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define MULADDSUB_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(out1, m_out1, iend); \ allocAndInitCudaDeviceData(out2, m_out2, iend); \ @@ -55,7 +49,9 @@ __global__ void muladdsub(Real_ptr out1, Real_ptr out2, Real_ptr out3, } -void MULADDSUB::runCudaVariant(VariantID vid) + +template < size_t block_size > +void MULADDSUB::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -121,6 +117,15 @@ void MULADDSUB::runCudaVariant(VariantID vid) } } +void MULADDSUB::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n MULADDSUB : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/MULADDSUB-Hip.cpp b/src/basic/MULADDSUB-Hip.cpp index 729c6cee3..2409a50b6 100644 --- a/src/basic/MULADDSUB-Hip.cpp +++ b/src/basic/MULADDSUB-Hip.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define MULADDSUB_DATA_SETUP_HIP \ allocAndInitHipDeviceData(out1, m_out1, iend); \ allocAndInitHipDeviceData(out2, m_out2, iend); \ @@ -55,7 +49,9 @@ __global__ void muladdsub(Real_ptr out1, Real_ptr out2, Real_ptr out3, } -void MULADDSUB::runHipVariant(VariantID vid) + +template < size_t block_size > +void MULADDSUB::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -123,6 +119,15 @@ void MULADDSUB::runHipVariant(VariantID vid) } } +void MULADDSUB::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n MULADDSUB : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/MULADDSUB.cpp b/src/basic/MULADDSUB.cpp index 
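// isGPUBlockSizeSupported() and the constructor changes above share one idea:
// each kernel owns a compile-time list of supported block sizes, the default
// is the first entry, and a nonzero --gpu_block_size run parameter overrides
// it. The Equals functor is not shown in this patch; a sketch of what the
// call sites imply (its name and shape are assumptions):

#include <cstddef>

struct Equals
{
  size_t actual;

  explicit Equals(size_t a) : actual(a) {}

  // Invoked once per candidate in the compile-time list; true ends the search.
  template < size_t block_size >
  bool operator()() const { return block_size == actual; }
};

// The constructor-side selection written out as plain code:
//   default size = first entry of gpu_block_sizes_type
//   actual size  = params.getGPUBlockSize() if nonzero, else the default
inline size_t chooseBlockSize(size_t requested, size_t default_size)
{
  return (requested > 0) ? requested : default_size;
}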
baa201dc1..41f6c42f9 100644 --- a/src/basic/MULADDSUB.cpp +++ b/src/basic/MULADDSUB.cpp @@ -21,6 +21,10 @@ namespace basic MULADDSUB::MULADDSUB(const RunParams& params) : KernelBase(rajaperf::Basic_MULADDSUB, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + setDefaultProblemSize(1000000); setDefaultReps(350); @@ -83,5 +87,11 @@ void MULADDSUB::tearDown(VariantID vid) deallocData(m_in2); } +bool MULADDSUB::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/MULADDSUB.hpp b/src/basic/MULADDSUB.hpp index afb0a5f38..ac9b10ff6 100644 --- a/src/basic/MULADDSUB.hpp +++ b/src/basic/MULADDSUB.hpp @@ -59,7 +59,16 @@ class MULADDSUB : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Real_ptr m_out1; Real_ptr m_out2; Real_ptr m_out3; diff --git a/src/basic/NESTED_INIT-Cuda.cpp b/src/basic/NESTED_INIT-Cuda.cpp index 306a9a67a..66ce2353e 100644 --- a/src/basic/NESTED_INIT-Cuda.cpp +++ b/src/basic/NESTED_INIT-Cuda.cpp @@ -22,14 +22,15 @@ namespace basic { // - // Define thread block size for CUDA execution + // Define thread block shape for CUDA execution // - constexpr size_t i_block_sz = 32; - constexpr size_t j_block_sz = 8; - constexpr size_t k_block_sz = 1; +#define i_block_sz (32) +#define j_block_sz (block_size / i_block_sz) +#define k_block_sz (1) #define NESTED_INIT_THREADS_PER_BLOCK_CUDA \ - dim3 nthreads_per_block(i_block_sz, j_block_sz, k_block_sz); + dim3 nthreads_per_block(i_block_sz, j_block_sz, k_block_sz); \ + static_assert(i_block_sz*j_block_sz*k_block_sz == block_size, "Invalid block_size"); #define NESTED_INIT_NBLOCKS_CUDA \ dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(ni, i_block_sz)), \ @@ -59,18 +60,20 @@ __global__ void nested_init(Real_ptr array, template< typename Lambda > __global__ void nested_init_lam(Index_type ni, Index_type nj, Index_type nk, Lambda body) -{ +{ Index_type i = blockIdx.x * blockDim.x + threadIdx.x; Index_type j = blockIdx.y * blockDim.y + threadIdx.y; Index_type k = blockIdx.z; - + if ( i < ni && j < nj && k < nk ) { body(i, j, k); } } -void NESTED_INIT::runCudaVariant(VariantID vid) + +template < size_t block_size > +void NESTED_INIT::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -85,7 +88,7 @@ void NESTED_INIT::runCudaVariant(VariantID vid) NESTED_INIT_THREADS_PER_BLOCK_CUDA; NESTED_INIT_NBLOCKS_CUDA; - + nested_init<<>>(array, ni, nj, nk); cudaErrchk( cudaGetLastError() ); @@ -124,13 +127,13 @@ void NESTED_INIT::runCudaVariant(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::CudaKernelFixedAsync, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_direct, - RAJA::statement::Tile<0, RAJA::tile_fixed, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_direct, RAJA::statement::For<2, RAJA::cuda_block_z_direct, // k RAJA::statement::For<1, RAJA::cuda_thread_y_direct, // j - 
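// NESTED_INIT now derives its 3D thread-block shape from the single
// block_size template parameter: the x extent is fixed at 32, the y extent is
// block_size / 32, z is 1, and a static_assert rejects any block_size the
// shape cannot represent exactly. A compilable sketch of just that arithmetic
// (the helper function is illustrative; the kernels use the macros directly):

template < size_t block_size >
void nested_block_shape_check()
{
  constexpr size_t i_block_sz = 32;
  constexpr size_t j_block_sz = block_size / i_block_sz;
  constexpr size_t k_block_sz = 1;
  static_assert(i_block_sz * j_block_sz * k_block_sz == block_size,
                "Invalid block_size");

  dim3 nthreads_per_block(i_block_sz, j_block_sz, k_block_sz);
  (void) nthreads_per_block;
}

// For example, block_size = 256 yields a 32 x 8 x 1 block, while
// block_size = 48 fails the static_assert because 48 / 32 truncates to 1.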
RAJA::statement::For<0, RAJA::cuda_thread_x_direct, // i + RAJA::statement::For<0, RAJA::cuda_thread_x_direct, // i RAJA::statement::Lambda<0> > > @@ -161,6 +164,15 @@ void NESTED_INIT::runCudaVariant(VariantID vid) } } +void NESTED_INIT::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n NESTED_INIT : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/NESTED_INIT-Hip.cpp b/src/basic/NESTED_INIT-Hip.cpp index 4038a47a2..c5b27c52a 100644 --- a/src/basic/NESTED_INIT-Hip.cpp +++ b/src/basic/NESTED_INIT-Hip.cpp @@ -22,14 +22,15 @@ namespace basic { // - // Define thread block size for Hip execution + // Define thread block shape for Hip execution // - constexpr size_t i_block_sz = 32; - constexpr size_t j_block_sz = 8; - constexpr size_t k_block_sz = 1; +#define i_block_sz (32) +#define j_block_sz (block_size / i_block_sz) +#define k_block_sz (1) #define NESTED_INIT_THREADS_PER_BLOCK_HIP \ - dim3 nthreads_per_block(i_block_sz, j_block_sz, k_block_sz); + dim3 nthreads_per_block(i_block_sz, j_block_sz, k_block_sz); \ + static_assert(i_block_sz*j_block_sz*k_block_sz == block_size, "Invalid block_size"); #define NESTED_INIT_NBLOCKS_HIP \ dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(ni, i_block_sz)), \ @@ -70,7 +71,9 @@ __global__ void nested_init_lam(Index_type ni, Index_type nj, Index_type nk, } -void NESTED_INIT::runHipVariant(VariantID vid) + +template < size_t block_size > +void NESTED_INIT::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -86,8 +89,8 @@ void NESTED_INIT::runHipVariant(VariantID vid) NESTED_INIT_THREADS_PER_BLOCK_HIP; NESTED_INIT_NBLOCKS_HIP; - hipLaunchKernelGGL((nested_init), - dim3(nblocks), dim3(nthreads_per_block), 0, 0, + hipLaunchKernelGGL((nested_init), + dim3(nblocks), dim3(nthreads_per_block), 0, 0, array, ni, nj, nk); hipErrchk( hipGetLastError() ); @@ -106,12 +109,12 @@ void NESTED_INIT::runHipVariant(VariantID vid) NESTED_INIT_THREADS_PER_BLOCK_HIP; NESTED_INIT_NBLOCKS_HIP; - auto nested_init_lambda = [=] __device__ (Index_type i, Index_type j, + auto nested_init_lambda = [=] __device__ (Index_type i, Index_type j, Index_type k) { NESTED_INIT_BODY; }; - hipLaunchKernelGGL((nested_init_lam< decltype(nested_init_lambda) >), + hipLaunchKernelGGL((nested_init_lam< decltype(nested_init_lambda) >), dim3(nblocks), dim3(nthreads_per_block), 0, 0, ni, nj, nk, nested_init_lambda); hipErrchk( hipGetLastError() ); @@ -142,7 +145,7 @@ void NESTED_INIT::runHipVariant(VariantID vid) > > > - >; + >; startTimer(); @@ -165,6 +168,15 @@ void NESTED_INIT::runHipVariant(VariantID vid) } } +void NESTED_INIT::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n NESTED_INIT : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/NESTED_INIT.cpp b/src/basic/NESTED_INIT.cpp index 77d847691..5dff01b8a 100644 --- a/src/basic/NESTED_INIT.cpp +++ b/src/basic/NESTED_INIT.cpp @@ -24,6 +24,10 @@ namespace basic NESTED_INIT::NESTED_INIT(const RunParams& params) : KernelBase(rajaperf::Basic_NESTED_INIT, params) { + setDefaultGPUBlockSize( 
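// The RAJA_CUDA and RAJA_HIP variants of NESTED_INIT keep the same kernel
// policy structure but feed it the tile sizes derived from block_size. An
// illustration of a policy with that structure, spelled out with template
// arguments (an assumption built from the block-shape macros above, not a
// copy of the patch; the HIP policy mirrors it with HipKernelFixedAsync and
// hip_* execution policies):

template < size_t block_size >
struct nested_init_policy_sketch
{
  static constexpr size_t i_block_sz = 32;
  static constexpr size_t j_block_sz = block_size / i_block_sz;

  using EXEC_POL =
    RAJA::KernelPolicy<
      RAJA::statement::CudaKernelFixedAsync< i_block_sz * j_block_sz,
        RAJA::statement::Tile<1, RAJA::tile_fixed<j_block_sz>,
                                 RAJA::cuda_block_y_direct,
          RAJA::statement::Tile<0, RAJA::tile_fixed<i_block_sz>,
                                   RAJA::cuda_block_x_direct,
            RAJA::statement::For<2, RAJA::cuda_block_z_direct,      // k
              RAJA::statement::For<1, RAJA::cuda_thread_y_direct,   // j
                RAJA::statement::For<0, RAJA::cuda_thread_x_direct, // i
                  RAJA::statement::Lambda<0>
                >
              >
            >
          >
        >
      >
    >;
};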
gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + m_n_init = 100; setDefaultProblemSize(m_n_init * m_n_init * m_n_init); @@ -85,5 +89,11 @@ void NESTED_INIT::tearDown(VariantID vid) m_array = 0; } +bool NESTED_INIT::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/NESTED_INIT.hpp b/src/basic/NESTED_INIT.hpp index 508ba8030..880ce9549 100644 --- a/src/basic/NESTED_INIT.hpp +++ b/src/basic/NESTED_INIT.hpp @@ -59,7 +59,16 @@ class NESTED_INIT : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Index_type m_array_length; Real_ptr m_array; diff --git a/src/basic/PI_ATOMIC-Cuda.cpp b/src/basic/PI_ATOMIC-Cuda.cpp index 57522fed3..268543099 100644 --- a/src/basic/PI_ATOMIC-Cuda.cpp +++ b/src/basic/PI_ATOMIC-Cuda.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define PI_ATOMIC_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(pi, m_pi, 1); @@ -45,7 +39,9 @@ __global__ void pi_atomic(Real_ptr pi, } -void PI_ATOMIC::runCudaVariant(VariantID vid) + +template < size_t block_size > +void PI_ATOMIC::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -127,6 +123,15 @@ void PI_ATOMIC::runCudaVariant(VariantID vid) } } +void PI_ATOMIC::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n PI_ATOMIC : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/PI_ATOMIC-Hip.cpp b/src/basic/PI_ATOMIC-Hip.cpp index 0910a4198..c1f720900 100644 --- a/src/basic/PI_ATOMIC-Hip.cpp +++ b/src/basic/PI_ATOMIC-Hip.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define PI_ATOMIC_DATA_SETUP_HIP \ allocAndInitHipDeviceData(pi, m_pi, 1); @@ -45,7 +39,9 @@ __global__ void atomic_pi(Real_ptr pi, } -void PI_ATOMIC::runHipVariant(VariantID vid) + +template < size_t block_size > +void PI_ATOMIC::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -129,6 +125,15 @@ void PI_ATOMIC::runHipVariant(VariantID vid) } } +void PI_ATOMIC::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n PI_ATOMIC : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/PI_ATOMIC.cpp b/src/basic/PI_ATOMIC.cpp index 94e29c8ae..019664280 100644 --- a/src/basic/PI_ATOMIC.cpp +++ b/src/basic/PI_ATOMIC.cpp @@ -21,6 +21,10 
@@ namespace basic PI_ATOMIC::PI_ATOMIC(const RunParams& params) : KernelBase(rajaperf::Basic_PI_ATOMIC, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + setDefaultProblemSize(1000000); setDefaultReps(50); @@ -77,5 +81,11 @@ void PI_ATOMIC::tearDown(VariantID vid) deallocData(m_pi); } +bool PI_ATOMIC::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/PI_ATOMIC.hpp b/src/basic/PI_ATOMIC.hpp index 9c71d2d70..aedcd606c 100644 --- a/src/basic/PI_ATOMIC.hpp +++ b/src/basic/PI_ATOMIC.hpp @@ -55,7 +55,16 @@ class PI_ATOMIC : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Real_type m_dx; Real_ptr m_pi; Real_type m_pi_init; diff --git a/src/basic/PI_REDUCE-Cuda.cpp b/src/basic/PI_REDUCE-Cuda.cpp index 0c9d38c13..806690391 100644 --- a/src/basic/PI_REDUCE-Cuda.cpp +++ b/src/basic/PI_REDUCE-Cuda.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - __global__ void pi_reduce(Real_type dx, Real_ptr dpi, Real_type pi_init, Index_type iend) @@ -38,7 +32,7 @@ __global__ void pi_reduce(Real_type dx, ppi[ threadIdx.x ] = pi_init; for ( ; i < iend ; i += gridDim.x * blockDim.x ) { double x = (double(i) + 0.5) * dx; - ppi[ threadIdx.x ] += dx / (1.0 + x * x); + ppi[ threadIdx.x ] += dx / (1.0 + x * x); } __syncthreads(); @@ -57,11 +51,13 @@ __global__ void pi_reduce(Real_type dx, if ( threadIdx.x == 0 ) { *dpi += ppi[ 0 ]; } -#endif +#endif } -void PI_REDUCE::runCudaVariant(VariantID vid) + +template < size_t block_size > +void PI_REDUCE::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -81,8 +77,8 @@ void PI_REDUCE::runCudaVariant(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); pi_reduce<<>>( dx, - dpi, m_pi_init, + sizeof(Real_type)*block_size>>>( dx, + dpi, m_pi_init, iend ); cudaErrchk( cudaGetLastError() ); @@ -119,6 +115,15 @@ void PI_REDUCE::runCudaVariant(VariantID vid) } } +void PI_REDUCE::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n PI_REDUCE : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/PI_REDUCE-Hip.cpp b/src/basic/PI_REDUCE-Hip.cpp index 41a0a8ae9..6ece8f94e 100644 --- a/src/basic/PI_REDUCE-Hip.cpp +++ b/src/basic/PI_REDUCE-Hip.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - __global__ void pi_reduce(Real_type dx, Real_ptr dpi, Real_type pi_init, Index_type iend) @@ -38,7 +32,7 @@ __global__ void pi_reduce(Real_type dx, ppi[ threadIdx.x ] = pi_init; for ( ; 
i < iend ; i += gridDim.x * blockDim.x ) { double x = (double(i) + 0.5) * dx; - ppi[ threadIdx.x ] += dx / (1.0 + x * x); + ppi[ threadIdx.x ] += dx / (1.0 + x * x); } __syncthreads(); @@ -57,11 +51,13 @@ __global__ void pi_reduce(Real_type dx, if ( threadIdx.x == 0 ) i{ *dpi += ppi[ 0 ]; } -#endif +#endif } -void PI_REDUCE::runHipVariant(VariantID vid) + +template < size_t block_size > +void PI_REDUCE::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -80,7 +76,7 @@ void PI_REDUCE::runHipVariant(VariantID vid) initHipDeviceData(dpi, &m_pi_init, 1); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL( (pi_reduce), dim3(grid_size), dim3(block_size), + hipLaunchKernelGGL( (pi_reduce), dim3(grid_size), dim3(block_size), sizeof(Real_type)*block_size, 0, dx, dpi, m_pi_init, iend ); hipErrchk( hipGetLastError() ); @@ -118,6 +114,15 @@ void PI_REDUCE::runHipVariant(VariantID vid) } } +void PI_REDUCE::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n PI_REDUCE : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/PI_REDUCE.cpp b/src/basic/PI_REDUCE.cpp index b7032e61d..7e31227a7 100644 --- a/src/basic/PI_REDUCE.cpp +++ b/src/basic/PI_REDUCE.cpp @@ -21,6 +21,10 @@ namespace basic PI_REDUCE::PI_REDUCE(const RunParams& params) : KernelBase(rajaperf::Basic_PI_REDUCE, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + setDefaultProblemSize(1000000); setDefaultReps(50); @@ -75,5 +79,11 @@ void PI_REDUCE::tearDown(VariantID vid) (void) vid; } +bool PI_REDUCE::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/PI_REDUCE.hpp b/src/basic/PI_REDUCE.hpp index 59ea5321a..42c7a5dbe 100644 --- a/src/basic/PI_REDUCE.hpp +++ b/src/basic/PI_REDUCE.hpp @@ -57,7 +57,16 @@ class PI_REDUCE : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Real_type m_dx; Real_type m_pi; Real_type m_pi_init; diff --git a/src/basic/REDUCE3_INT-Cuda.cpp b/src/basic/REDUCE3_INT-Cuda.cpp index 50481f5b2..2d63c6480 100644 --- a/src/basic/REDUCE3_INT-Cuda.cpp +++ b/src/basic/REDUCE3_INT-Cuda.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define REDUCE3_INT_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(vec, m_vec, iend); @@ -82,7 +76,9 @@ __global__ void reduce3int(Int_ptr vec, } -void REDUCE3_INT::runCudaVariant(VariantID vid) + +template < size_t block_size > +void REDUCE3_INT::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -163,6 +159,15 @@ void 
REDUCE3_INT::runCudaVariant(VariantID vid) } } +void REDUCE3_INT::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n REDUCE3_INT : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE3_INT-Hip.cpp b/src/basic/REDUCE3_INT-Hip.cpp index ba13fa8af..36fa085ef 100644 --- a/src/basic/REDUCE3_INT-Hip.cpp +++ b/src/basic/REDUCE3_INT-Hip.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define REDUCE3_INT_DATA_SETUP_HIP \ allocAndInitHipDeviceData(vec, m_vec, iend); @@ -82,7 +76,9 @@ __global__ void reduce3int(Int_ptr vec, } -void REDUCE3_INT::runHipVariant(VariantID vid) + +template < size_t block_size > +void REDUCE3_INT::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -163,6 +159,15 @@ void REDUCE3_INT::runHipVariant(VariantID vid) } } +void REDUCE3_INT::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n REDUCE3_INT : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE3_INT.cpp b/src/basic/REDUCE3_INT.cpp index 821a4b7e3..53eb4670c 100644 --- a/src/basic/REDUCE3_INT.cpp +++ b/src/basic/REDUCE3_INT.cpp @@ -23,6 +23,10 @@ namespace basic REDUCE3_INT::REDUCE3_INT(const RunParams& params) : KernelBase(rajaperf::Basic_REDUCE3_INT, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + setDefaultProblemSize(1000000); //setDefaultReps(5000); // Set reps to low value until we resolve RAJA omp-target @@ -87,5 +91,11 @@ void REDUCE3_INT::tearDown(VariantID vid) deallocData(m_vec); } +bool REDUCE3_INT::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE3_INT.hpp b/src/basic/REDUCE3_INT.hpp index b3acc5004..c51ddebe0 100644 --- a/src/basic/REDUCE3_INT.hpp +++ b/src/basic/REDUCE3_INT.hpp @@ -71,7 +71,16 @@ class REDUCE3_INT : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Int_ptr m_vec; Int_type m_vsum; Int_type m_vsum_init; diff --git a/src/basic/TRAP_INT-Cuda.cpp b/src/basic/TRAP_INT-Cuda.cpp index 305104c4a..3cd5d467e 100644 --- a/src/basic/TRAP_INT-Cuda.cpp +++ b/src/basic/TRAP_INT-Cuda.cpp @@ -37,12 +37,6 @@ Real_type trap_int_func(Real_type x, } - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define TRAP_INT_DATA_SETUP_CUDA // nothing to do here... #define TRAP_INT_DATA_TEARDOWN_CUDA // nothing to do here... 
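// PI_REDUCE, REDUCE3_INT, and TRAP_INT all use the hand-written block
// reduction that the diffs above show in fragments: each thread accumulates a
// partial result in shared memory over a grid-stride loop, the block folds
// the shared array in halving strides, and thread 0 publishes the block's
// contribution. A sketch of that shape (the final atomicAdd assumes a device
// with double-precision atomics; the kernels in this patch guard the last
// step behind their own macros):

template < size_t block_size >
__launch_bounds__(block_size)
__global__ void pi_reduce_sketch(double dx, double* dpi, double pi_init, long iend)
{
  __shared__ double ppi[block_size];

  long i = blockIdx.x * block_size + threadIdx.x;
  ppi[threadIdx.x] = pi_init;
  for ( ; i < iend; i += gridDim.x * block_size) {
    double x = (double(i) + 0.5) * dx;
    ppi[threadIdx.x] += dx / (1.0 + x * x);
  }
  __syncthreads();

  // Tree reduction over shared memory: the active half shrinks each pass.
  for (unsigned int s = block_size / 2; s > 0; s >>= 1) {
    if (threadIdx.x < s) {
      ppi[threadIdx.x] += ppi[threadIdx.x + s];
    }
    __syncthreads();
  }

  if (threadIdx.x == 0) {
    atomicAdd(dpi, ppi[0]);
  }
}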
@@ -86,7 +80,9 @@ __global__ void trapint(Real_type x0, Real_type xp, } -void TRAP_INT::runCudaVariant(VariantID vid) + +template < size_t block_size > +void TRAP_INT::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -153,6 +149,15 @@ void TRAP_INT::runCudaVariant(VariantID vid) } } +void TRAP_INT::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n TRAP_INT : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/TRAP_INT-Hip.cpp b/src/basic/TRAP_INT-Hip.cpp index 40e6158bb..a50ec661f 100644 --- a/src/basic/TRAP_INT-Hip.cpp +++ b/src/basic/TRAP_INT-Hip.cpp @@ -37,12 +37,6 @@ Real_type trap_int_func(Real_type x, } - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define TRAP_INT_DATA_SETUP_HIP // nothing to do here... #define TRAP_INT_DATA_TEARDOWN_HIP // nothing to do here... @@ -86,7 +80,9 @@ __global__ void trapint(Real_type x0, Real_type xp, } -void TRAP_INT::runHipVariant(VariantID vid) + +template < size_t block_size > +void TRAP_INT::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -152,6 +148,15 @@ void TRAP_INT::runHipVariant(VariantID vid) } } +void TRAP_INT::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n TRAP_INT : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/TRAP_INT.cpp b/src/basic/TRAP_INT.cpp index e7483d9f4..5db4f8cd1 100644 --- a/src/basic/TRAP_INT.cpp +++ b/src/basic/TRAP_INT.cpp @@ -21,6 +21,10 @@ namespace basic TRAP_INT::TRAP_INT(const RunParams& params) : KernelBase(rajaperf::Basic_TRAP_INT, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + setDefaultProblemSize(1000000); setDefaultReps(50); @@ -84,5 +88,11 @@ void TRAP_INT::tearDown(VariantID vid) (void) vid; } +bool TRAP_INT::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/TRAP_INT.hpp b/src/basic/TRAP_INT.hpp index 171d72418..8bb8f5f7d 100644 --- a/src/basic/TRAP_INT.hpp +++ b/src/basic/TRAP_INT.hpp @@ -68,7 +68,16 @@ class TRAP_INT : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Real_type m_x0; Real_type m_xp; Real_type m_y; From 55ac25412f22fb69aeac568007cd357169604ae8 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 22 Oct 2021 12:24:17 -0700 Subject: [PATCH 143/392] Template basic gpu kernels on block_size --- src/basic/DAXPY-Cuda.cpp | 8 +++++--- src/basic/DAXPY-Hip.cpp | 8 +++++--- src/basic/IF_QUAD-Cuda.cpp | 8 +++++--- src/basic/IF_QUAD-Hip.cpp | 8 +++++--- src/basic/INIT3-Cuda.cpp | 8 +++++--- src/basic/INIT3-Hip.cpp | 8 +++++--- src/basic/INIT_VIEW1D-Cuda.cpp | 8 +++++--- src/basic/INIT_VIEW1D-Hip.cpp | 8 +++++--- src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp | 8 +++++--- src/basic/INIT_VIEW1D_OFFSET-Hip.cpp | 8 +++++--- src/basic/MULADDSUB-Cuda.cpp | 8 +++++--- src/basic/MULADDSUB-Hip.cpp | 8 +++++--- src/basic/NESTED_INIT-Cuda.cpp | 24 ++++++++++++++++-------- src/basic/NESTED_INIT-Hip.cpp | 22 ++++++++++++++-------- src/basic/PI_ATOMIC-Cuda.cpp | 8 +++++--- src/basic/PI_ATOMIC-Hip.cpp | 8 +++++--- src/basic/PI_REDUCE-Cuda.cpp | 10 ++++++---- src/basic/PI_REDUCE-Hip.cpp | 10 ++++++---- src/basic/REDUCE3_INT-Cuda.cpp | 14 ++++++++------ src/basic/REDUCE3_INT-Hip.cpp | 14 ++++++++------ src/basic/TRAP_INT-Cuda.cpp | 10 ++++++---- src/basic/TRAP_INT-Hip.cpp | 10 ++++++---- 22 files changed, 140 insertions(+), 86 deletions(-) diff --git a/src/basic/DAXPY-Cuda.cpp b/src/basic/DAXPY-Cuda.cpp index 26a28fb55..767381f89 100644 --- a/src/basic/DAXPY-Cuda.cpp +++ b/src/basic/DAXPY-Cuda.cpp @@ -30,11 +30,13 @@ namespace basic deallocCudaDeviceData(x); \ deallocCudaDeviceData(y); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void daxpy(Real_ptr y, Real_ptr x, Real_type a, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { DAXPY_BODY; } @@ -58,7 +60,7 @@ void DAXPY::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - daxpy<<>>( y, x, a, + daxpy<<>>( y, x, a, iend ); cudaErrchk( cudaGetLastError() ); @@ -75,7 +77,7 @@ void DAXPY::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - lambda_cuda_forall<<>>( + lambda_cuda_forall<<>>( ibegin, iend, [=] __device__ (Index_type i) { DAXPY_BODY; }); diff --git a/src/basic/DAXPY-Hip.cpp b/src/basic/DAXPY-Hip.cpp index 184820bb3..7300d2c3f 100644 --- a/src/basic/DAXPY-Hip.cpp +++ b/src/basic/DAXPY-Hip.cpp @@ 
-30,11 +30,13 @@ namespace basic deallocHipDeviceData(x); \ deallocHipDeviceData(y); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void daxpy(Real_ptr y, Real_ptr x, Real_type a, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { DAXPY_BODY; } @@ -59,7 +61,7 @@ void DAXPY::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((daxpy),dim3(grid_size), dim3(block_size), 0, 0, y, x, a, + hipLaunchKernelGGL((daxpy),dim3(grid_size), dim3(block_size), 0, 0, y, x, a, iend ); hipErrchk( hipGetLastError() ); @@ -80,7 +82,7 @@ void DAXPY::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL(lambda_hip_forall, + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, daxpy_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/basic/IF_QUAD-Cuda.cpp b/src/basic/IF_QUAD-Cuda.cpp index ee6a70cf6..42af59728 100644 --- a/src/basic/IF_QUAD-Cuda.cpp +++ b/src/basic/IF_QUAD-Cuda.cpp @@ -37,11 +37,13 @@ namespace basic deallocCudaDeviceData(x1); \ deallocCudaDeviceData(x2); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void ifquad(Real_ptr x1, Real_ptr x2, Real_ptr a, Real_ptr b, Real_ptr c, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { IF_QUAD_BODY; } @@ -66,7 +68,7 @@ void IF_QUAD::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - ifquad<<>>( x1, x2, a, b, c, iend ); + ifquad<<>>( x1, x2, a, b, c, iend ); cudaErrchk( cudaGetLastError() ); } @@ -82,7 +84,7 @@ void IF_QUAD::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - lambda_cuda_forall<<>>( + lambda_cuda_forall<<>>( ibegin, iend, [=] __device__ (Index_type i) { IF_QUAD_BODY; }); diff --git a/src/basic/IF_QUAD-Hip.cpp b/src/basic/IF_QUAD-Hip.cpp index c976fd1db..700fb1ab6 100644 --- a/src/basic/IF_QUAD-Hip.cpp +++ b/src/basic/IF_QUAD-Hip.cpp @@ -37,11 +37,13 @@ namespace basic deallocHipDeviceData(x1); \ deallocHipDeviceData(x2); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void ifquad(Real_ptr x1, Real_ptr x2, Real_ptr a, Real_ptr b, Real_ptr c, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { IF_QUAD_BODY; } @@ -66,7 +68,7 @@ void IF_QUAD::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((ifquad), dim3(grid_size), dim3(block_size), 0, 0, x1, x2, a, b, c, + hipLaunchKernelGGL((ifquad), dim3(grid_size), dim3(block_size), 0, 0, x1, x2, a, b, c, iend ); hipErrchk( hipGetLastError() ); @@ -87,7 +89,7 @@ void IF_QUAD::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL(lambda_hip_forall, + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, ifquad_lambda); hipErrchk( hipGetLastError() ); diff --git 
a/src/basic/INIT3-Cuda.cpp b/src/basic/INIT3-Cuda.cpp index b555b5387..dbe232007 100644 --- a/src/basic/INIT3-Cuda.cpp +++ b/src/basic/INIT3-Cuda.cpp @@ -38,11 +38,13 @@ namespace basic deallocCudaDeviceData(in1); \ deallocCudaDeviceData(in2); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void init3(Real_ptr out1, Real_ptr out2, Real_ptr out3, Real_ptr in1, Real_ptr in2, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { INIT3_BODY; } @@ -67,7 +69,7 @@ void INIT3::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - init3<<>>( out1, out2, out3, in1, in2, + init3<<>>( out1, out2, out3, in1, in2, iend ); cudaErrchk( cudaGetLastError() ); @@ -84,7 +86,7 @@ void INIT3::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - lambda_cuda_forall<<>>( + lambda_cuda_forall<<>>( ibegin, iend, [=] __device__ (Index_type i) { INIT3_BODY; }); diff --git a/src/basic/INIT3-Hip.cpp b/src/basic/INIT3-Hip.cpp index 6f1d07696..30f9cf844 100644 --- a/src/basic/INIT3-Hip.cpp +++ b/src/basic/INIT3-Hip.cpp @@ -38,11 +38,13 @@ namespace basic deallocHipDeviceData(in1); \ deallocHipDeviceData(in2); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void init3(Real_ptr out1, Real_ptr out2, Real_ptr out3, Real_ptr in1, Real_ptr in2, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { INIT3_BODY; } @@ -67,7 +69,7 @@ void INIT3::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((init3), dim3(grid_size), dim3(block_size), 0, 0, out1, out2, out3, in1, in2, + hipLaunchKernelGGL((init3), dim3(grid_size), dim3(block_size), 0, 0, out1, out2, out3, in1, in2, iend ); hipErrchk( hipGetLastError() ); @@ -88,7 +90,7 @@ void INIT3::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL(lambda_hip_forall, + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, init3_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/basic/INIT_VIEW1D-Cuda.cpp b/src/basic/INIT_VIEW1D-Cuda.cpp index 9d6fb30b2..665a312ff 100644 --- a/src/basic/INIT_VIEW1D-Cuda.cpp +++ b/src/basic/INIT_VIEW1D-Cuda.cpp @@ -28,11 +28,13 @@ namespace basic getCudaDeviceData(m_a, a, getActualProblemSize()); \ deallocCudaDeviceData(a); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void initview1d(Real_ptr a, Real_type v, const Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { INIT_VIEW1D_BODY; } @@ -57,7 +59,7 @@ void INIT_VIEW1D::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - initview1d<<>>( a, v, iend ); + initview1d<<>>( a, v, iend ); cudaErrchk( cudaGetLastError() ); } @@ -73,7 +75,7 @@ void INIT_VIEW1D::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - 
lambda_cuda_forall<<>>( + lambda_cuda_forall<<>>( ibegin, iend, [=] __device__ (Index_type i) { INIT_VIEW1D_BODY; }); diff --git a/src/basic/INIT_VIEW1D-Hip.cpp b/src/basic/INIT_VIEW1D-Hip.cpp index a497a82ed..fd9317670 100644 --- a/src/basic/INIT_VIEW1D-Hip.cpp +++ b/src/basic/INIT_VIEW1D-Hip.cpp @@ -28,11 +28,13 @@ namespace basic getHipDeviceData(m_a, a, iend); \ deallocHipDeviceData(a); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void initview1d(Real_ptr a, Real_type v, const Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { INIT_VIEW1D_BODY; } @@ -57,7 +59,7 @@ void INIT_VIEW1D::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((initview1d), dim3(grid_size), dim3(block_size), 0, 0, + hipLaunchKernelGGL((initview1d), dim3(grid_size), dim3(block_size), 0, 0, a, v, iend ); hipErrchk( hipGetLastError() ); @@ -78,7 +80,7 @@ void INIT_VIEW1D::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL(lambda_hip_forall, + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, initview1d_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp b/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp index a7cdc36b5..348c40bb4 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp @@ -28,12 +28,14 @@ namespace basic getCudaDeviceData(m_a, a, getActualProblemSize()); \ deallocCudaDeviceData(a); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void initview1d_offset(Real_ptr a, Real_type v, const Index_type ibegin, const Index_type iend) { - Index_type i = ibegin + blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = ibegin + blockIdx.x * block_size + threadIdx.x; if (i < iend) { INIT_VIEW1D_OFFSET_BODY; } @@ -58,7 +60,7 @@ void INIT_VIEW1D_OFFSET::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); - initview1d_offset<<>>( a, v, + initview1d_offset<<>>( a, v, ibegin, iend ); cudaErrchk( cudaGetLastError() ); @@ -76,7 +78,7 @@ void INIT_VIEW1D_OFFSET::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); - lambda_cuda_forall<<>>( + lambda_cuda_forall<<>>( ibegin, iend, [=] __device__ (Index_type i) { INIT_VIEW1D_OFFSET_BODY; }); diff --git a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp index ed58b8116..7276cca69 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp @@ -28,12 +28,14 @@ namespace basic getHipDeviceData(m_a, a, getActualProblemSize()); \ deallocHipDeviceData(a); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void initview1d_offset(Real_ptr a, Real_type v, const Index_type ibegin, const Index_type iend) { - Index_type i = ibegin + blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = ibegin + blockIdx.x * block_size + threadIdx.x; if (i < iend) { INIT_VIEW1D_OFFSET_BODY; } @@ -58,7 +60,7 @@ void INIT_VIEW1D_OFFSET::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = 
RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); - hipLaunchKernelGGL((initview1d_offset), dim3(grid_size), dim3(block_size), 0, 0, + hipLaunchKernelGGL((initview1d_offset), dim3(grid_size), dim3(block_size), 0, 0, a, v, ibegin, iend ); hipErrchk( hipGetLastError() ); @@ -79,7 +81,7 @@ void INIT_VIEW1D_OFFSET::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); - hipLaunchKernelGGL(lambda_hip_forall, + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, initview1d_offset_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/basic/MULADDSUB-Cuda.cpp b/src/basic/MULADDSUB-Cuda.cpp index dd041c87f..de912b6c0 100644 --- a/src/basic/MULADDSUB-Cuda.cpp +++ b/src/basic/MULADDSUB-Cuda.cpp @@ -38,11 +38,13 @@ namespace basic deallocCudaDeviceData(in1); \ deallocCudaDeviceData(in2); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void muladdsub(Real_ptr out1, Real_ptr out2, Real_ptr out3, Real_ptr in1, Real_ptr in2, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { MULADDSUB_BODY; } @@ -67,7 +69,7 @@ void MULADDSUB::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - muladdsub<<>>( out1, out2, out3, in1, in2, + muladdsub<<>>( out1, out2, out3, in1, in2, iend ); cudaErrchk( cudaGetLastError() ); @@ -84,7 +86,7 @@ void MULADDSUB::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - lambda_cuda_forall<<>>( + lambda_cuda_forall<<>>( ibegin, iend, [=] __device__ (Index_type i) { MULADDSUB_BODY; }); diff --git a/src/basic/MULADDSUB-Hip.cpp b/src/basic/MULADDSUB-Hip.cpp index 2409a50b6..d2fef2ab4 100644 --- a/src/basic/MULADDSUB-Hip.cpp +++ b/src/basic/MULADDSUB-Hip.cpp @@ -38,11 +38,13 @@ namespace basic deallocHipDeviceData(in1); \ deallocHipDeviceData(in2); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void muladdsub(Real_ptr out1, Real_ptr out2, Real_ptr out3, Real_ptr in1, Real_ptr in2, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { MULADDSUB_BODY; } @@ -67,7 +69,7 @@ void MULADDSUB::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((muladdsub), dim3(grid_size), dim3(block_size), 0, 0, + hipLaunchKernelGGL((muladdsub), dim3(grid_size), dim3(block_size), 0, 0, out1, out2, out3, in1, in2, iend ); hipErrchk( hipGetLastError() ); @@ -88,7 +90,7 @@ void MULADDSUB::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL(lambda_hip_forall, + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, muladdsub_lambda ); hipErrchk( hipGetLastError() ); diff --git a/src/basic/NESTED_INIT-Cuda.cpp b/src/basic/NESTED_INIT-Cuda.cpp index 66ce2353e..a6e263f23 100644 --- a/src/basic/NESTED_INIT-Cuda.cpp +++ b/src/basic/NESTED_INIT-Cuda.cpp @@ -28,8 +28,11 @@ namespace basic #define j_block_sz (block_size / i_block_sz) #define k_block_sz (1) +#define NESTED_INIT_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \ + i_block_sz, j_block_sz, k_block_sz + #define 
NESTED_INIT_THREADS_PER_BLOCK_CUDA \ - dim3 nthreads_per_block(i_block_sz, j_block_sz, k_block_sz); \ + dim3 nthreads_per_block(NESTED_INIT_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA); \ static_assert(i_block_sz*j_block_sz*k_block_sz == block_size, "Invalid block_size"); #define NESTED_INIT_NBLOCKS_CUDA \ @@ -45,11 +48,13 @@ namespace basic getCudaDeviceData(m_array, array, m_array_length); \ deallocCudaDeviceData(array); +template< size_t i_block_size, size_t j_block_size, size_t k_block_size > +__launch_bounds__(i_block_size*j_block_size*k_block_size) __global__ void nested_init(Real_ptr array, Index_type ni, Index_type nj, Index_type nk) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - Index_type j = blockIdx.y * blockDim.y + threadIdx.y; + Index_type i = blockIdx.x * i_block_size + threadIdx.x; + Index_type j = blockIdx.y * j_block_size + threadIdx.y; Index_type k = blockIdx.z; if ( i < ni && j < nj && k < nk ) { @@ -57,12 +62,13 @@ __global__ void nested_init(Real_ptr array, } } -template< typename Lambda > +template< size_t i_block_size, size_t j_block_size, size_t k_block_size, typename Lambda > +__launch_bounds__(i_block_size*j_block_size*k_block_size) __global__ void nested_init_lam(Index_type ni, Index_type nj, Index_type nk, Lambda body) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - Index_type j = blockIdx.y * blockDim.y + threadIdx.y; + Index_type i = blockIdx.x * i_block_size + threadIdx.x; + Index_type j = blockIdx.y * j_block_size + threadIdx.y; Index_type k = blockIdx.z; if ( i < ni && j < nj && k < nk ) { @@ -89,7 +95,8 @@ void NESTED_INIT::runCudaVariantImpl(VariantID vid) NESTED_INIT_THREADS_PER_BLOCK_CUDA; NESTED_INIT_NBLOCKS_CUDA; - nested_init<<>>(array, + nested_init + <<>>(array, ni, nj, nk); cudaErrchk( cudaGetLastError() ); @@ -108,7 +115,8 @@ void NESTED_INIT::runCudaVariantImpl(VariantID vid) NESTED_INIT_THREADS_PER_BLOCK_CUDA; NESTED_INIT_NBLOCKS_CUDA; - nested_init_lam<<>>(ni, nj, nk, + nested_init_lam + <<>>(ni, nj, nk, [=] __device__ (Index_type i, Index_type j, Index_type k) { NESTED_INIT_BODY; } diff --git a/src/basic/NESTED_INIT-Hip.cpp b/src/basic/NESTED_INIT-Hip.cpp index c5b27c52a..cb03cc79e 100644 --- a/src/basic/NESTED_INIT-Hip.cpp +++ b/src/basic/NESTED_INIT-Hip.cpp @@ -28,8 +28,11 @@ namespace basic #define j_block_sz (block_size / i_block_sz) #define k_block_sz (1) +#define NESTED_INIT_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \ + i_block_sz, j_block_sz, k_block_sz + #define NESTED_INIT_THREADS_PER_BLOCK_HIP \ - dim3 nthreads_per_block(i_block_sz, j_block_sz, k_block_sz); \ + dim3 nthreads_per_block(NESTED_INIT_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP); \ static_assert(i_block_sz*j_block_sz*k_block_sz == block_size, "Invalid block_size"); #define NESTED_INIT_NBLOCKS_HIP \ @@ -45,11 +48,13 @@ namespace basic getHipDeviceData(m_array, array, m_array_length); \ deallocHipDeviceData(array); +template< size_t i_block_size, size_t j_block_size, size_t k_block_size > + __launch_bounds__(i_block_size*j_block_size*k_block_size) __global__ void nested_init(Real_ptr array, Index_type ni, Index_type nj, Index_type nk) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - Index_type j = blockIdx.y * blockDim.y + threadIdx.y; + Index_type i = blockIdx.x * i_block_size + threadIdx.x; + Index_type j = blockIdx.y * j_block_size + threadIdx.y; Index_type k = blockIdx.z; if ( i < ni && j < nj && k < nk ) { @@ -57,12 +62,13 @@ __global__ void nested_init(Real_ptr array, } } -template +template< size_t i_block_size, size_t j_block_size, size_t 
k_block_size, typename Lambda > +__launch_bounds__(i_block_size*j_block_size*k_block_size) __global__ void nested_init_lam(Index_type ni, Index_type nj, Index_type nk, Lambda body) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - Index_type j = blockIdx.y * blockDim.y + threadIdx.y; + Index_type i = blockIdx.x * i_block_size + threadIdx.x; + Index_type j = blockIdx.y * j_block_size + threadIdx.y; Index_type k = blockIdx.z; if ( i < ni && j < nj && k < nk ) { @@ -89,7 +95,7 @@ void NESTED_INIT::runHipVariantImpl(VariantID vid) NESTED_INIT_THREADS_PER_BLOCK_HIP; NESTED_INIT_NBLOCKS_HIP; - hipLaunchKernelGGL((nested_init), + hipLaunchKernelGGL((nested_init), dim3(nblocks), dim3(nthreads_per_block), 0, 0, array, ni, nj, nk); hipErrchk( hipGetLastError() ); @@ -114,7 +120,7 @@ void NESTED_INIT::runHipVariantImpl(VariantID vid) NESTED_INIT_BODY; }; - hipLaunchKernelGGL((nested_init_lam< decltype(nested_init_lambda) >), + hipLaunchKernelGGL((nested_init_lam), dim3(nblocks), dim3(nthreads_per_block), 0, 0, ni, nj, nk, nested_init_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/basic/PI_ATOMIC-Cuda.cpp b/src/basic/PI_ATOMIC-Cuda.cpp index 268543099..c46aecec5 100644 --- a/src/basic/PI_ATOMIC-Cuda.cpp +++ b/src/basic/PI_ATOMIC-Cuda.cpp @@ -27,11 +27,13 @@ namespace basic #define PI_ATOMIC_DATA_TEARDOWN_CUDA \ deallocCudaDeviceData(pi); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void pi_atomic(Real_ptr pi, Real_type dx, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { double x = (double(i) + 0.5) * dx; RAJA::atomicAdd(pi, dx / (1.0 + x * x)); @@ -59,7 +61,7 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) initCudaDeviceData(pi, &m_pi_init, 1); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - pi_atomic<<>>( pi, dx, iend ); + pi_atomic<<>>( pi, dx, iend ); cudaErrchk( cudaGetLastError() ); getCudaDeviceData(m_pi, pi, 1); @@ -80,7 +82,7 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) initCudaDeviceData(pi, &m_pi_init, 1); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - lambda_cuda_forall<<>>( + lambda_cuda_forall<<>>( ibegin, iend, [=] __device__ (Index_type i) { double x = (double(i) + 0.5) * dx; RAJA::atomicAdd(pi, dx / (1.0 + x * x)); diff --git a/src/basic/PI_ATOMIC-Hip.cpp b/src/basic/PI_ATOMIC-Hip.cpp index c1f720900..ce6152646 100644 --- a/src/basic/PI_ATOMIC-Hip.cpp +++ b/src/basic/PI_ATOMIC-Hip.cpp @@ -27,11 +27,13 @@ namespace basic #define PI_ATOMIC_DATA_TEARDOWN_HIP \ deallocHipDeviceData(pi); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void atomic_pi(Real_ptr pi, Real_type dx, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { double x = (double(i) + 0.5) * dx; RAJA::atomicAdd(pi, dx / (1.0 + x * x)); @@ -59,7 +61,7 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) initHipDeviceData(pi, &m_pi_init, 1); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL(atomic_pi,grid_size, block_size, 0, 0, pi, dx, iend ); + hipLaunchKernelGGL((atomic_pi),grid_size, block_size, 0, 0, pi, dx, iend ); hipErrchk( hipGetLastError() ); getHipDeviceData(m_pi, pi, 1); @@ -85,7 +87,7 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL(lambda_hip_forall, + 
hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, atomic_pi_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/basic/PI_REDUCE-Cuda.cpp b/src/basic/PI_REDUCE-Cuda.cpp index 806690391..c03a1e3f9 100644 --- a/src/basic/PI_REDUCE-Cuda.cpp +++ b/src/basic/PI_REDUCE-Cuda.cpp @@ -21,22 +21,24 @@ namespace rajaperf namespace basic { +template < size_t block_size > +__launch_bounds__(block_size) __global__ void pi_reduce(Real_type dx, Real_ptr dpi, Real_type pi_init, Index_type iend) { extern __shared__ Real_type ppi[ ]; - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; ppi[ threadIdx.x ] = pi_init; - for ( ; i < iend ; i += gridDim.x * blockDim.x ) { + for ( ; i < iend ; i += gridDim.x * block_size ) { double x = (double(i) + 0.5) * dx; ppi[ threadIdx.x ] += dx / (1.0 + x * x); } __syncthreads(); - for ( i = blockDim.x / 2; i > 0; i /= 2 ) { + for ( i = block_size / 2; i > 0; i /= 2 ) { if ( threadIdx.x < i ) { ppi[ threadIdx.x ] += ppi[ threadIdx.x + i ]; } @@ -76,7 +78,7 @@ void PI_REDUCE::runCudaVariantImpl(VariantID vid) initCudaDeviceData(dpi, &m_pi_init, 1); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - pi_reduce<<<<>>( dx, dpi, m_pi_init, iend ); diff --git a/src/basic/PI_REDUCE-Hip.cpp b/src/basic/PI_REDUCE-Hip.cpp index 6ece8f94e..7cfc0b068 100644 --- a/src/basic/PI_REDUCE-Hip.cpp +++ b/src/basic/PI_REDUCE-Hip.cpp @@ -21,22 +21,24 @@ namespace rajaperf namespace basic { +template < size_t block_size > +__launch_bounds__(block_size) __global__ void pi_reduce(Real_type dx, Real_ptr dpi, Real_type pi_init, Index_type iend) { HIP_DYNAMIC_SHARED(Real_type, ppi); - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; ppi[ threadIdx.x ] = pi_init; - for ( ; i < iend ; i += gridDim.x * blockDim.x ) { + for ( ; i < iend ; i += gridDim.x * block_size ) { double x = (double(i) + 0.5) * dx; ppi[ threadIdx.x ] += dx / (1.0 + x * x); } __syncthreads(); - for ( i = blockDim.x / 2; i > 0; i /= 2 ) { + for ( i = block_size / 2; i > 0; i /= 2 ) { if ( threadIdx.x < i ) { ppi[ threadIdx.x ] += ppi[ threadIdx.x + i ]; } @@ -76,7 +78,7 @@ void PI_REDUCE::runHipVariantImpl(VariantID vid) initHipDeviceData(dpi, &m_pi_init, 1); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL( (pi_reduce), dim3(grid_size), dim3(block_size), + hipLaunchKernelGGL( (pi_reduce), dim3(grid_size), dim3(block_size), sizeof(Real_type)*block_size, 0, dx, dpi, m_pi_init, iend ); hipErrchk( hipGetLastError() ); diff --git a/src/basic/REDUCE3_INT-Cuda.cpp b/src/basic/REDUCE3_INT-Cuda.cpp index 2d63c6480..1acd73b63 100644 --- a/src/basic/REDUCE3_INT-Cuda.cpp +++ b/src/basic/REDUCE3_INT-Cuda.cpp @@ -28,6 +28,8 @@ namespace basic deallocCudaDeviceData(vec); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void reduce3int(Int_ptr vec, Int_ptr vsum, Int_type vsum_init, Int_ptr vmin, Int_type vmin_init, @@ -35,23 +37,23 @@ __global__ void reduce3int(Int_ptr vec, Index_type iend) { extern __shared__ Int_type psum[ ]; - Int_type* pmin = (Int_type*)&psum[ 1 * blockDim.x ]; - Int_type* pmax = (Int_type*)&psum[ 2 * blockDim.x ]; + Int_type* pmin = (Int_type*)&psum[ 1 * block_size ]; + Int_type* pmax = (Int_type*)&psum[ 2 * block_size ]; - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; psum[ threadIdx.x ] = vsum_init; pmin[ threadIdx.x ] = 
vmin_init; pmax[ threadIdx.x ] = vmax_init; - for ( ; i < iend ; i += gridDim.x * blockDim.x ) { + for ( ; i < iend ; i += gridDim.x * block_size ) { psum[ threadIdx.x ] += vec[ i ]; pmin[ threadIdx.x ] = RAJA_MIN( pmin[ threadIdx.x ], vec[ i ] ); pmax[ threadIdx.x ] = RAJA_MAX( pmax[ threadIdx.x ], vec[ i ] ); } __syncthreads(); - for ( i = blockDim.x / 2; i > 0; i /= 2 ) { + for ( i = block_size / 2; i > 0; i /= 2 ) { if ( threadIdx.x < i ) { psum[ threadIdx.x ] += psum[ threadIdx.x + i ]; pmin[ threadIdx.x ] = RAJA_MIN( pmin[ threadIdx.x ], pmin[ threadIdx.x + i ] ); @@ -106,7 +108,7 @@ void REDUCE3_INT::runCudaVariantImpl(VariantID vid) cudaMemcpyHostToDevice ) ); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - reduce3int<<<<>>(vec, vmem + 0, m_vsum_init, vmem + 1, m_vmin_init, diff --git a/src/basic/REDUCE3_INT-Hip.cpp b/src/basic/REDUCE3_INT-Hip.cpp index 36fa085ef..bd3a981b1 100644 --- a/src/basic/REDUCE3_INT-Hip.cpp +++ b/src/basic/REDUCE3_INT-Hip.cpp @@ -28,6 +28,8 @@ namespace basic deallocHipDeviceData(vec); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void reduce3int(Int_ptr vec, Int_ptr vsum, Int_type vsum_init, Int_ptr vmin, Int_type vmin_init, @@ -35,23 +37,23 @@ __global__ void reduce3int(Int_ptr vec, Index_type iend) { HIP_DYNAMIC_SHARED( Int_type, psum) - Int_type* pmin = (Int_type*)&psum[ 1 * blockDim.x ]; - Int_type* pmax = (Int_type*)&psum[ 2 * blockDim.x ]; + Int_type* pmin = (Int_type*)&psum[ 1 * block_size ]; + Int_type* pmax = (Int_type*)&psum[ 2 * block_size ]; - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; psum[ threadIdx.x ] = vsum_init; pmin[ threadIdx.x ] = vmin_init; pmax[ threadIdx.x ] = vmax_init; - for ( ; i < iend ; i += gridDim.x * blockDim.x ) { + for ( ; i < iend ; i += gridDim.x * block_size ) { psum[ threadIdx.x ] += vec[ i ]; pmin[ threadIdx.x ] = RAJA_MIN( pmin[ threadIdx.x ], vec[ i ] ); pmax[ threadIdx.x ] = RAJA_MAX( pmax[ threadIdx.x ], vec[ i ] ); } __syncthreads(); - for ( i = blockDim.x / 2; i > 0; i /= 2 ) { + for ( i = block_size / 2; i > 0; i /= 2 ) { if ( threadIdx.x < i ) { psum[ threadIdx.x ] += psum[ threadIdx.x + i ]; pmin[ threadIdx.x ] = RAJA_MIN( pmin[ threadIdx.x ], pmin[ threadIdx.x + i ] ); @@ -106,7 +108,7 @@ void REDUCE3_INT::runHipVariantImpl(VariantID vid) hipMemcpyHostToDevice ) ); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((reduce3int), dim3(grid_size), dim3(block_size), 3*sizeof(Int_type)*block_size, 0, + hipLaunchKernelGGL((reduce3int), dim3(grid_size), dim3(block_size), 3*sizeof(Int_type)*block_size, 0, vec, vmem + 0, m_vsum_init, vmem + 1, m_vmin_init, diff --git a/src/basic/TRAP_INT-Cuda.cpp b/src/basic/TRAP_INT-Cuda.cpp index 3cd5d467e..aef49f9d9 100644 --- a/src/basic/TRAP_INT-Cuda.cpp +++ b/src/basic/TRAP_INT-Cuda.cpp @@ -42,6 +42,8 @@ Real_type trap_int_func(Real_type x, #define TRAP_INT_DATA_TEARDOWN_CUDA // nothing to do here... 
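Each kernel in this patch follows the same recipe: add a size_t block_size template parameter, attach __launch_bounds__(block_size) so the compiler can optimize for the exact block size, replace blockDim.x with the compile-time constant in the index and stride arithmetic, and pass the template argument explicitly at the launch site (kernel<block_size><<<grid_size, block_size>>>). A small self-contained CUDA sketch of that recipe, including the grid-stride loop and shared-memory tree reduction used by the reduction kernels, is shown below; sum_kernel and its arguments are hypothetical and are not part of the patch.

    #include <cstdio>

    template < size_t block_size >
    __launch_bounds__(block_size)
    __global__ void sum_kernel(const double* x, double* partial, int n)
    {
      __shared__ double psum[block_size];  // size fixed at instantiation time

      // Grid-stride loop using the compile-time block size.
      int i = blockIdx.x * block_size + threadIdx.x;
      psum[threadIdx.x] = 0.0;
      for ( ; i < n; i += gridDim.x * block_size ) {
        psum[threadIdx.x] += x[i];
      }
      __syncthreads();

      // Tree reduction within the block.
      for ( int j = block_size / 2; j > 0; j /= 2 ) {
        if ( threadIdx.x < j ) {
          psum[threadIdx.x] += psum[threadIdx.x + j];
        }
        __syncthreads();
      }

      if ( threadIdx.x == 0 ) {
        partial[blockIdx.x] = psum[0];  // one partial sum per block
      }
    }

    int main()
    {
      const int n = 1 << 20;
      constexpr size_t block_size = 256;
      const size_t grid_size = (n + block_size - 1) / block_size;

      double *x, *partial;
      cudaMallocManaged(&x, n * sizeof(double));
      cudaMallocManaged(&partial, grid_size * sizeof(double));
      for (int i = 0; i < n; ++i) x[i] = 1.0;

      // Explicit template argument mirrors the launches in this patch.
      sum_kernel<block_size><<<grid_size, block_size>>>(x, partial, n);
      cudaDeviceSynchronize();

      double sum = 0.0;
      for (size_t b = 0; b < grid_size; ++b) sum += partial[b];
      printf("sum = %f\n", sum);  // expect 1048576.0

      cudaFree(x);
      cudaFree(partial);
      return 0;
    }

Because block_size is a compile-time constant, the shared-memory array size and the tree-reduction trip count are fixed when the kernel is instantiated, which is the main payoff of templating the kernels instead of reading blockDim.x at run time.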
+template < size_t block_size > +__launch_bounds__(block_size) __global__ void trapint(Real_type x0, Real_type xp, Real_type y, Real_type yp, Real_type h, @@ -50,17 +52,17 @@ __global__ void trapint(Real_type x0, Real_type xp, { extern __shared__ Real_type psumx[ ]; - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; psumx[ threadIdx.x ] = 0.0; - for ( ; i < iend ; i += gridDim.x * blockDim.x ) { + for ( ; i < iend ; i += gridDim.x * block_size ) { Real_type x = x0 + i*h; Real_type val = trap_int_func(x, y, xp, yp); psumx[ threadIdx.x ] += val; } __syncthreads(); - for ( i = blockDim.x / 2; i > 0; i /= 2 ) { + for ( i = block_size / 2; i > 0; i /= 2 ) { if ( threadIdx.x < i ) { psumx[ threadIdx.x ] += psumx[ threadIdx.x + i ]; } @@ -103,7 +105,7 @@ void TRAP_INT::runCudaVariantImpl(VariantID vid) initCudaDeviceData(sumx, &m_sumx_init, 1); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - trapint<<<<>>(x0, xp, y, yp, h, diff --git a/src/basic/TRAP_INT-Hip.cpp b/src/basic/TRAP_INT-Hip.cpp index a50ec661f..899769763 100644 --- a/src/basic/TRAP_INT-Hip.cpp +++ b/src/basic/TRAP_INT-Hip.cpp @@ -42,6 +42,8 @@ Real_type trap_int_func(Real_type x, #define TRAP_INT_DATA_TEARDOWN_HIP // nothing to do here... +template < size_t block_size > +__launch_bounds__(block_size) __global__ void trapint(Real_type x0, Real_type xp, Real_type y, Real_type yp, Real_type h, @@ -50,17 +52,17 @@ __global__ void trapint(Real_type x0, Real_type xp, { HIP_DYNAMIC_SHARED( Real_type, psumx) - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; psumx[ threadIdx.x ] = 0.0; - for ( ; i < iend ; i += gridDim.x * blockDim.x ) { + for ( ; i < iend ; i += gridDim.x * block_size ) { Real_type x = x0 + i*h; Real_type val = trap_int_func(x, y, xp, yp); psumx[ threadIdx.x ] += val; } __syncthreads(); - for ( i = blockDim.x / 2; i > 0; i /= 2 ) { + for ( i = block_size / 2; i > 0; i /= 2 ) { if ( threadIdx.x < i ) { psumx[ threadIdx.x ] += psumx[ threadIdx.x + i ]; } @@ -103,7 +105,7 @@ void TRAP_INT::runHipVariantImpl(VariantID vid) initHipDeviceData(sumx, &m_sumx_init, 1); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((trapint), dim3(grid_size), dim3(block_size), sizeof(Real_type)*block_size, 0, x0, xp, + hipLaunchKernelGGL((trapint), dim3(grid_size), dim3(block_size), sizeof(Real_type)*block_size, 0, x0, xp, y, yp, h, sumx, From 3b89907f2533858c8bd344d6b4dc9bec9032fecd Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 22 Oct 2021 12:47:03 -0700 Subject: [PATCH 144/392] Support block_size in stream kernels --- src/stream/ADD-Cuda.cpp | 18 +++++++++++------- src/stream/ADD-Hip.cpp | 18 +++++++++++------- src/stream/ADD.cpp | 12 +++++++++++- src/stream/ADD.hpp | 9 +++++++++ src/stream/COPY-Cuda.cpp | 17 ++++++++++------- src/stream/COPY-Hip.cpp | 17 ++++++++++------- src/stream/COPY.cpp | 12 +++++++++++- src/stream/COPY.hpp | 9 +++++++++ src/stream/MUL-Cuda.cpp | 19 ++++++++++++------- src/stream/MUL-Hip.cpp | 19 ++++++++++++------- src/stream/MUL.cpp | 12 +++++++++++- src/stream/MUL.hpp | 9 +++++++++ src/stream/TRIAD-Cuda.cpp | 17 ++++++++++------- src/stream/TRIAD-Hip.cpp | 17 ++++++++++------- src/stream/TRIAD.cpp | 12 +++++++++++- src/stream/TRIAD.hpp | 9 +++++++++ 16 files changed, 166 insertions(+), 60 deletions(-) diff --git a/src/stream/ADD-Cuda.cpp b/src/stream/ADD-Cuda.cpp index 4fc0a4f0e..dc4232bed 100644 --- a/src/stream/ADD-Cuda.cpp +++ 
b/src/stream/ADD-Cuda.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace stream { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define ADD_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(a, m_a, iend); \ allocAndInitCudaDeviceData(b, m_b, iend); \ @@ -48,7 +42,8 @@ __global__ void add(Real_ptr c, Real_ptr a, Real_ptr b, } -void ADD::runCudaVariant(VariantID vid) +template < size_t block_size > +void ADD::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -114,6 +109,15 @@ void ADD::runCudaVariant(VariantID vid) } } +void ADD::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n ADD : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/ADD-Hip.cpp b/src/stream/ADD-Hip.cpp index 68b671a63..b2b3747db 100644 --- a/src/stream/ADD-Hip.cpp +++ b/src/stream/ADD-Hip.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace stream { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define ADD_DATA_SETUP_HIP \ allocAndInitHipDeviceData(a, m_a, iend); \ allocAndInitHipDeviceData(b, m_b, iend); \ @@ -48,7 +42,8 @@ __global__ void add(Real_ptr c, Real_ptr a, Real_ptr b, } -void ADD::runHipVariant(VariantID vid) +template < size_t block_size > +void ADD::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -116,6 +111,15 @@ void ADD::runHipVariant(VariantID vid) } } +void ADD::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n ADD : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/ADD.cpp b/src/stream/ADD.cpp index 200172e60..5a58f976f 100644 --- a/src/stream/ADD.cpp +++ b/src/stream/ADD.cpp @@ -21,6 +21,10 @@ namespace stream ADD::ADD(const RunParams& params) : KernelBase(rajaperf::Stream_ADD, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + setDefaultProblemSize(1000000); setDefaultReps(1000); @@ -28,7 +32,7 @@ ADD::ADD(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 2*sizeof(Real_type)) * + setBytesPerRep( (1*sizeof(Real_type) + 2*sizeof(Real_type)) * getActualProblemSize() ); setFLOPsPerRep(1 * getActualProblemSize()); @@ -78,5 +82,11 @@ void ADD::tearDown(VariantID vid) deallocData(m_c); } +bool ADD::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/ADD.hpp b/src/stream/ADD.hpp index 0bf45b810..db621b309 100644 --- a/src/stream/ADD.hpp +++ b/src/stream/ADD.hpp @@ -53,7 +53,16 @@ class ADD : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Real_ptr m_a; Real_ptr m_b; Real_ptr m_c; diff --git a/src/stream/COPY-Cuda.cpp b/src/stream/COPY-Cuda.cpp index 62afb9ad8..ccc882a59 100644 --- a/src/stream/COPY-Cuda.cpp +++ b/src/stream/COPY-Cuda.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace stream { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define COPY_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(a, m_a, iend); \ allocAndInitCudaDeviceData(c, m_c, iend); @@ -46,7 +40,8 @@ __global__ void copy(Real_ptr c, Real_ptr a, } -void COPY::runCudaVariant(VariantID vid) +template < size_t block_size > +void COPY::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -110,7 +105,15 @@ void COPY::runCudaVariant(VariantID vid) } else { std::cout << "\n COPY : Unknown Cuda variant id = " << vid << std::endl; } +} +void COPY::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n COPY : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } } } // end namespace stream diff --git a/src/stream/COPY-Hip.cpp b/src/stream/COPY-Hip.cpp index 124f880fc..53a8c3769 100644 --- a/src/stream/COPY-Hip.cpp +++ b/src/stream/COPY-Hip.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace stream { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define COPY_DATA_SETUP_HIP \ allocAndInitHipDeviceData(a, m_a, iend); \ allocAndInitHipDeviceData(c, m_c, iend); @@ -46,7 +40,8 @@ __global__ void copy(Real_ptr c, Real_ptr a, } -void COPY::runHipVariant(VariantID vid) +template < size_t block_size > +void COPY::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -112,7 +107,15 @@ void COPY::runHipVariant(VariantID vid) } else { std::cout << "\n COPY : Unknown Hip variant id = " << vid << std::endl; } +} +void COPY::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n COPY : Unsupported Hip block_size " << getActualGPUBlockSize() + 
<<" for variant id = " << vid << std::endl; + } } } // end namespace stream diff --git a/src/stream/COPY.cpp b/src/stream/COPY.cpp index d8c7ec1d6..6e05b68db 100644 --- a/src/stream/COPY.cpp +++ b/src/stream/COPY.cpp @@ -21,6 +21,10 @@ namespace stream COPY::COPY(const RunParams& params) : KernelBase(rajaperf::Stream_COPY, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + setDefaultProblemSize(1000000); setDefaultReps(1800); @@ -28,7 +32,7 @@ COPY::COPY(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) * + setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() ); setFLOPsPerRep(0); @@ -76,5 +80,11 @@ void COPY::tearDown(VariantID vid) deallocData(m_c); } +bool COPY::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/COPY.hpp b/src/stream/COPY.hpp index 010a391c8..1ae3f228c 100644 --- a/src/stream/COPY.hpp +++ b/src/stream/COPY.hpp @@ -52,7 +52,16 @@ class COPY : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Real_ptr m_a; Real_ptr m_c; }; diff --git a/src/stream/MUL-Cuda.cpp b/src/stream/MUL-Cuda.cpp index 01ccf4956..caaf7e1a0 100644 --- a/src/stream/MUL-Cuda.cpp +++ b/src/stream/MUL-Cuda.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace stream { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define MUL_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(b, m_b, iend); \ allocAndInitCudaDeviceData(c, m_c, iend); @@ -45,7 +39,9 @@ __global__ void mul(Real_ptr b, Real_ptr c, Real_type alpha, } } -void MUL::runCudaVariant(VariantID vid) + +template < size_t block_size > +void MUL::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -111,6 +107,15 @@ void MUL::runCudaVariant(VariantID vid) } } +void MUL::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n MUL : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/MUL-Hip.cpp b/src/stream/MUL-Hip.cpp index bdb5ca0eb..9b813a502 100644 --- a/src/stream/MUL-Hip.cpp +++ b/src/stream/MUL-Hip.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace stream { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define MUL_DATA_SETUP_HIP \ allocAndInitHipDeviceData(b, m_b, iend); \ allocAndInitHipDeviceData(c, m_c, iend); @@ -45,7 +39,9 @@ __global__ void mul(Real_ptr b, Real_ptr c, Real_type alpha, } } -void MUL::runHipVariant(VariantID vid) + +template < size_t block_size > +void MUL::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); 
const Index_type ibegin = 0; @@ -113,6 +109,15 @@ void MUL::runHipVariant(VariantID vid) } } +void MUL::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n MUL : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/MUL.cpp b/src/stream/MUL.cpp index 6b167de04..d6e4b5404 100644 --- a/src/stream/MUL.cpp +++ b/src/stream/MUL.cpp @@ -21,6 +21,10 @@ namespace stream MUL::MUL(const RunParams& params) : KernelBase(rajaperf::Stream_MUL, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + setDefaultProblemSize(1000000); setDefaultReps(1800); @@ -28,7 +32,7 @@ MUL::MUL(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) * + setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() ); setFLOPsPerRep(1 * getActualProblemSize()); @@ -77,5 +81,11 @@ void MUL::tearDown(VariantID vid) deallocData(m_c); } +bool MUL::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/MUL.hpp b/src/stream/MUL.hpp index f8fcefbcb..f97adc2ff 100644 --- a/src/stream/MUL.hpp +++ b/src/stream/MUL.hpp @@ -53,7 +53,16 @@ class MUL : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Real_ptr m_b; Real_ptr m_c; Real_type m_alpha; diff --git a/src/stream/TRIAD-Cuda.cpp b/src/stream/TRIAD-Cuda.cpp index 19175d80d..2e07eb5eb 100644 --- a/src/stream/TRIAD-Cuda.cpp +++ b/src/stream/TRIAD-Cuda.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace stream { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define TRIAD_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(a, m_a, iend); \ allocAndInitCudaDeviceData(b, m_b, iend); \ @@ -48,7 +42,8 @@ __global__ void triad(Real_ptr a, Real_ptr b, Real_ptr c, Real_type alpha, } -void TRIAD::runCudaVariant(VariantID vid) +template < size_t block_size > +void TRIAD::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -112,7 +107,15 @@ void TRIAD::runCudaVariant(VariantID vid) } else { std::cout << "\n TRIAD : Unknown Cuda variant id = " << vid << std::endl; } +} +void TRIAD::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n TRIAD : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } } } // end namespace stream diff --git a/src/stream/TRIAD-Hip.cpp b/src/stream/TRIAD-Hip.cpp index deaf20d0f..2931e5b2d 100644 --- a/src/stream/TRIAD-Hip.cpp +++ b/src/stream/TRIAD-Hip.cpp @@ -21,12 +21,6 @@ namespace 
rajaperf namespace stream { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define TRIAD_DATA_SETUP_HIP \ allocAndInitHipDeviceData(a, m_a, iend); \ allocAndInitHipDeviceData(b, m_b, iend); \ @@ -48,7 +42,8 @@ __global__ void triad(Real_ptr a, Real_ptr b, Real_ptr c, Real_type alpha, } -void TRIAD::runHipVariant(VariantID vid) +template < size_t block_size > +void TRIAD::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -114,7 +109,15 @@ void TRIAD::runHipVariant(VariantID vid) } else { std::cout << "\n TRIAD : Unknown Hip variant id = " << vid << std::endl; } +} +void TRIAD::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n TRIAD : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } } } // end namespace stream diff --git a/src/stream/TRIAD.cpp b/src/stream/TRIAD.cpp index dfa04eda0..4d9a4aece 100644 --- a/src/stream/TRIAD.cpp +++ b/src/stream/TRIAD.cpp @@ -21,6 +21,10 @@ namespace stream TRIAD::TRIAD(const RunParams& params) : KernelBase(rajaperf::Stream_TRIAD, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + setDefaultProblemSize(1000000); setDefaultReps(1000); @@ -28,7 +32,7 @@ TRIAD::TRIAD(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 2*sizeof(Real_type)) * + setBytesPerRep( (1*sizeof(Real_type) + 2*sizeof(Real_type)) * getActualProblemSize() ); setFLOPsPerRep(2 * getActualProblemSize()); @@ -83,5 +87,11 @@ void TRIAD::tearDown(VariantID vid) deallocData(m_c); } +bool TRIAD::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/TRIAD.hpp b/src/stream/TRIAD.hpp index 8d2f01236..8c37b969c 100644 --- a/src/stream/TRIAD.hpp +++ b/src/stream/TRIAD.hpp @@ -54,7 +54,16 @@ class TRIAD : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Real_ptr m_a; Real_ptr m_b; Real_ptr m_c; From ffd6a842ae2f83aee4ed167843f10cad8feab6d7 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 22 Oct 2021 12:56:53 -0700 Subject: [PATCH 145/392] Template stream gpu kernels on block_size --- src/stream/ADD-Cuda.cpp | 8 +++++--- src/stream/ADD-Hip.cpp | 8 +++++--- src/stream/COPY-Cuda.cpp | 8 +++++--- src/stream/COPY-Hip.cpp | 8 +++++--- src/stream/MUL-Cuda.cpp | 8 +++++--- src/stream/MUL-Hip.cpp | 8 +++++--- src/stream/TRIAD-Cuda.cpp | 8 +++++--- src/stream/TRIAD-Hip.cpp | 8 +++++--- 8 files changed, 40 insertions(+), 24 deletions(-) diff --git a/src/stream/ADD-Cuda.cpp b/src/stream/ADD-Cuda.cpp index dc4232bed..4569469d0 100644 --- a/src/stream/ADD-Cuda.cpp +++ b/src/stream/ADD-Cuda.cpp @@ -32,10 +32,12 @@ namespace stream deallocCudaDeviceData(b); \ deallocCudaDeviceData(c); +template 
< size_t block_size > +__launch_bounds__(block_size) __global__ void add(Real_ptr c, Real_ptr a, Real_ptr b, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { ADD_BODY; } @@ -59,7 +61,7 @@ void ADD::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - add<<>>( c, a, b, + add<<>>( c, a, b, iend ); cudaErrchk( cudaGetLastError() ); @@ -76,7 +78,7 @@ void ADD::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - lambda_cuda_forall<<>>( + lambda_cuda_forall<<>>( ibegin, iend, [=] __device__ (Index_type i) { ADD_BODY; }); diff --git a/src/stream/ADD-Hip.cpp b/src/stream/ADD-Hip.cpp index b2b3747db..ad3acc5ab 100644 --- a/src/stream/ADD-Hip.cpp +++ b/src/stream/ADD-Hip.cpp @@ -32,10 +32,12 @@ namespace stream deallocHipDeviceData(b); \ deallocHipDeviceData(c); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void add(Real_ptr c, Real_ptr a, Real_ptr b, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { ADD_BODY; } @@ -59,7 +61,7 @@ void ADD::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((add), dim3(grid_size), dim3(block_size), 0, 0, c, a, b, + hipLaunchKernelGGL((add), dim3(grid_size), dim3(block_size), 0, 0, c, a, b, iend ); hipErrchk( hipGetLastError() ); @@ -80,7 +82,7 @@ void ADD::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL(lambda_hip_forall, + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, add_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/stream/COPY-Cuda.cpp b/src/stream/COPY-Cuda.cpp index ccc882a59..a71b69b05 100644 --- a/src/stream/COPY-Cuda.cpp +++ b/src/stream/COPY-Cuda.cpp @@ -30,10 +30,12 @@ namespace stream deallocCudaDeviceData(a); \ deallocCudaDeviceData(c); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void copy(Real_ptr c, Real_ptr a, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { COPY_BODY; } @@ -57,7 +59,7 @@ void COPY::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - copy<<>>( c, a, + copy<<>>( c, a, iend ); cudaErrchk( cudaGetLastError() ); @@ -74,7 +76,7 @@ void COPY::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - lambda_cuda_forall<<>>( + lambda_cuda_forall<<>>( ibegin, iend, [=] __device__ (Index_type i) { COPY_BODY; }); diff --git a/src/stream/COPY-Hip.cpp b/src/stream/COPY-Hip.cpp index 53a8c3769..23268f94f 100644 --- a/src/stream/COPY-Hip.cpp +++ b/src/stream/COPY-Hip.cpp @@ -30,10 +30,12 @@ namespace stream deallocHipDeviceData(a); \ deallocHipDeviceData(c); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void copy(Real_ptr c, Real_ptr a, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + 
Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { COPY_BODY; } @@ -57,7 +59,7 @@ void COPY::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((copy), dim3(grid_size), dim3(block_size), 0, 0, + hipLaunchKernelGGL((copy), dim3(grid_size), dim3(block_size), 0, 0, c, a, iend ); hipErrchk( hipGetLastError() ); @@ -78,7 +80,7 @@ void COPY::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL(lambda_hip_forall, + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, copy_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/stream/MUL-Cuda.cpp b/src/stream/MUL-Cuda.cpp index caaf7e1a0..561c0307f 100644 --- a/src/stream/MUL-Cuda.cpp +++ b/src/stream/MUL-Cuda.cpp @@ -30,10 +30,12 @@ namespace stream deallocCudaDeviceData(b); \ deallocCudaDeviceData(c) +template < size_t block_size > +__launch_bounds__(block_size) __global__ void mul(Real_ptr b, Real_ptr c, Real_type alpha, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { MUL_BODY; } @@ -57,7 +59,7 @@ void MUL::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - mul<<>>( b, c, alpha, + mul<<>>( b, c, alpha, iend ); cudaErrchk( cudaGetLastError() ); @@ -74,7 +76,7 @@ void MUL::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - lambda_cuda_forall<<>>( + lambda_cuda_forall<<>>( ibegin, iend, [=] __device__ (Index_type i) { MUL_BODY; }); diff --git a/src/stream/MUL-Hip.cpp b/src/stream/MUL-Hip.cpp index 9b813a502..e90f6eea4 100644 --- a/src/stream/MUL-Hip.cpp +++ b/src/stream/MUL-Hip.cpp @@ -30,10 +30,12 @@ namespace stream deallocHipDeviceData(b); \ deallocHipDeviceData(c) +template < size_t block_size > +__launch_bounds__(block_size) __global__ void mul(Real_ptr b, Real_ptr c, Real_type alpha, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { MUL_BODY; } @@ -57,7 +59,7 @@ void MUL::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((mul), dim3(grid_size), dim3(block_size), 0, 0, b, c, alpha, + hipLaunchKernelGGL((mul), dim3(grid_size), dim3(block_size), 0, 0, b, c, alpha, iend ); hipErrchk( hipGetLastError() ); @@ -78,7 +80,7 @@ void MUL::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL(lambda_hip_forall, + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, mul_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/stream/TRIAD-Cuda.cpp b/src/stream/TRIAD-Cuda.cpp index 2e07eb5eb..86519a243 100644 --- a/src/stream/TRIAD-Cuda.cpp +++ b/src/stream/TRIAD-Cuda.cpp @@ -32,10 +32,12 @@ namespace stream deallocCudaDeviceData(b); \ deallocCudaDeviceData(c); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void triad(Real_ptr a, Real_ptr b, Real_ptr c, Real_type alpha, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + 
Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { TRIAD_BODY; } @@ -59,7 +61,7 @@ void TRIAD::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - triad<<>>( a, b, c, alpha, + triad<<>>( a, b, c, alpha, iend ); cudaErrchk( cudaGetLastError() ); @@ -76,7 +78,7 @@ void TRIAD::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - lambda_cuda_forall<<>>( + lambda_cuda_forall<<>>( ibegin, iend, [=] __device__ (Index_type i) { TRIAD_BODY; }); diff --git a/src/stream/TRIAD-Hip.cpp b/src/stream/TRIAD-Hip.cpp index 2931e5b2d..dd3ce5f37 100644 --- a/src/stream/TRIAD-Hip.cpp +++ b/src/stream/TRIAD-Hip.cpp @@ -32,10 +32,12 @@ namespace stream deallocHipDeviceData(b); \ deallocHipDeviceData(c); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void triad(Real_ptr a, Real_ptr b, Real_ptr c, Real_type alpha, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { TRIAD_BODY; } @@ -59,7 +61,7 @@ void TRIAD::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((triad), dim3(grid_size), dim3(block_size), 0, 0, a, b, c, alpha, + hipLaunchKernelGGL((triad), dim3(grid_size), dim3(block_size), 0, 0, a, b, c, alpha, iend ); hipErrchk( hipGetLastError() ); @@ -80,7 +82,7 @@ void TRIAD::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL(lambda_hip_forall, + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, triad_lambda); hipErrchk( hipGetLastError() ); From 51ce890a6c860de38dcc1f3f363bc88696f7b6db Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sat, 23 Oct 2021 14:18:42 -0700 Subject: [PATCH 146/392] Support block size in lcals kernels --- src/lcals/DIFF_PREDICT-Cuda.cpp | 18 +++++++++++------- src/lcals/DIFF_PREDICT-Hip.cpp | 18 +++++++++++------- src/lcals/DIFF_PREDICT.cpp | 10 ++++++++++ src/lcals/DIFF_PREDICT.hpp | 9 +++++++++ src/lcals/EOS-Cuda.cpp | 18 +++++++++++------- src/lcals/EOS-Hip.cpp | 18 +++++++++++------- src/lcals/EOS.cpp | 10 ++++++++++ src/lcals/EOS.hpp | 9 +++++++++ src/lcals/FIRST_DIFF-Cuda.cpp | 18 +++++++++++------- src/lcals/FIRST_DIFF-Hip.cpp | 18 +++++++++++------- src/lcals/FIRST_DIFF.cpp | 10 ++++++++++ src/lcals/FIRST_DIFF.hpp | 9 +++++++++ src/lcals/FIRST_MIN-Cuda.cpp | 17 ++++++++++------- src/lcals/FIRST_MIN-Hip.cpp | 17 ++++++++++------- src/lcals/FIRST_MIN.cpp | 10 ++++++++++ src/lcals/FIRST_MIN.hpp | 9 +++++++++ src/lcals/FIRST_SUM-Cuda.cpp | 18 +++++++++++------- src/lcals/FIRST_SUM-Hip.cpp | 18 +++++++++++------- src/lcals/FIRST_SUM.cpp | 10 ++++++++++ src/lcals/FIRST_SUM.hpp | 9 +++++++++ src/lcals/GEN_LIN_RECUR-Cuda.cpp | 18 +++++++++++------- src/lcals/GEN_LIN_RECUR-Hip.cpp | 18 +++++++++++------- src/lcals/GEN_LIN_RECUR.cpp | 10 ++++++++++ src/lcals/GEN_LIN_RECUR.hpp | 9 +++++++++ src/lcals/HYDRO_1D-Cuda.cpp | 18 +++++++++++------- src/lcals/HYDRO_1D-Hip.cpp | 18 +++++++++++------- src/lcals/HYDRO_1D.cpp | 10 ++++++++++ src/lcals/HYDRO_1D.hpp | 9 +++++++++ src/lcals/HYDRO_2D-Cuda.cpp | 18 ++++++++++++++---- src/lcals/HYDRO_2D-Hip.cpp | 18 ++++++++++++++---- src/lcals/HYDRO_2D.cpp | 10 ++++++++++ 
src/lcals/HYDRO_2D.hpp | 9 +++++++++ src/lcals/INT_PREDICT-Cuda.cpp | 18 +++++++++++------- src/lcals/INT_PREDICT-Hip.cpp | 18 +++++++++++------- src/lcals/INT_PREDICT.cpp | 10 ++++++++++ src/lcals/INT_PREDICT.hpp | 9 +++++++++ src/lcals/PLANCKIAN-Cuda.cpp | 18 +++++++++++------- src/lcals/PLANCKIAN-Hip.cpp | 18 +++++++++++------- src/lcals/PLANCKIAN.cpp | 10 ++++++++++ src/lcals/PLANCKIAN.hpp | 9 +++++++++ src/lcals/TRIDIAG_ELIM-Cuda.cpp | 18 +++++++++++------- src/lcals/TRIDIAG_ELIM-Hip.cpp | 18 +++++++++++------- src/lcals/TRIDIAG_ELIM.cpp | 10 ++++++++++ src/lcals/TRIDIAG_ELIM.hpp | 9 +++++++++ 44 files changed, 455 insertions(+), 148 deletions(-) diff --git a/src/lcals/DIFF_PREDICT-Cuda.cpp b/src/lcals/DIFF_PREDICT-Cuda.cpp index 7be0908b4..3d0c2751a 100644 --- a/src/lcals/DIFF_PREDICT-Cuda.cpp +++ b/src/lcals/DIFF_PREDICT-Cuda.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define DIFF_PREDICT_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(px, m_px, m_array_length); \ allocAndInitCudaDeviceData(cx, m_cx, m_array_length); @@ -47,7 +41,8 @@ __global__ void diff_predict(Real_ptr px, Real_ptr cx, } -void DIFF_PREDICT::runCudaVariant(VariantID vid) +template < size_t block_size > +void DIFF_PREDICT::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -95,6 +90,15 @@ void DIFF_PREDICT::runCudaVariant(VariantID vid) } } +void DIFF_PREDICT::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n DIFF_PREDICT : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/DIFF_PREDICT-Hip.cpp b/src/lcals/DIFF_PREDICT-Hip.cpp index 8bc38e983..71f9057c4 100644 --- a/src/lcals/DIFF_PREDICT-Hip.cpp +++ b/src/lcals/DIFF_PREDICT-Hip.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define DIFF_PREDICT_DATA_SETUP_HIP \ allocAndInitHipDeviceData(px, m_px, m_array_length); \ allocAndInitHipDeviceData(cx, m_cx, m_array_length); @@ -47,7 +41,8 @@ __global__ void diff_predict(Real_ptr px, Real_ptr cx, } -void DIFF_PREDICT::runHipVariant(VariantID vid) +template < size_t block_size > +void DIFF_PREDICT::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -95,6 +90,15 @@ void DIFF_PREDICT::runHipVariant(VariantID vid) } } +void DIFF_PREDICT::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n DIFF_PREDICT : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/DIFF_PREDICT.cpp b/src/lcals/DIFF_PREDICT.cpp index d1a96a101..42313cda0 100644 --- a/src/lcals/DIFF_PREDICT.cpp +++ b/src/lcals/DIFF_PREDICT.cpp @@ -20,6 +20,10 @@ namespace lcals DIFF_PREDICT::DIFF_PREDICT(const RunParams& params) : KernelBase(rajaperf::Lcals_DIFF_PREDICT, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
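The new runCudaVariant / runHipVariant wrappers turn the runtime-selected block size into a call to the matching runCudaVariantImpl<block_size> instantiation by walking a compile-time list of supported sizes. The suite does this with its gpu_block_size helpers; the sketch below shows the same technique with stand-in names, so the helper, list, and member names are assumptions rather than the suite's API:

#include <cstddef>

template < std::size_t... sizes >
struct BlockSizeList {};

// Base case: no size in the list matched the runtime value.
template < typename Kernel >
bool dispatch_block_size(Kernel&, std::size_t, BlockSizeList<>)
{
  return false;
}

// Recursive case: compare against the first size, otherwise keep walking the list.
template < typename Kernel, std::size_t first, std::size_t... rest >
bool dispatch_block_size(Kernel& kernel, std::size_t actual, BlockSizeList<first, rest...>)
{
  if (actual == first) {
    kernel.template runVariantImpl<first>();  // instantiate the matching template
    return true;
  }
  return dispatch_block_size(kernel, actual, BlockSizeList<rest...>{});
}

// Usage, mirroring the wrappers above (hypothetical size list):
//   if ( !dispatch_block_size(*this, getActualGPUBlockSize(),
//                             BlockSizeList<128, 256, 512>{}) ) {
//     std::cout << "Unsupported block size\n";
//   }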
params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + setDefaultProblemSize(1000000); setDefaultReps(200); @@ -76,5 +80,11 @@ void DIFF_PREDICT::tearDown(VariantID vid) deallocData(m_cx); } +bool DIFF_PREDICT::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/DIFF_PREDICT.hpp b/src/lcals/DIFF_PREDICT.hpp index 504dd8bd7..6e5c0203c 100644 --- a/src/lcals/DIFF_PREDICT.hpp +++ b/src/lcals/DIFF_PREDICT.hpp @@ -94,7 +94,16 @@ class DIFF_PREDICT : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Real_ptr m_px; Real_ptr m_cx; diff --git a/src/lcals/EOS-Cuda.cpp b/src/lcals/EOS-Cuda.cpp index c66a99545..3977a9ce6 100644 --- a/src/lcals/EOS-Cuda.cpp +++ b/src/lcals/EOS-Cuda.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define EOS_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(x, m_x, m_array_length); \ allocAndInitCudaDeviceData(y, m_y, m_array_length); \ @@ -51,7 +45,8 @@ __global__ void eos(Real_ptr x, Real_ptr y, Real_ptr z, Real_ptr u, } -void EOS::runCudaVariant(VariantID vid) +template < size_t block_size > +void EOS::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -99,6 +94,15 @@ void EOS::runCudaVariant(VariantID vid) } } +void EOS::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n EOS : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/EOS-Hip.cpp b/src/lcals/EOS-Hip.cpp index 53f952a25..9f10966a6 100644 --- a/src/lcals/EOS-Hip.cpp +++ b/src/lcals/EOS-Hip.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define EOS_DATA_SETUP_HIP \ allocAndInitHipDeviceData(x, m_x, m_array_length); \ allocAndInitHipDeviceData(y, m_y, m_array_length); \ @@ -51,7 +45,8 @@ __global__ void eos(Real_ptr x, Real_ptr y, Real_ptr z, Real_ptr u, } -void EOS::runHipVariant(VariantID vid) +template < size_t block_size > +void EOS::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -99,6 +94,15 @@ void EOS::runHipVariant(VariantID vid) } } +void EOS::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n EOS : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/EOS.cpp b/src/lcals/EOS.cpp index 4a8671172..abd555e89 100644 --- a/src/lcals/EOS.cpp +++ b/src/lcals/EOS.cpp @@ -21,6 +21,10 @@ namespace lcals EOS::EOS(const RunParams& params) : KernelBase(rajaperf::Lcals_EOS, 
params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + setDefaultProblemSize(1000000); setDefaultReps(500); @@ -89,5 +93,11 @@ void EOS::tearDown(VariantID vid) deallocData(m_u); } +bool EOS::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/EOS.hpp b/src/lcals/EOS.hpp index 82a779ac2..5c4ac5ff1 100644 --- a/src/lcals/EOS.hpp +++ b/src/lcals/EOS.hpp @@ -63,7 +63,16 @@ class EOS : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Real_ptr m_x; Real_ptr m_y; Real_ptr m_z; diff --git a/src/lcals/FIRST_DIFF-Cuda.cpp b/src/lcals/FIRST_DIFF-Cuda.cpp index f8330fdfc..65bb99ecf 100644 --- a/src/lcals/FIRST_DIFF-Cuda.cpp +++ b/src/lcals/FIRST_DIFF-Cuda.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define FIRST_DIFF_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(x, m_x, m_N); \ allocAndInitCudaDeviceData(y, m_y, m_N); @@ -46,7 +40,8 @@ __global__ void first_diff(Real_ptr x, Real_ptr y, } -void FIRST_DIFF::runCudaVariant(VariantID vid) +template < size_t block_size > +void FIRST_DIFF::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -93,6 +88,15 @@ void FIRST_DIFF::runCudaVariant(VariantID vid) } } +void FIRST_DIFF::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n FIRST_DIFF : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_DIFF-Hip.cpp b/src/lcals/FIRST_DIFF-Hip.cpp index 4ac557fec..c581a6f9a 100644 --- a/src/lcals/FIRST_DIFF-Hip.cpp +++ b/src/lcals/FIRST_DIFF-Hip.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define FIRST_DIFF_DATA_SETUP_HIP \ allocAndInitHipDeviceData(x, m_x, m_N); \ allocAndInitHipDeviceData(y, m_y, m_N); @@ -46,7 +40,8 @@ __global__ void first_diff(Real_ptr x, Real_ptr y, } -void FIRST_DIFF::runHipVariant(VariantID vid) +template < size_t block_size > +void FIRST_DIFF::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -93,6 +88,15 @@ void FIRST_DIFF::runHipVariant(VariantID vid) } } +void FIRST_DIFF::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n FIRST_DIFF : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_DIFF.cpp b/src/lcals/FIRST_DIFF.cpp index 
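isGPUBlockSizeSupported() reports whether the chosen block size is one of the sizes the kernel templates were compiled for. A self-contained sketch of such a membership test over a compile-time size list, using a C++17 fold expression; the function name and example sizes are illustrative:

#include <cstddef>

template < std::size_t... supported_sizes >
bool is_supported_block_size(std::size_t actual)
{
  bool match = false;
  // Fold over the pack: true if any compiled-in size equals the runtime value.
  ((match = match || (actual == supported_sizes)), ...);
  return match;
}

// e.g. is_supported_block_size<256>(block_size) when only 256 was compiled in.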
c37c41aac..aad34c42b 100644 --- a/src/lcals/FIRST_DIFF.cpp +++ b/src/lcals/FIRST_DIFF.cpp @@ -21,6 +21,10 @@ namespace lcals FIRST_DIFF::FIRST_DIFF(const RunParams& params) : KernelBase(rajaperf::Lcals_FIRST_DIFF, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + setDefaultProblemSize(1000000); setDefaultReps(2000); @@ -77,5 +81,11 @@ void FIRST_DIFF::tearDown(VariantID vid) deallocData(m_y); } +bool FIRST_DIFF::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_DIFF.hpp b/src/lcals/FIRST_DIFF.hpp index 21c279b89..5f18a2037 100644 --- a/src/lcals/FIRST_DIFF.hpp +++ b/src/lcals/FIRST_DIFF.hpp @@ -53,7 +53,16 @@ class FIRST_DIFF : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Real_ptr m_x; Real_ptr m_y; diff --git a/src/lcals/FIRST_MIN-Cuda.cpp b/src/lcals/FIRST_MIN-Cuda.cpp index ef87159f2..4ce151f51 100644 --- a/src/lcals/FIRST_MIN-Cuda.cpp +++ b/src/lcals/FIRST_MIN-Cuda.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define FIRST_MIN_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(x, m_x, m_N); @@ -66,7 +60,8 @@ __global__ void first_min(Real_ptr x, } -void FIRST_MIN::runCudaVariant(VariantID vid) +template < size_t block_size > +void FIRST_MIN::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -131,7 +126,15 @@ void FIRST_MIN::runCudaVariant(VariantID vid) } else { std::cout << "\n FIRST_MIN : Unknown Cuda variant id = " << vid << std::endl; } +} +void FIRST_MIN::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n FIRST_MIN : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } } } // end namespace lcals diff --git a/src/lcals/FIRST_MIN-Hip.cpp b/src/lcals/FIRST_MIN-Hip.cpp index 9880927e6..4d961ef9d 100644 --- a/src/lcals/FIRST_MIN-Hip.cpp +++ b/src/lcals/FIRST_MIN-Hip.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define FIRST_MIN_DATA_SETUP_HIP \ allocAndInitHipDeviceData(x, m_x, m_N); @@ -66,7 +60,8 @@ __global__ void first_min(Real_ptr x, } -void FIRST_MIN::runHipVariant(VariantID vid) +template < size_t block_size > +void FIRST_MIN::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -131,7 +126,15 @@ void FIRST_MIN::runHipVariant(VariantID vid) } else { std::cout << "\n FIRST_MIN : Unknown Hip variant id = " << vid << std::endl; } +} +void FIRST_MIN::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << 
"\n FIRST_MIN : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } } } // end namespace lcals diff --git a/src/lcals/FIRST_MIN.cpp b/src/lcals/FIRST_MIN.cpp index c6138e46a..3546853a7 100644 --- a/src/lcals/FIRST_MIN.cpp +++ b/src/lcals/FIRST_MIN.cpp @@ -21,6 +21,10 @@ namespace lcals FIRST_MIN::FIRST_MIN(const RunParams& params) : KernelBase(rajaperf::Lcals_FIRST_MIN, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + setDefaultProblemSize(1000000); //setDefaultReps(1000); // Set reps to low value until we resolve RAJA omp-target @@ -83,5 +87,11 @@ void FIRST_MIN::tearDown(VariantID vid) deallocData(m_x); } +bool FIRST_MIN::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_MIN.hpp b/src/lcals/FIRST_MIN.hpp index a9b48c1b3..ea111c0a8 100644 --- a/src/lcals/FIRST_MIN.hpp +++ b/src/lcals/FIRST_MIN.hpp @@ -82,7 +82,16 @@ class FIRST_MIN : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Real_ptr m_x; Real_type m_xmin_init; Index_type m_initloc; diff --git a/src/lcals/FIRST_SUM-Cuda.cpp b/src/lcals/FIRST_SUM-Cuda.cpp index 55d0e2214..730b93bbe 100644 --- a/src/lcals/FIRST_SUM-Cuda.cpp +++ b/src/lcals/FIRST_SUM-Cuda.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define FIRST_SUM_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(x, m_x, m_N); \ allocAndInitCudaDeviceData(y, m_y, m_N); @@ -46,7 +40,8 @@ __global__ void first_sum(Real_ptr x, Real_ptr y, } -void FIRST_SUM::runCudaVariant(VariantID vid) +template < size_t block_size > +void FIRST_SUM::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; @@ -93,6 +88,15 @@ void FIRST_SUM::runCudaVariant(VariantID vid) } } +void FIRST_SUM::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n FIRST_SUM : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_SUM-Hip.cpp b/src/lcals/FIRST_SUM-Hip.cpp index 0f2cb2ede..fc409acb2 100644 --- a/src/lcals/FIRST_SUM-Hip.cpp +++ b/src/lcals/FIRST_SUM-Hip.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define FIRST_SUM_DATA_SETUP_HIP \ allocAndInitHipDeviceData(x, m_x, m_N); \ allocAndInitHipDeviceData(y, m_y, m_N); @@ -46,7 +40,8 @@ __global__ void first_sum(Real_ptr x, Real_ptr y, } -void FIRST_SUM::runHipVariant(VariantID vid) +template < size_t block_size > +void FIRST_SUM::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const 
Index_type ibegin = 1; @@ -93,6 +88,15 @@ void FIRST_SUM::runHipVariant(VariantID vid) } } +void FIRST_SUM::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n FIRST_SUM : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_SUM.cpp b/src/lcals/FIRST_SUM.cpp index ceaa9bc8b..38eff5416 100644 --- a/src/lcals/FIRST_SUM.cpp +++ b/src/lcals/FIRST_SUM.cpp @@ -21,6 +21,10 @@ namespace lcals FIRST_SUM::FIRST_SUM(const RunParams& params) : KernelBase(rajaperf::Lcals_FIRST_SUM, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + setDefaultProblemSize(1000000); setDefaultReps(2000); @@ -76,5 +80,11 @@ void FIRST_SUM::tearDown(VariantID vid) deallocData(m_y); } +bool FIRST_SUM::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_SUM.hpp b/src/lcals/FIRST_SUM.hpp index d828ac896..da618e10d 100644 --- a/src/lcals/FIRST_SUM.hpp +++ b/src/lcals/FIRST_SUM.hpp @@ -56,7 +56,16 @@ class FIRST_SUM : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Real_ptr m_x; Real_ptr m_y; diff --git a/src/lcals/GEN_LIN_RECUR-Cuda.cpp b/src/lcals/GEN_LIN_RECUR-Cuda.cpp index 3a8daa793..cd30ce76a 100644 --- a/src/lcals/GEN_LIN_RECUR-Cuda.cpp +++ b/src/lcals/GEN_LIN_RECUR-Cuda.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define GEN_LIN_RECUR_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(b5, m_b5, m_N); \ allocAndInitCudaDeviceData(stb5, m_stb5, m_N); \ @@ -63,7 +57,8 @@ __global__ void genlinrecur2(Real_ptr b5, Real_ptr stb5, } -void GEN_LIN_RECUR::runCudaVariant(VariantID vid) +template < size_t block_size > +void GEN_LIN_RECUR::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -120,6 +115,15 @@ void GEN_LIN_RECUR::runCudaVariant(VariantID vid) } } +void GEN_LIN_RECUR::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n GEN_LIN_RECUR : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/GEN_LIN_RECUR-Hip.cpp b/src/lcals/GEN_LIN_RECUR-Hip.cpp index c11326881..c6752c21b 100644 --- a/src/lcals/GEN_LIN_RECUR-Hip.cpp +++ b/src/lcals/GEN_LIN_RECUR-Hip.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define GEN_LIN_RECUR_DATA_SETUP_HIP \ allocAndInitHipDeviceData(b5, m_b5, m_N); \ allocAndInitHipDeviceData(stb5, 
m_stb5, m_N); \ @@ -63,7 +57,8 @@ __global__ void genlinrecur2(Real_ptr b5, Real_ptr stb5, } -void GEN_LIN_RECUR::runHipVariant(VariantID vid) +template < size_t block_size > +void GEN_LIN_RECUR::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -122,6 +117,15 @@ void GEN_LIN_RECUR::runHipVariant(VariantID vid) } } +void GEN_LIN_RECUR::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n GEN_LIN_RECUR : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/GEN_LIN_RECUR.cpp b/src/lcals/GEN_LIN_RECUR.cpp index 6534633da..215bd3389 100644 --- a/src/lcals/GEN_LIN_RECUR.cpp +++ b/src/lcals/GEN_LIN_RECUR.cpp @@ -21,6 +21,10 @@ namespace lcals GEN_LIN_RECUR::GEN_LIN_RECUR(const RunParams& params) : KernelBase(rajaperf::Lcals_GEN_LIN_RECUR, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + setDefaultProblemSize(1000000); setDefaultReps(500); @@ -87,5 +91,11 @@ void GEN_LIN_RECUR::tearDown(VariantID vid) deallocData(m_sb); } +bool GEN_LIN_RECUR::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/GEN_LIN_RECUR.hpp b/src/lcals/GEN_LIN_RECUR.hpp index 3fa49e69f..6cf271159 100644 --- a/src/lcals/GEN_LIN_RECUR.hpp +++ b/src/lcals/GEN_LIN_RECUR.hpp @@ -77,7 +77,16 @@ class GEN_LIN_RECUR : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Real_ptr m_b5; Real_ptr m_sa; Real_ptr m_sb; diff --git a/src/lcals/HYDRO_1D-Cuda.cpp b/src/lcals/HYDRO_1D-Cuda.cpp index 74f102f5f..daffdd3b6 100644 --- a/src/lcals/HYDRO_1D-Cuda.cpp +++ b/src/lcals/HYDRO_1D-Cuda.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define HYDRO_1D_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(x, m_x, m_array_length); \ allocAndInitCudaDeviceData(y, m_y, m_array_length); \ @@ -49,7 +43,8 @@ __global__ void hydro_1d(Real_ptr x, Real_ptr y, Real_ptr z, } -void HYDRO_1D::runCudaVariant(VariantID vid) +template < size_t block_size > +void HYDRO_1D::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -97,6 +92,15 @@ void HYDRO_1D::runCudaVariant(VariantID vid) } } +void HYDRO_1D::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n HYDRO_1D : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/HYDRO_1D-Hip.cpp b/src/lcals/HYDRO_1D-Hip.cpp index 6c06b2de0..7ea63c102 100644 --- 
a/src/lcals/HYDRO_1D-Hip.cpp +++ b/src/lcals/HYDRO_1D-Hip.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define HYDRO_1D_DATA_SETUP_HIP \ allocAndInitHipDeviceData(x, m_x, m_array_length); \ allocAndInitHipDeviceData(y, m_y, m_array_length); \ @@ -49,7 +43,8 @@ __global__ void hydro_1d(Real_ptr x, Real_ptr y, Real_ptr z, } -void HYDRO_1D::runHipVariant(VariantID vid) +template < size_t block_size > +void HYDRO_1D::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -97,6 +92,15 @@ void HYDRO_1D::runHipVariant(VariantID vid) } } +void HYDRO_1D::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n HYDRO_1D : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/HYDRO_1D.cpp b/src/lcals/HYDRO_1D.cpp index 08198ca0f..faf27fc7e 100644 --- a/src/lcals/HYDRO_1D.cpp +++ b/src/lcals/HYDRO_1D.cpp @@ -21,6 +21,10 @@ namespace lcals HYDRO_1D::HYDRO_1D(const RunParams& params) : KernelBase(rajaperf::Lcals_HYDRO_1D, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + setDefaultProblemSize(1000000); setDefaultReps(1000); @@ -86,5 +90,11 @@ void HYDRO_1D::tearDown(VariantID vid) deallocData(m_z); } +bool HYDRO_1D::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/HYDRO_1D.hpp b/src/lcals/HYDRO_1D.hpp index 029065be8..27df3ecbe 100644 --- a/src/lcals/HYDRO_1D.hpp +++ b/src/lcals/HYDRO_1D.hpp @@ -58,7 +58,16 @@ class HYDRO_1D : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Real_ptr m_x; Real_ptr m_y; Real_ptr m_z; diff --git a/src/lcals/HYDRO_2D-Cuda.cpp b/src/lcals/HYDRO_2D-Cuda.cpp index 2c0087358..de954ae3e 100644 --- a/src/lcals/HYDRO_2D-Cuda.cpp +++ b/src/lcals/HYDRO_2D-Cuda.cpp @@ -22,10 +22,10 @@ namespace lcals { // - // Define thread block size for CUDA execution + // Define thread block shape for CUDA execution // - constexpr size_t j_block_sz = 32; - constexpr size_t k_block_sz = 8; +#define j_block_sz (32) +#define k_block_sz (block_size / j_block_sz) #define HYDRO_2D_THREADS_PER_BLOCK_CUDA \ dim3 nthreads_per_block(j_block_sz, k_block_sz, 1); @@ -108,7 +108,8 @@ __global__ void hydro_2d3(Real_ptr zroutdat, Real_ptr zzoutdat, } -void HYDRO_2D::runCudaVariant(VariantID vid) +template < size_t block_size > +void HYDRO_2D::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type kbeg = 1; @@ -207,6 +208,15 @@ void HYDRO_2D::runCudaVariant(VariantID vid) } } +void HYDRO_2D::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, 
vid), gpu_block_sizes_type()) ) { + std::cout << "\n HYDRO_2D : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/HYDRO_2D-Hip.cpp b/src/lcals/HYDRO_2D-Hip.cpp index a492999ec..764be9e05 100644 --- a/src/lcals/HYDRO_2D-Hip.cpp +++ b/src/lcals/HYDRO_2D-Hip.cpp @@ -22,10 +22,10 @@ namespace lcals { // - // Define thread block size for Hip execution + // Define thread block shape for Hip execution // - constexpr size_t j_block_sz = 32; - constexpr size_t k_block_sz = 8; +#define j_block_sz (32) +#define k_block_sz (block_size / j_block_sz) #define HYDRO_2D_THREADS_PER_BLOCK_HIP \ dim3 nthreads_per_block(j_block_sz, k_block_sz, 1); @@ -107,7 +107,8 @@ __global__ void hydro_2d3(Real_ptr zroutdat, Real_ptr zzoutdat, } -void HYDRO_2D::runHipVariant(VariantID vid) +template < size_t block_size > +void HYDRO_2D::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type kbeg = 1; @@ -212,6 +213,15 @@ void HYDRO_2D::runHipVariant(VariantID vid) } } +void HYDRO_2D::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n HYDRO_2D : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/HYDRO_2D.cpp b/src/lcals/HYDRO_2D.cpp index e51237f82..f1b2347f8 100644 --- a/src/lcals/HYDRO_2D.cpp +++ b/src/lcals/HYDRO_2D.cpp @@ -24,6 +24,10 @@ namespace lcals HYDRO_2D::HYDRO_2D(const RunParams& params) : KernelBase(rajaperf::Lcals_HYDRO_2D, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + m_jn = 1000; m_kn = 1000; @@ -114,5 +118,11 @@ void HYDRO_2D::tearDown(VariantID vid) deallocData(m_zz); } +bool HYDRO_2D::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/HYDRO_2D.hpp b/src/lcals/HYDRO_2D.hpp index 2525c8c89..cd62b66f9 100644 --- a/src/lcals/HYDRO_2D.hpp +++ b/src/lcals/HYDRO_2D.hpp @@ -154,7 +154,16 @@ class HYDRO_2D : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Real_ptr m_za; Real_ptr m_zb; Real_ptr m_zm; diff --git a/src/lcals/INT_PREDICT-Cuda.cpp b/src/lcals/INT_PREDICT-Cuda.cpp index 651f5f862..a3f18e722 100644 --- a/src/lcals/INT_PREDICT-Cuda.cpp +++ b/src/lcals/INT_PREDICT-Cuda.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define INT_PREDICT_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(px, m_px, m_array_length); @@ -48,7 +42,8 @@ __global__ void int_predict(Real_ptr px, } -void INT_PREDICT::runCudaVariant(VariantID vid) +template < size_t block_size > +void INT_PREDICT::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -98,6 +93,15 @@ void INT_PREDICT::runCudaVariant(VariantID vid) } } +void INT_PREDICT::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n INT_PREDICT : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/INT_PREDICT-Hip.cpp b/src/lcals/INT_PREDICT-Hip.cpp index 8d758f70d..0775ce672 100644 --- a/src/lcals/INT_PREDICT-Hip.cpp +++ b/src/lcals/INT_PREDICT-Hip.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define INT_PREDICT_DATA_SETUP_HIP \ allocAndInitHipDeviceData(px, m_px, m_array_length); @@ -48,7 +42,8 @@ __global__ void int_predict(Real_ptr px, } -void INT_PREDICT::runHipVariant(VariantID vid) +template < size_t block_size > +void INT_PREDICT::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -98,6 +93,15 @@ void INT_PREDICT::runHipVariant(VariantID vid) } } +void INT_PREDICT::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n INT_PREDICT : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/INT_PREDICT.cpp b/src/lcals/INT_PREDICT.cpp index 096a074ac..898866e90 100644 --- a/src/lcals/INT_PREDICT.cpp +++ b/src/lcals/INT_PREDICT.cpp @@ -21,6 +21,10 @@ namespace lcals INT_PREDICT::INT_PREDICT(const RunParams& params) : 
KernelBase(rajaperf::Lcals_INT_PREDICT, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + setDefaultProblemSize(1000000); setDefaultReps(400); @@ -88,5 +92,11 @@ void INT_PREDICT::tearDown(VariantID vid) deallocData(m_px); } +bool INT_PREDICT::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/INT_PREDICT.hpp b/src/lcals/INT_PREDICT.hpp index 1253e1a6e..eab03a090 100644 --- a/src/lcals/INT_PREDICT.hpp +++ b/src/lcals/INT_PREDICT.hpp @@ -73,7 +73,16 @@ class INT_PREDICT : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Index_type m_array_length; Index_type m_offset; diff --git a/src/lcals/PLANCKIAN-Cuda.cpp b/src/lcals/PLANCKIAN-Cuda.cpp index fd46a4fdf..d9817220a 100644 --- a/src/lcals/PLANCKIAN-Cuda.cpp +++ b/src/lcals/PLANCKIAN-Cuda.cpp @@ -22,12 +22,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define PLANCKIAN_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(x, m_x, iend); \ allocAndInitCudaDeviceData(y, m_y, iend); \ @@ -54,7 +48,8 @@ __global__ void planckian(Real_ptr x, Real_ptr y, } -void PLANCKIAN::runCudaVariant(VariantID vid) +template < size_t block_size > +void PLANCKIAN::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -102,6 +97,15 @@ void PLANCKIAN::runCudaVariant(VariantID vid) } } +void PLANCKIAN::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n PLANCKIAN : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/PLANCKIAN-Hip.cpp b/src/lcals/PLANCKIAN-Hip.cpp index f47d04ce9..dc5898bd9 100644 --- a/src/lcals/PLANCKIAN-Hip.cpp +++ b/src/lcals/PLANCKIAN-Hip.cpp @@ -22,12 +22,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define PLANCKIAN_DATA_SETUP_HIP \ allocAndInitHipDeviceData(x, m_x, iend); \ allocAndInitHipDeviceData(y, m_y, iend); \ @@ -54,7 +48,8 @@ __global__ void planckian(Real_ptr x, Real_ptr y, } -void PLANCKIAN::runHipVariant(VariantID vid) +template < size_t block_size > +void PLANCKIAN::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -102,6 +97,15 @@ void PLANCKIAN::runHipVariant(VariantID vid) } } +void PLANCKIAN::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n PLANCKIAN : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace lcals } // end namespace 
rajaperf diff --git a/src/lcals/PLANCKIAN.cpp b/src/lcals/PLANCKIAN.cpp index 564a71a7e..683eec841 100644 --- a/src/lcals/PLANCKIAN.cpp +++ b/src/lcals/PLANCKIAN.cpp @@ -21,6 +21,10 @@ namespace lcals PLANCKIAN::PLANCKIAN(const RunParams& params) : KernelBase(rajaperf::Lcals_PLANCKIAN, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + setDefaultProblemSize(1000000); setDefaultReps(50); @@ -79,5 +83,11 @@ void PLANCKIAN::tearDown(VariantID vid) deallocData(m_w); } +bool PLANCKIAN::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/PLANCKIAN.hpp b/src/lcals/PLANCKIAN.hpp index 1e5b744db..3a7493a06 100644 --- a/src/lcals/PLANCKIAN.hpp +++ b/src/lcals/PLANCKIAN.hpp @@ -58,7 +58,16 @@ class PLANCKIAN : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Real_ptr m_x; Real_ptr m_y; Real_ptr m_u; diff --git a/src/lcals/TRIDIAG_ELIM-Cuda.cpp b/src/lcals/TRIDIAG_ELIM-Cuda.cpp index b06884f0e..7760b489b 100644 --- a/src/lcals/TRIDIAG_ELIM-Cuda.cpp +++ b/src/lcals/TRIDIAG_ELIM-Cuda.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define TRIDIAG_ELIM_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(xout, m_xout, m_N); \ allocAndInitCudaDeviceData(xin, m_xin, m_N); \ @@ -50,7 +44,8 @@ __global__ void eos(Real_ptr xout, Real_ptr xin, Real_ptr y, Real_ptr z, } -void TRIDIAG_ELIM::runCudaVariant(VariantID vid) +template < size_t block_size > +void TRIDIAG_ELIM::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; @@ -97,6 +92,15 @@ void TRIDIAG_ELIM::runCudaVariant(VariantID vid) } } +void TRIDIAG_ELIM::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n TRIDIAG_ELIM : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/TRIDIAG_ELIM-Hip.cpp b/src/lcals/TRIDIAG_ELIM-Hip.cpp index 66ae4bad5..b21797516 100644 --- a/src/lcals/TRIDIAG_ELIM-Hip.cpp +++ b/src/lcals/TRIDIAG_ELIM-Hip.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define TRIDIAG_ELIM_DATA_SETUP_HIP \ allocAndInitHipDeviceData(xout, m_xout, m_N); \ allocAndInitHipDeviceData(xin, m_xin, m_N); \ @@ -50,7 +44,8 @@ __global__ void eos(Real_ptr xout, Real_ptr xin, Real_ptr y, Real_ptr z, } -void TRIDIAG_ELIM::runHipVariant(VariantID vid) +template < size_t block_size > +void TRIDIAG_ELIM::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; @@ -97,6 +92,15 @@ void TRIDIAG_ELIM::runHipVariant(VariantID vid) } } +void 
TRIDIAG_ELIM::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n TRIDIAG_ELIM : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/TRIDIAG_ELIM.cpp b/src/lcals/TRIDIAG_ELIM.cpp index d35c08a51..872e15703 100644 --- a/src/lcals/TRIDIAG_ELIM.cpp +++ b/src/lcals/TRIDIAG_ELIM.cpp @@ -21,6 +21,10 @@ namespace lcals TRIDIAG_ELIM::TRIDIAG_ELIM(const RunParams& params) : KernelBase(rajaperf::Lcals_TRIDIAG_ELIM, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + setDefaultProblemSize(1000000); setDefaultReps(1000); @@ -79,5 +83,11 @@ void TRIDIAG_ELIM::tearDown(VariantID vid) deallocData(m_z); } +bool TRIDIAG_ELIM::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/TRIDIAG_ELIM.hpp b/src/lcals/TRIDIAG_ELIM.hpp index 73ffeb341..24a6524f7 100644 --- a/src/lcals/TRIDIAG_ELIM.hpp +++ b/src/lcals/TRIDIAG_ELIM.hpp @@ -58,7 +58,16 @@ class TRIDIAG_ELIM : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Real_ptr m_xout; Real_ptr m_xin; Real_ptr m_y; From d336510b9ef892bb647d43d489a407333d69ec6e Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sat, 23 Oct 2021 14:20:20 -0700 Subject: [PATCH 147/392] Template gpu kernels in lcals --- src/lcals/DIFF_PREDICT-Cuda.cpp | 6 +++-- src/lcals/DIFF_PREDICT-Hip.cpp | 6 +++-- src/lcals/EOS-Cuda.cpp | 6 +++-- src/lcals/EOS-Hip.cpp | 6 +++-- src/lcals/FIRST_DIFF-Cuda.cpp | 6 +++-- src/lcals/FIRST_DIFF-Hip.cpp | 6 +++-- src/lcals/FIRST_MIN-Cuda.cpp | 10 ++++---- src/lcals/FIRST_MIN-Hip.cpp | 10 ++++---- src/lcals/FIRST_SUM-Cuda.cpp | 6 +++-- src/lcals/FIRST_SUM-Hip.cpp | 6 +++-- src/lcals/GEN_LIN_RECUR-Cuda.cpp | 12 ++++++---- src/lcals/GEN_LIN_RECUR-Hip.cpp | 12 ++++++---- src/lcals/HYDRO_1D-Cuda.cpp | 6 +++-- src/lcals/HYDRO_1D-Hip.cpp | 6 +++-- src/lcals/HYDRO_2D-Cuda.cpp | 41 +++++++++++++++++++++----------- src/lcals/HYDRO_2D-Hip.cpp | 30 +++++++++++++++-------- src/lcals/INT_PREDICT-Cuda.cpp | 6 +++-- src/lcals/INT_PREDICT-Hip.cpp | 6 +++-- src/lcals/PLANCKIAN-Cuda.cpp | 6 +++-- src/lcals/PLANCKIAN-Hip.cpp | 6 +++-- src/lcals/TRIDIAG_ELIM-Cuda.cpp | 7 ++++-- src/lcals/TRIDIAG_ELIM-Hip.cpp | 6 +++-- 22 files changed, 140 insertions(+), 72 deletions(-) diff --git a/src/lcals/DIFF_PREDICT-Cuda.cpp b/src/lcals/DIFF_PREDICT-Cuda.cpp index 3d0c2751a..09a73204b 100644 --- a/src/lcals/DIFF_PREDICT-Cuda.cpp +++ b/src/lcals/DIFF_PREDICT-Cuda.cpp @@ -30,11 +30,13 @@ namespace lcals deallocCudaDeviceData(px); \ deallocCudaDeviceData(cx); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void diff_predict(Real_ptr px, Real_ptr cx, const Index_type offset, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + 
Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { DIFF_PREDICT_BODY; } @@ -58,7 +60,7 @@ void DIFF_PREDICT::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - diff_predict<<>>( px, cx, + diff_predict<<>>( px, cx, offset, iend ); cudaErrchk( cudaGetLastError() ); diff --git a/src/lcals/DIFF_PREDICT-Hip.cpp b/src/lcals/DIFF_PREDICT-Hip.cpp index 71f9057c4..6816c3b3d 100644 --- a/src/lcals/DIFF_PREDICT-Hip.cpp +++ b/src/lcals/DIFF_PREDICT-Hip.cpp @@ -30,11 +30,13 @@ namespace lcals deallocHipDeviceData(px); \ deallocHipDeviceData(cx); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void diff_predict(Real_ptr px, Real_ptr cx, const Index_type offset, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { DIFF_PREDICT_BODY; } @@ -58,7 +60,7 @@ void DIFF_PREDICT::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((diff_predict), dim3(grid_size), dim3(block_size), 0, 0, px, cx, + hipLaunchKernelGGL((diff_predict), dim3(grid_size), dim3(block_size), 0, 0, px, cx, offset, iend ); hipErrchk( hipGetLastError() ); diff --git a/src/lcals/EOS-Cuda.cpp b/src/lcals/EOS-Cuda.cpp index 3977a9ce6..036b2bac0 100644 --- a/src/lcals/EOS-Cuda.cpp +++ b/src/lcals/EOS-Cuda.cpp @@ -34,11 +34,13 @@ namespace lcals deallocCudaDeviceData(z); \ deallocCudaDeviceData(u); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void eos(Real_ptr x, Real_ptr y, Real_ptr z, Real_ptr u, Real_type q, Real_type r, Real_type t, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { EOS_BODY; } @@ -62,7 +64,7 @@ void EOS::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - eos<<>>( x, y, z, u, + eos<<>>( x, y, z, u, q, r, t, iend ); cudaErrchk( cudaGetLastError() ); diff --git a/src/lcals/EOS-Hip.cpp b/src/lcals/EOS-Hip.cpp index 9f10966a6..e747106a6 100644 --- a/src/lcals/EOS-Hip.cpp +++ b/src/lcals/EOS-Hip.cpp @@ -34,11 +34,13 @@ namespace lcals deallocHipDeviceData(z); \ deallocHipDeviceData(u); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void eos(Real_ptr x, Real_ptr y, Real_ptr z, Real_ptr u, Real_type q, Real_type r, Real_type t, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { EOS_BODY; } @@ -62,7 +64,7 @@ void EOS::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((eos), dim3(grid_size), dim3(block_size), 0, 0, x, y, z, u, + hipLaunchKernelGGL((eos), dim3(grid_size), dim3(block_size), 0, 0, x, y, z, u, q, r, t, iend ); hipErrchk( hipGetLastError() ); diff --git a/src/lcals/FIRST_DIFF-Cuda.cpp b/src/lcals/FIRST_DIFF-Cuda.cpp index 65bb99ecf..523be432a 100644 --- a/src/lcals/FIRST_DIFF-Cuda.cpp +++ b/src/lcals/FIRST_DIFF-Cuda.cpp @@ -30,10 +30,12 @@ namespace lcals deallocCudaDeviceData(x); \ deallocCudaDeviceData(y); +template < size_t block_size > +__launch_bounds__(block_size) __global__ 
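On the HIP side every hipLaunchKernelGGL call wraps the kernel name in an extra set of parentheses. hipLaunchKernelGGL is a macro, so once the kernel carries template arguments the comma inside the angle brackets would otherwise be parsed as a macro-argument separator; the parentheses keep the whole name as one argument. A small self-contained sketch with an illustrative kernel:

#include <hip/hip_runtime.h>

template < size_t block_size, typename T >
__launch_bounds__(block_size)
__global__ void scale(T* x, T a, long iend)
{
  long i = blockIdx.x * block_size + threadIdx.x;
  if (i < iend) { x[i] *= a; }
}

void launch_scale(double* x, double a, long iend)
{
  constexpr size_t block_size = 256;
  const size_t grid_size = (iend + block_size - 1) / block_size;
  // Without the parentheses the comma in <block_size, double> would split
  // the macro arguments.
  hipLaunchKernelGGL((scale<block_size, double>),
                     dim3(grid_size), dim3(block_size), 0, 0,
                     x, a, iend);
}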
void first_diff(Real_ptr x, Real_ptr y, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { FIRST_DIFF_BODY; } @@ -57,7 +59,7 @@ void FIRST_DIFF::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - first_diff<<>>( x, y, + first_diff<<>>( x, y, iend ); cudaErrchk( cudaGetLastError() ); diff --git a/src/lcals/FIRST_DIFF-Hip.cpp b/src/lcals/FIRST_DIFF-Hip.cpp index c581a6f9a..94896f23e 100644 --- a/src/lcals/FIRST_DIFF-Hip.cpp +++ b/src/lcals/FIRST_DIFF-Hip.cpp @@ -30,10 +30,12 @@ namespace lcals deallocHipDeviceData(x); \ deallocHipDeviceData(y); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void first_diff(Real_ptr x, Real_ptr y, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { FIRST_DIFF_BODY; } @@ -57,7 +59,7 @@ void FIRST_DIFF::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((first_diff), dim3(grid_size), dim3(block_size), 0, 0, x, y, + hipLaunchKernelGGL((first_diff), dim3(grid_size), dim3(block_size), 0, 0, x, y, iend ); hipErrchk( hipGetLastError() ); diff --git a/src/lcals/FIRST_MIN-Cuda.cpp b/src/lcals/FIRST_MIN-Cuda.cpp index 4ce151f51..373c4ae17 100644 --- a/src/lcals/FIRST_MIN-Cuda.cpp +++ b/src/lcals/FIRST_MIN-Cuda.cpp @@ -27,23 +27,25 @@ namespace lcals #define FIRST_MIN_DATA_TEARDOWN_CUDA \ deallocCudaDeviceData(x); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void first_min(Real_ptr x, MyMinLoc* dminloc, Index_type iend) { extern __shared__ MyMinLoc minloc[ ]; - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; minloc[ threadIdx.x ] = *dminloc; - for ( ; i < iend ; i += gridDim.x * blockDim.x ) { + for ( ; i < iend ; i += gridDim.x * block_size ) { MyMinLoc& mymin = minloc[ threadIdx.x ]; FIRST_MIN_BODY; } __syncthreads(); - for ( i = blockDim.x / 2; i > 0; i /= 2 ) { + for ( i = block_size / 2; i > 0; i /= 2 ) { if ( threadIdx.x < i ) { if ( minloc[ threadIdx.x + i].val < minloc[ threadIdx.x ].val ) { minloc[ threadIdx.x ] = minloc[ threadIdx.x + i]; @@ -84,7 +86,7 @@ void FIRST_MIN::runCudaVariantImpl(VariantID vid) cudaMemcpyHostToDevice ) ); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - first_min<<<<>>( x, dminloc, iend ); diff --git a/src/lcals/FIRST_MIN-Hip.cpp b/src/lcals/FIRST_MIN-Hip.cpp index 4d961ef9d..d2220ad39 100644 --- a/src/lcals/FIRST_MIN-Hip.cpp +++ b/src/lcals/FIRST_MIN-Hip.cpp @@ -27,23 +27,25 @@ namespace lcals #define FIRST_MIN_DATA_TEARDOWN_HIP \ deallocHipDeviceData(x); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void first_min(Real_ptr x, MyMinLoc* dminloc, Index_type iend) { extern __shared__ MyMinLoc minloc[ ]; - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; minloc[ threadIdx.x ] = *dminloc; - for ( ; i < iend ; i += gridDim.x * blockDim.x ) { + for ( ; i < iend ; i += gridDim.x * block_size ) { MyMinLoc& mymin = minloc[ threadIdx.x ]; FIRST_MIN_BODY; } __syncthreads(); - for ( i = blockDim.x / 2; i > 0; i /= 2 ) { + for ( i = block_size / 2; i > 0; i /= 2 ) { if ( threadIdx.x < i ) { if ( 
minloc[ threadIdx.x + i].val < minloc[ threadIdx.x ].val ) { minloc[ threadIdx.x ] = minloc[ threadIdx.x + i]; @@ -84,7 +86,7 @@ void FIRST_MIN::runHipVariantImpl(VariantID vid) hipMemcpyHostToDevice ) ); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL(first_min, grid_size, block_size, + hipLaunchKernelGGL((first_min), grid_size, block_size, sizeof(MyMinLoc)*block_size, 0, x, dminloc, iend ); diff --git a/src/lcals/FIRST_SUM-Cuda.cpp b/src/lcals/FIRST_SUM-Cuda.cpp index 730b93bbe..86839f3b7 100644 --- a/src/lcals/FIRST_SUM-Cuda.cpp +++ b/src/lcals/FIRST_SUM-Cuda.cpp @@ -30,10 +30,12 @@ namespace lcals deallocCudaDeviceData(x); \ deallocCudaDeviceData(y); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void first_sum(Real_ptr x, Real_ptr y, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i > 0 && i < iend) { FIRST_SUM_BODY; } @@ -57,7 +59,7 @@ void FIRST_SUM::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - first_sum<<>>( x, y, + first_sum<<>>( x, y, iend ); cudaErrchk( cudaGetLastError() ); diff --git a/src/lcals/FIRST_SUM-Hip.cpp b/src/lcals/FIRST_SUM-Hip.cpp index fc409acb2..7709d0d50 100644 --- a/src/lcals/FIRST_SUM-Hip.cpp +++ b/src/lcals/FIRST_SUM-Hip.cpp @@ -30,10 +30,12 @@ namespace lcals deallocHipDeviceData(x); \ deallocHipDeviceData(y); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void first_sum(Real_ptr x, Real_ptr y, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i > 0 && i < iend) { FIRST_SUM_BODY; } @@ -57,7 +59,7 @@ void FIRST_SUM::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL(first_sum,grid_size, block_size, 0, 0, x, y, + hipLaunchKernelGGL((first_sum),grid_size, block_size, 0, 0, x, y, iend ); hipErrchk( hipGetLastError() ); diff --git a/src/lcals/GEN_LIN_RECUR-Cuda.cpp b/src/lcals/GEN_LIN_RECUR-Cuda.cpp index cd30ce76a..79fb4bc1e 100644 --- a/src/lcals/GEN_LIN_RECUR-Cuda.cpp +++ b/src/lcals/GEN_LIN_RECUR-Cuda.cpp @@ -34,23 +34,27 @@ namespace lcals deallocCudaDeviceData(sa); \ deallocCudaDeviceData(sb); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void genlinrecur1(Real_ptr b5, Real_ptr stb5, Real_ptr sa, Real_ptr sb, Index_type kb5i, Index_type N) { - Index_type k = blockIdx.x * blockDim.x + threadIdx.x; + Index_type k = blockIdx.x * block_size + threadIdx.x; if (k < N) { GEN_LIN_RECUR_BODY1; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void genlinrecur2(Real_ptr b5, Real_ptr stb5, Real_ptr sa, Real_ptr sb, Index_type kb5i, Index_type N) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i > 0 && i < N+1) { GEN_LIN_RECUR_BODY2; } @@ -72,13 +76,13 @@ void GEN_LIN_RECUR::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size1 = RAJA_DIVIDE_CEILING_INT(N, block_size); - genlinrecur1<<>>( b5, stb5, sa, sb, + genlinrecur1<<>>( b5, stb5, sa, sb, kb5i, N ); cudaErrchk( cudaGetLastError() ); const size_t grid_size2 = RAJA_DIVIDE_CEILING_INT(N+1, block_size); - genlinrecur2<<>>( b5, stb5, sa, 
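The FIRST_MIN kernels above combine a grid-stride loop with a shared-memory tree reduction, and templating on block_size lets the shared array and the reduction bounds use the same compile-time constant. The following is a minimal sketch of that shape rather than the suite's kernel: the MinLoc type, the seeding of the running minimum, and the per-block output array are illustrative, and block_size is assumed to be a power of two.

struct MinLoc { double val; long loc; };

template < size_t block_size >
__launch_bounds__(block_size)
__global__ void block_first_min(const double* x, MinLoc* blockmin, long iend)
{
  __shared__ MinLoc minloc[block_size];          // one slot per thread

  long i = blockIdx.x * block_size + threadIdx.x;
  minloc[threadIdx.x] = blockmin[blockIdx.x];    // seed with the running minimum

  for ( ; i < iend; i += gridDim.x * block_size) {   // grid-stride loop
    if (x[i] < minloc[threadIdx.x].val) {
      minloc[threadIdx.x].val = x[i];
      minloc[threadIdx.x].loc = i;
    }
  }
  __syncthreads();

  // Tree reduction in shared memory; assumes block_size is a power of two.
  for (unsigned s = block_size / 2; s > 0; s /= 2) {
    if (threadIdx.x < s && minloc[threadIdx.x + s].val < minloc[threadIdx.x].val) {
      minloc[threadIdx.x] = minloc[threadIdx.x + s];
    }
    __syncthreads();
  }

  if (threadIdx.x == 0) {
    blockmin[blockIdx.x] = minloc[0];            // per-block result, combined on the host
  }
}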
sb, + genlinrecur2<<>>( b5, stb5, sa, sb, kb5i, N ); cudaErrchk( cudaGetLastError() ); diff --git a/src/lcals/GEN_LIN_RECUR-Hip.cpp b/src/lcals/GEN_LIN_RECUR-Hip.cpp index c6752c21b..35c9d902b 100644 --- a/src/lcals/GEN_LIN_RECUR-Hip.cpp +++ b/src/lcals/GEN_LIN_RECUR-Hip.cpp @@ -34,23 +34,27 @@ namespace lcals deallocHipDeviceData(sa); \ deallocHipDeviceData(sb); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void genlinrecur1(Real_ptr b5, Real_ptr stb5, Real_ptr sa, Real_ptr sb, Index_type kb5i, Index_type N) { - Index_type k = blockIdx.x * blockDim.x + threadIdx.x; + Index_type k = blockIdx.x * block_size + threadIdx.x; if (k < N) { GEN_LIN_RECUR_BODY1; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void genlinrecur2(Real_ptr b5, Real_ptr stb5, Real_ptr sa, Real_ptr sb, Index_type kb5i, Index_type N) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i > 0 && i < N+1) { GEN_LIN_RECUR_BODY2; } @@ -72,14 +76,14 @@ void GEN_LIN_RECUR::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size1 = RAJA_DIVIDE_CEILING_INT(N, block_size); - hipLaunchKernelGGL(genlinrecur1, grid_size1, block_size, 0, 0, + hipLaunchKernelGGL((genlinrecur1), grid_size1, block_size, 0, 0, b5, stb5, sa, sb, kb5i, N ); hipErrchk( hipGetLastError() ); const size_t grid_size2 = RAJA_DIVIDE_CEILING_INT(N+1, block_size); - hipLaunchKernelGGL(genlinrecur2, grid_size2, block_size, 0, 0, + hipLaunchKernelGGL((genlinrecur2), grid_size2, block_size, 0, 0, b5, stb5, sa, sb, kb5i, N ); diff --git a/src/lcals/HYDRO_1D-Cuda.cpp b/src/lcals/HYDRO_1D-Cuda.cpp index daffdd3b6..60085fdcb 100644 --- a/src/lcals/HYDRO_1D-Cuda.cpp +++ b/src/lcals/HYDRO_1D-Cuda.cpp @@ -32,11 +32,13 @@ namespace lcals deallocCudaDeviceData(y); \ deallocCudaDeviceData(z); \ +template < size_t block_size > +__launch_bounds__(block_size) __global__ void hydro_1d(Real_ptr x, Real_ptr y, Real_ptr z, Real_type q, Real_type r, Real_type t, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { HYDRO_1D_BODY; } @@ -60,7 +62,7 @@ void HYDRO_1D::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hydro_1d<<>>( x, y, z, + hydro_1d<<>>( x, y, z, q, r, t, iend ); cudaErrchk( cudaGetLastError() ); diff --git a/src/lcals/HYDRO_1D-Hip.cpp b/src/lcals/HYDRO_1D-Hip.cpp index 7ea63c102..d11153d14 100644 --- a/src/lcals/HYDRO_1D-Hip.cpp +++ b/src/lcals/HYDRO_1D-Hip.cpp @@ -32,11 +32,13 @@ namespace lcals deallocHipDeviceData(y); \ deallocHipDeviceData(z); \ +template < size_t block_size > +__launch_bounds__(block_size) __global__ void hydro_1d(Real_ptr x, Real_ptr y, Real_ptr z, Real_type q, Real_type r, Real_type t, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { HYDRO_1D_BODY; } @@ -60,7 +62,7 @@ void HYDRO_1D::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((hydro_1d), dim3(grid_size), dim3(block_size), 0, 0, x, y, z, + hipLaunchKernelGGL((hydro_1d), dim3(grid_size), dim3(block_size), 0, 0, x, y, z, q, r, t, iend ); hipErrchk( hipGetLastError() ); diff --git 
a/src/lcals/HYDRO_2D-Cuda.cpp b/src/lcals/HYDRO_2D-Cuda.cpp index de954ae3e..fc2566aaf 100644 --- a/src/lcals/HYDRO_2D-Cuda.cpp +++ b/src/lcals/HYDRO_2D-Cuda.cpp @@ -27,8 +27,12 @@ namespace lcals #define j_block_sz (32) #define k_block_sz (block_size / j_block_sz) +#define HYDRO_2D_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \ + j_block_sz, k_block_sz + #define HYDRO_2D_THREADS_PER_BLOCK_CUDA \ - dim3 nthreads_per_block(j_block_sz, k_block_sz, 1); + dim3 nthreads_per_block(HYDRO_2D_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA, 1); \ + static_assert(j_block_sz*k_block_sz == block_size, "Invalid block_size"); #define HYDRO_2D_NBLOCKS_CUDA \ dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(jn-2, j_block_sz)), \ @@ -66,41 +70,47 @@ namespace lcals deallocCudaDeviceData(zroutdat); \ deallocCudaDeviceData(zzoutdat); +template < size_t j_block_size, size_t k_block_size > +__launch_bounds__(j_block_size*k_block_size) __global__ void hydro_2d1(Real_ptr zadat, Real_ptr zbdat, Real_ptr zpdat, Real_ptr zqdat, Real_ptr zrdat, Real_ptr zmdat, Index_type jn, Index_type kn) { - Index_type k = 1 + blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type k = 1 + blockIdx.y * k_block_size + threadIdx.y; + Index_type j = 1 + blockIdx.x * j_block_size + threadIdx.x; if (k < kn-1 && j < jn-1) { HYDRO_2D_BODY1; } } +template < size_t j_block_size, size_t k_block_size > +__launch_bounds__(j_block_size*k_block_size) __global__ void hydro_2d2(Real_ptr zudat, Real_ptr zvdat, Real_ptr zadat, Real_ptr zbdat, Real_ptr zzdat, Real_ptr zrdat, Real_type s, Index_type jn, Index_type kn) { - Index_type k = 1 + blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type k = 1 + blockIdx.y * k_block_size + threadIdx.y; + Index_type j = 1 + blockIdx.x * j_block_size + threadIdx.x; if (k < kn-1 && j < jn-1) { HYDRO_2D_BODY2; } } +template < size_t j_block_size, size_t k_block_size > +__launch_bounds__(j_block_size*k_block_size) __global__ void hydro_2d3(Real_ptr zroutdat, Real_ptr zzoutdat, Real_ptr zrdat, Real_ptr zudat, Real_ptr zzdat, Real_ptr zvdat, Real_type t, Index_type jn, Index_type kn) { - Index_type k = 1 + blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type k = 1 + blockIdx.y * k_block_size + threadIdx.y; + Index_type j = 1 + blockIdx.x * j_block_size + threadIdx.x; if (k < kn-1 && j < jn-1) { HYDRO_2D_BODY3; @@ -128,19 +138,22 @@ void HYDRO_2D::runCudaVariantImpl(VariantID vid) HYDRO_2D_THREADS_PER_BLOCK_CUDA; HYDRO_2D_NBLOCKS_CUDA; - - hydro_2d1<<>>(zadat, zbdat, + + hydro_2d1 + <<>>(zadat, zbdat, zpdat, zqdat, zrdat, zmdat, jn, kn); cudaErrchk( cudaGetLastError() ); - hydro_2d2<<>>(zudat, zvdat, + hydro_2d2 + <<>>(zudat, zvdat, zadat, zbdat, zzdat, zrdat, s, jn, kn); cudaErrchk( cudaGetLastError() ); - hydro_2d3<<>>(zroutdat, zzoutdat, + hydro_2d3 + <<>>(zroutdat, zzoutdat, zrdat, zudat, zzdat, zvdat, t, jn, kn); @@ -160,9 +173,9 @@ void HYDRO_2D::runCudaVariantImpl(VariantID vid) using EXECPOL = RAJA::KernelPolicy< RAJA::statement::CudaKernelFixedAsync, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_y_direct, - RAJA::statement::Tile<1, RAJA::tile_fixed, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_x_direct, RAJA::statement::For<0, RAJA::cuda_thread_y_direct, // k RAJA::statement::For<1, RAJA::cuda_thread_x_direct, // j @@ -173,7 +186,7 @@ void HYDRO_2D::runCudaVariantImpl(VariantID vid) > > >; - + startTimer(); for 
(RepIndex_type irep = 0; irep < run_reps; ++irep) { diff --git a/src/lcals/HYDRO_2D-Hip.cpp b/src/lcals/HYDRO_2D-Hip.cpp index 764be9e05..209107a5b 100644 --- a/src/lcals/HYDRO_2D-Hip.cpp +++ b/src/lcals/HYDRO_2D-Hip.cpp @@ -27,8 +27,12 @@ namespace lcals #define j_block_sz (32) #define k_block_sz (block_size / j_block_sz) +#define HYDRO_2D_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \ + j_block_sz, k_block_sz + #define HYDRO_2D_THREADS_PER_BLOCK_HIP \ - dim3 nthreads_per_block(j_block_sz, k_block_sz, 1); + dim3 nthreads_per_block(HYDRO_2D_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP, 1); \ + static_assert(j_block_sz*k_block_sz == block_size, "Invalid block_size"); #define HYDRO_2D_NBLOCKS_HIP \ dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(jn-2, j_block_sz)), \ @@ -65,41 +69,47 @@ namespace lcals deallocHipDeviceData(zroutdat); \ deallocHipDeviceData(zzoutdat); +template < size_t j_block_size, size_t k_block_size > +__launch_bounds__(j_block_size*k_block_size) __global__ void hydro_2d1(Real_ptr zadat, Real_ptr zbdat, Real_ptr zpdat, Real_ptr zqdat, Real_ptr zrdat, Real_ptr zmdat, Index_type jn, Index_type kn) { - Index_type k = 1 + blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type k = 1 + blockIdx.y * k_block_size + threadIdx.y; + Index_type j = 1 + blockIdx.x * j_block_size + threadIdx.x; if (k < kn-1 && j < jn-1) { HYDRO_2D_BODY1; } } +template < size_t j_block_size, size_t k_block_size > +__launch_bounds__(j_block_size*k_block_size) __global__ void hydro_2d2(Real_ptr zudat, Real_ptr zvdat, Real_ptr zadat, Real_ptr zbdat, Real_ptr zzdat, Real_ptr zrdat, Real_type s, Index_type jn, Index_type kn) { - Index_type k = 1 + blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type k = 1 + blockIdx.y * k_block_size + threadIdx.y; + Index_type j = 1 + blockIdx.x * j_block_size + threadIdx.x; if (k < kn-1 && j < jn-1) { HYDRO_2D_BODY2; } } +template < size_t j_block_size, size_t k_block_size > +__launch_bounds__(j_block_size*k_block_size) __global__ void hydro_2d3(Real_ptr zroutdat, Real_ptr zzoutdat, Real_ptr zrdat, Real_ptr zudat, Real_ptr zzdat, Real_ptr zvdat, Real_type t, Index_type jn, Index_type kn) { - Index_type k = 1 + blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type k = 1 + blockIdx.y * k_block_size + threadIdx.y; + Index_type j = 1 + blockIdx.x * j_block_size + threadIdx.x; if (k < kn-1 && j < jn-1) { HYDRO_2D_BODY3; @@ -128,14 +138,14 @@ void HYDRO_2D::runHipVariantImpl(VariantID vid) HYDRO_2D_THREADS_PER_BLOCK_HIP; HYDRO_2D_NBLOCKS_HIP; - hipLaunchKernelGGL((hydro_2d1), + hipLaunchKernelGGL((hydro_2d1), dim3(nblocks), dim3(nthreads_per_block), 0, 0, zadat, zbdat, zpdat, zqdat, zrdat, zmdat, jn, kn); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((hydro_2d2), + hipLaunchKernelGGL((hydro_2d2), dim3(nblocks), dim3(nthreads_per_block), 0, 0, zudat, zvdat, zadat, zbdat, zzdat, zrdat, @@ -143,7 +153,7 @@ void HYDRO_2D::runHipVariantImpl(VariantID vid) jn, kn); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((hydro_2d3), + hipLaunchKernelGGL((hydro_2d3), dim3(nblocks), dim3(nthreads_per_block), 0, 0, zroutdat, zzoutdat, zrdat, zudat, zzdat, zvdat, diff --git a/src/lcals/INT_PREDICT-Cuda.cpp b/src/lcals/INT_PREDICT-Cuda.cpp index a3f18e722..b0123fa41 100644 --- a/src/lcals/INT_PREDICT-Cuda.cpp +++ b/src/lcals/INT_PREDICT-Cuda.cpp @@ -28,6 +28,8 @@ namespace lcals getCudaDeviceData(m_px, px, m_array_length); \ 
deallocCudaDeviceData(px); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void int_predict(Real_ptr px, Real_type dm22, Real_type dm23, Real_type dm24, Real_type dm25, Real_type dm26, Real_type dm27, @@ -35,7 +37,7 @@ __global__ void int_predict(Real_ptr px, const Index_type offset, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { INT_PREDICT_BODY; } @@ -59,7 +61,7 @@ void INT_PREDICT::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - int_predict<<>>( px, + int_predict<<>>( px, dm22, dm23, dm24, dm25, dm26, dm27, dm28, c0, offset, diff --git a/src/lcals/INT_PREDICT-Hip.cpp b/src/lcals/INT_PREDICT-Hip.cpp index 0775ce672..a9fd3ba85 100644 --- a/src/lcals/INT_PREDICT-Hip.cpp +++ b/src/lcals/INT_PREDICT-Hip.cpp @@ -28,6 +28,8 @@ namespace lcals getHipDeviceData(m_px, px, m_array_length); \ deallocHipDeviceData(px); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void int_predict(Real_ptr px, Real_type dm22, Real_type dm23, Real_type dm24, Real_type dm25, Real_type dm26, Real_type dm27, @@ -35,7 +37,7 @@ __global__ void int_predict(Real_ptr px, const Index_type offset, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { INT_PREDICT_BODY; } @@ -59,7 +61,7 @@ void INT_PREDICT::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((int_predict), dim3(grid_size), dim3(block_size), 0, 0, px, + hipLaunchKernelGGL((int_predict), dim3(grid_size), dim3(block_size), 0, 0, px, dm22, dm23, dm24, dm25, dm26, dm27, dm28, c0, offset, diff --git a/src/lcals/PLANCKIAN-Cuda.cpp b/src/lcals/PLANCKIAN-Cuda.cpp index d9817220a..f6d866d8b 100644 --- a/src/lcals/PLANCKIAN-Cuda.cpp +++ b/src/lcals/PLANCKIAN-Cuda.cpp @@ -37,11 +37,13 @@ namespace lcals deallocCudaDeviceData(v); \ deallocCudaDeviceData(w); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void planckian(Real_ptr x, Real_ptr y, Real_ptr u, Real_ptr v, Real_ptr w, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { PLANCKIAN_BODY; } @@ -65,7 +67,7 @@ void PLANCKIAN::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - planckian<<>>( x, y, + planckian<<>>( x, y, u, v, w, iend ); cudaErrchk( cudaGetLastError() ); diff --git a/src/lcals/PLANCKIAN-Hip.cpp b/src/lcals/PLANCKIAN-Hip.cpp index dc5898bd9..41d3a7cd4 100644 --- a/src/lcals/PLANCKIAN-Hip.cpp +++ b/src/lcals/PLANCKIAN-Hip.cpp @@ -37,11 +37,13 @@ namespace lcals deallocHipDeviceData(v); \ deallocHipDeviceData(w); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void planckian(Real_ptr x, Real_ptr y, Real_ptr u, Real_ptr v, Real_ptr w, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { PLANCKIAN_BODY; } @@ -65,7 +67,7 @@ void PLANCKIAN::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); 
- hipLaunchKernelGGL((planckian), dim3(grid_size), dim3(block_size), 0, 0, x, y, + hipLaunchKernelGGL((planckian), dim3(grid_size), dim3(block_size), 0, 0, x, y, u, v, w, iend ); hipErrchk( hipGetLastError() ); diff --git a/src/lcals/TRIDIAG_ELIM-Cuda.cpp b/src/lcals/TRIDIAG_ELIM-Cuda.cpp index 7760b489b..a97b40183 100644 --- a/src/lcals/TRIDIAG_ELIM-Cuda.cpp +++ b/src/lcals/TRIDIAG_ELIM-Cuda.cpp @@ -34,10 +34,12 @@ namespace lcals deallocCudaDeviceData(y); \ deallocCudaDeviceData(z); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void eos(Real_ptr xout, Real_ptr xin, Real_ptr y, Real_ptr z, Index_type N) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i > 0 && i < N) { TRIDIAG_ELIM_BODY; } @@ -61,7 +63,8 @@ void TRIDIAG_ELIM::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - eos<<>>( xout, xin, y, z, + eos + <<>>( xout, xin, y, z, iend ); cudaErrchk( cudaGetLastError() ); diff --git a/src/lcals/TRIDIAG_ELIM-Hip.cpp b/src/lcals/TRIDIAG_ELIM-Hip.cpp index b21797516..73e3329b2 100644 --- a/src/lcals/TRIDIAG_ELIM-Hip.cpp +++ b/src/lcals/TRIDIAG_ELIM-Hip.cpp @@ -34,10 +34,12 @@ namespace lcals deallocHipDeviceData(y); \ deallocHipDeviceData(z); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void eos(Real_ptr xout, Real_ptr xin, Real_ptr y, Real_ptr z, Index_type N) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i > 0 && i < N) { TRIDIAG_ELIM_BODY; } @@ -61,7 +63,7 @@ void TRIDIAG_ELIM::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL(eos, grid_size, block_size, 0, 0, xout, xin, y, z, + hipLaunchKernelGGL((eos), grid_size, block_size, 0, 0, xout, xin, y, z, iend ); hipErrchk( hipGetLastError() ); From 083d253e3bd8807bbb45dee273ed097a6fb870de Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sat, 23 Oct 2021 15:46:25 -0700 Subject: [PATCH 148/392] Support block_size in polybench kernels --- src/polybench/POLYBENCH_2MM-Cuda.cpp | 19 +++++++--- src/polybench/POLYBENCH_2MM-Hip.cpp | 25 +++++++++---- src/polybench/POLYBENCH_2MM.cpp | 16 ++++++-- src/polybench/POLYBENCH_2MM.hpp | 9 +++++ src/polybench/POLYBENCH_3MM-Cuda.cpp | 17 +++++++-- src/polybench/POLYBENCH_3MM-Hip.cpp | 27 +++++++++----- src/polybench/POLYBENCH_3MM.cpp | 18 +++++++-- src/polybench/POLYBENCH_3MM.hpp | 9 +++++ src/polybench/POLYBENCH_ADI-Cuda.cpp | 17 ++++++--- src/polybench/POLYBENCH_ADI-Hip.cpp | 21 +++++++---- src/polybench/POLYBENCH_ADI.cpp | 14 ++++++- src/polybench/POLYBENCH_ADI.hpp | 9 +++++ src/polybench/POLYBENCH_ATAX-Cuda.cpp | 16 +++++--- src/polybench/POLYBENCH_ATAX-Hip.cpp | 20 ++++++---- src/polybench/POLYBENCH_ATAX.cpp | 14 ++++++- src/polybench/POLYBENCH_ATAX.hpp | 9 +++++ src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp | 23 ++++++++---- src/polybench/POLYBENCH_FDTD_2D-Hip.cpp | 37 +++++++++++-------- src/polybench/POLYBENCH_FDTD_2D.cpp | 14 ++++++- src/polybench/POLYBENCH_FDTD_2D.hpp | 9 +++++ .../POLYBENCH_FLOYD_WARSHALL-Cuda.cpp | 23 ++++++++---- .../POLYBENCH_FLOYD_WARSHALL-Hip.cpp | 21 ++++++++--- src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp | 12 +++++- src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp | 9 +++++ src/polybench/POLYBENCH_GEMM-Cuda.cpp | 19 +++++++--- 
src/polybench/POLYBENCH_GEMM-Hip.cpp | 25 +++++++++---- src/polybench/POLYBENCH_GEMM.cpp | 12 +++++- src/polybench/POLYBENCH_GEMM.hpp | 9 +++++ src/polybench/POLYBENCH_GEMVER-Cuda.cpp | 21 +++++++---- src/polybench/POLYBENCH_GEMVER-Hip.cpp | 31 ++++++++++------ src/polybench/POLYBENCH_GEMVER.cpp | 10 +++++ src/polybench/POLYBENCH_GEMVER.hpp | 13 ++++++- src/polybench/POLYBENCH_GESUMMV-Cuda.cpp | 16 +++++--- src/polybench/POLYBENCH_GESUMMV-Hip.cpp | 18 +++++---- src/polybench/POLYBENCH_GESUMMV.cpp | 12 +++++- src/polybench/POLYBENCH_GESUMMV.hpp | 9 +++++ src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp | 16 +++++--- src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp | 16 +++++--- src/polybench/POLYBENCH_JACOBI_1D.cpp | 24 ++++++++---- src/polybench/POLYBENCH_JACOBI_1D.hpp | 9 +++++ src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp | 19 +++++++--- src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp | 21 ++++++++--- src/polybench/POLYBENCH_JACOBI_2D.cpp | 10 +++++ src/polybench/POLYBENCH_JACOBI_2D.hpp | 9 +++++ src/polybench/POLYBENCH_MVT-Cuda.cpp | 16 +++++--- src/polybench/POLYBENCH_MVT-Hip.cpp | 16 +++++--- src/polybench/POLYBENCH_MVT.cpp | 10 +++++ src/polybench/POLYBENCH_MVT.hpp | 9 +++++ 48 files changed, 583 insertions(+), 195 deletions(-) diff --git a/src/polybench/POLYBENCH_2MM-Cuda.cpp b/src/polybench/POLYBENCH_2MM-Cuda.cpp index f6165d74c..1186cdb8f 100644 --- a/src/polybench/POLYBENCH_2MM-Cuda.cpp +++ b/src/polybench/POLYBENCH_2MM-Cuda.cpp @@ -22,10 +22,10 @@ namespace polybench { // -// Define thread block size for CUDA execution +// Define thread block shape for CUDA execution // -constexpr size_t out_block_sz = 8; -constexpr size_t in_block_sz = 32; +#define in_block_sz (32) +#define out_block_sz (block_size / in_block_sz) #define POLY_2MM_THREADS_PER_BLOCK_CUDA \ dim3 nthreads_per_block(in_block_sz, out_block_sz, 1); @@ -82,7 +82,7 @@ __global__ void poly_2mm_1_lam(Index_type ni, Index_type nj, Index_type j = blockIdx.x * blockDim.x + threadIdx.x; if ( i < ni && j < nj ) { - body(i, j); + body(i, j); } } @@ -115,7 +115,8 @@ __global__ void poly_2mm_2_lam(Index_type ni, Index_type nl, } -void POLYBENCH_2MM::runCudaVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_2MM::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -259,7 +260,15 @@ void POLYBENCH_2MM::runCudaVariant(VariantID vid) } else { std::cout << "\n POLYBENCH_2MM : Unknown Cuda variant id = " << vid << std::endl; } +} +void POLYBENCH_2MM::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n POLYBENCH_2MM : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_2MM-Hip.cpp b/src/polybench/POLYBENCH_2MM-Hip.cpp index c3c9869b4..10b3eee61 100644 --- a/src/polybench/POLYBENCH_2MM-Hip.cpp +++ b/src/polybench/POLYBENCH_2MM-Hip.cpp @@ -22,10 +22,10 @@ namespace polybench { // -// Define thread block size for Hip execution +// Define thread block shape for Hip execution // -constexpr size_t out_block_sz = 8; -constexpr size_t in_block_sz = 32; +#define in_block_sz (32) +#define out_block_sz (block_size / in_block_sz) #define POLY_2MM_THREADS_PER_BLOCK_HIP \ dim3 nthreads_per_block(in_block_sz, out_block_sz, 1); @@ -114,7 +114,8 @@ __global__ void poly_2mm_2_lam(Index_type ni, Index_type nl, } -void POLYBENCH_2MM::runHipVariant(VariantID vid) +template < size_t block_size > +void 
POLYBENCH_2MM::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -130,14 +131,14 @@ void POLYBENCH_2MM::runHipVariant(VariantID vid) POLY_2MM_THREADS_PER_BLOCK_HIP; POLY_2MM_1_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_2mm_1), + hipLaunchKernelGGL((poly_2mm_1), dim3(nblocks1), dim3(nthreads_per_block), 0, 0, tmp, A, B, alpha, ni, nj, nk); hipErrchk( hipGetLastError() ); POLY_2MM_2_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_2mm_2), + hipLaunchKernelGGL((poly_2mm_2), dim3(nblocks2), dim3(nthreads_per_block), 0, 0, tmp, C, D, beta, ni, nl, nj); @@ -165,12 +166,12 @@ void POLYBENCH_2MM::runHipVariant(VariantID vid) POLYBENCH_2MM_BODY3; }; - POLY_2MM_1_NBLOCKS_HIP; + POLY_2MM_1_NBLOCKS_HIP; hipLaunchKernelGGL((poly_2mm_1_lam), dim3(nblocks1), dim3(nthreads_per_block), 0, 0, ni, nj, poly_2mm_1_lambda); hipErrchk( hipGetLastError() ); - + auto poly_2mm_2_lambda = [=] __device__ (Index_type i, Index_type l) { POLYBENCH_2MM_BODY4; for (Index_type j=0; j < nj; ++j) { @@ -266,7 +267,15 @@ void POLYBENCH_2MM::runHipVariant(VariantID vid) } else { std::cout << "\n POLYBENCH_2MM : Unknown Hip variant id = " << vid << std::endl; } +} +void POLYBENCH_2MM::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n POLYBENCH_2MM : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_2MM.cpp b/src/polybench/POLYBENCH_2MM.cpp index 7e2083c50..367de1871 100644 --- a/src/polybench/POLYBENCH_2MM.cpp +++ b/src/polybench/POLYBENCH_2MM.cpp @@ -22,12 +22,16 @@ namespace polybench POLYBENCH_2MM::POLYBENCH_2MM(const RunParams& params) : KernelBase(rajaperf::Polybench_2MM, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + Index_type ni_default = 1000; Index_type nj_default = 1000; Index_type nk_default = 1120; Index_type nl_default = 1000; - setDefaultProblemSize( std::max( ni_default*nj_default, + setDefaultProblemSize( std::max( ni_default*nj_default, ni_default*nl_default ) ); setDefaultReps(2); @@ -54,10 +58,10 @@ POLYBENCH_2MM::POLYBENCH_2MM(const RunParams& params) setFLOPsPerRep(3 * m_ni*m_nj*m_nk + 2 * m_ni*m_nj*m_nl ); - checksum_scale_factor = 0.000001 * + checksum_scale_factor = 0.000001 * ( static_cast(getDefaultProblemSize()) / getActualProblemSize() ); - + setUsesFeature(Kernel); setVariantDefined( Base_Seq ); @@ -109,5 +113,11 @@ void POLYBENCH_2MM::tearDown(VariantID vid) deallocData(m_D); } +bool POLYBENCH_2MM::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_2MM.hpp b/src/polybench/POLYBENCH_2MM.hpp index 897eb13a3..654a3e306 100644 --- a/src/polybench/POLYBENCH_2MM.hpp +++ b/src/polybench/POLYBENCH_2MM.hpp @@ -128,7 +128,16 @@ class POLYBENCH_2MM : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Index_type m_ni; Index_type m_nj; Index_type m_nk; diff --git a/src/polybench/POLYBENCH_3MM-Cuda.cpp b/src/polybench/POLYBENCH_3MM-Cuda.cpp index 956efb427..e9df41d90 100644 --- a/src/polybench/POLYBENCH_3MM-Cuda.cpp +++ b/src/polybench/POLYBENCH_3MM-Cuda.cpp @@ -22,10 +22,10 @@ namespace polybench { // -// Define thread block size for CUDA execution +// Define thread block shape for CUDA execution // -constexpr size_t out_block_sz = 8; -constexpr size_t in_block_sz = 32; +#define in_block_sz (32) +#define out_block_sz (block_size / in_block_sz) #define POLY_3MM_THREADS_PER_BLOCK_CUDA \ dim3 nthreads_per_block(in_block_sz, out_block_sz, 1); @@ -149,7 +149,8 @@ __global__ void poly_3mm_3_lam(Index_type ni, Index_type nl, -void POLYBENCH_3MM::runCudaVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_3MM::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -332,7 +333,15 @@ void POLYBENCH_3MM::runCudaVariant(VariantID vid) } else { std::cout << "\n POLYBENCH_3MM : Unknown Cuda variant id = " << vid << std::endl; } +} +void POLYBENCH_3MM::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n POLYBENCH_3MM : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_3MM-Hip.cpp b/src/polybench/POLYBENCH_3MM-Hip.cpp index 51e8ac53f..db3df8404 100644 --- a/src/polybench/POLYBENCH_3MM-Hip.cpp +++ b/src/polybench/POLYBENCH_3MM-Hip.cpp @@ -22,10 +22,10 @@ namespace polybench { // -// Define thread block size for Hip execution +// Define thread block shape for Hip execution // -constexpr size_t out_block_sz = 8; -constexpr size_t in_block_sz = 32; +#define in_block_sz (32) +#define out_block_sz (block_size / in_block_sz) #define 
POLY_3MM_THREADS_PER_BLOCK_HIP \ dim3 nthreads_per_block(in_block_sz, out_block_sz, 1); @@ -148,7 +148,8 @@ __global__ void poly_3mm_3_lam(Index_type ni, Index_type nl, } -void POLYBENCH_3MM::runHipVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_3MM::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -163,22 +164,22 @@ void POLYBENCH_3MM::runHipVariant(VariantID vid) POLY_3MM_THREADS_PER_BLOCK_HIP; - POLY_3MM_1_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_3mm_1), + POLY_3MM_1_NBLOCKS_HIP; + hipLaunchKernelGGL((poly_3mm_1), dim3(nblocks1) , dim3(nthreads_per_block), 0, 0, E, A, B, ni, nj, nk); hipErrchk( hipGetLastError() ); POLY_3MM_2_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_3mm_2), + hipLaunchKernelGGL((poly_3mm_2), dim3(nblocks2), dim3(nthreads_per_block), 0, 0, F, C, D, nj, nl, nm); hipErrchk( hipGetLastError() ); POLY_3MM_3_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_3mm_3), + hipLaunchKernelGGL((poly_3mm_3), dim3(nblocks3), dim3(nthreads_per_block), 0, 0, G, E, F, ni, nl, nj); @@ -270,7 +271,7 @@ void POLYBENCH_3MM::runHipVariant(VariantID vid) > > > - >; + >; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -343,7 +344,15 @@ void POLYBENCH_3MM::runHipVariant(VariantID vid) } else { std::cout << "\n POLYBENCH_3MM : Unknown Hip variant id = " << vid << std::endl; } +} +void POLYBENCH_3MM::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n POLYBENCH_3MM : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_3MM.cpp b/src/polybench/POLYBENCH_3MM.cpp index 2c06a72ac..10bf9e6fb 100644 --- a/src/polybench/POLYBENCH_3MM.cpp +++ b/src/polybench/POLYBENCH_3MM.cpp @@ -23,14 +23,18 @@ namespace polybench POLYBENCH_3MM::POLYBENCH_3MM(const RunParams& params) : KernelBase(rajaperf::Polybench_3MM, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + Index_type ni_default = 1000; Index_type nj_default = 1000; Index_type nk_default = 1010; Index_type nl_default = 1000; Index_type nm_default = 1200; - setDefaultProblemSize( std::max( std::max( ni_default*nj_default, - nj_default*nl_default ), + setDefaultProblemSize( std::max( std::max( ni_default*nj_default, + nj_default*nl_default ), ni_default*nl_default ) ); setDefaultProblemSize( ni_default * nj_default ); setDefaultReps(2); @@ -42,7 +46,7 @@ POLYBENCH_3MM::POLYBENCH_3MM(const RunParams& params) m_nm = nm_default; - setActualProblemSize( std::max( std::max( m_ni*m_nj, m_nj*m_nl ), + setActualProblemSize( std::max( std::max( m_ni*m_nj, m_nj*m_nl ), m_ni*m_nl ) ); setItsPerRep( m_ni*m_nj + m_nj*m_nl + m_ni*m_nl ); @@ -62,7 +66,7 @@ POLYBENCH_3MM::POLYBENCH_3MM(const RunParams& params) 2 * m_nj*m_nl*m_nm + 2 * m_ni*m_nj*m_nl ); - checksum_scale_factor = 0.000000001 * + checksum_scale_factor = 0.000000001 * ( static_cast(getDefaultProblemSize()) / getActualProblemSize() ); @@ -121,5 +125,11 @@ void POLYBENCH_3MM::tearDown(VariantID vid) deallocData(m_G); } +bool POLYBENCH_3MM::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_3MM.hpp b/src/polybench/POLYBENCH_3MM.hpp index 80d0a2fe5..195384b1c 100644 --- a/src/polybench/POLYBENCH_3MM.hpp +++ b/src/polybench/POLYBENCH_3MM.hpp @@ -154,7 +154,16 @@ class POLYBENCH_3MM : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Index_type m_ni; Index_type m_nj; Index_type m_nk; diff --git a/src/polybench/POLYBENCH_ADI-Cuda.cpp b/src/polybench/POLYBENCH_ADI-Cuda.cpp index 57500408d..e6c46aed1 100644 --- a/src/polybench/POLYBENCH_ADI-Cuda.cpp +++ b/src/polybench/POLYBENCH_ADI-Cuda.cpp @@ -21,11 +21,6 @@ namespace rajaperf namespace polybench { -// -// Define thread block size for CUDA execution -// -const size_t block_size = 256; - #define POLYBENCH_ADI_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(U, m_U, m_n * m_n); \ allocAndInitCudaDeviceData(V, m_V, m_n * m_n); \ @@ -87,7 +82,8 @@ __global__ void adi_lam(const Index_type n, } -void POLYBENCH_ADI::runCudaVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_ADI::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -247,6 +243,15 @@ void POLYBENCH_ADI::runCudaVariant(VariantID vid) } } +void POLYBENCH_ADI::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n POLYBENCH_ADI : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_ADI-Hip.cpp b/src/polybench/POLYBENCH_ADI-Hip.cpp index 9c65190a4..8d0deceef 100644 --- a/src/polybench/POLYBENCH_ADI-Hip.cpp +++ b/src/polybench/POLYBENCH_ADI-Hip.cpp @@ -21,11 +21,6 @@ namespace rajaperf namespace polybench { -// -// Define thread block size for Hip 
execution -// -const size_t block_size = 256; - #define POLYBENCH_ADI_DATA_SETUP_HIP \ allocAndInitHipDeviceData(U, m_U, m_n * m_n); \ allocAndInitHipDeviceData(V, m_V, m_n * m_n); \ @@ -88,7 +83,8 @@ __global__ void adi_lam(const Index_type n, } -void POLYBENCH_ADI::runHipVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_ADI::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -105,14 +101,14 @@ void POLYBENCH_ADI::runHipVariant(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(n-2, block_size); - hipLaunchKernelGGL((adi1), + hipLaunchKernelGGL((adi1), dim3(grid_size), dim3(block_size), 0, 0, n, a, b, c, d, f, P, Q, U, V); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((adi2), + hipLaunchKernelGGL((adi2), dim3(grid_size), dim3(block_size), 0, 0, n, a, c, d, e, f, @@ -256,6 +252,15 @@ void POLYBENCH_ADI::runHipVariant(VariantID vid) } } +void POLYBENCH_ADI::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n POLYBENCH_ADI : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_ADI.cpp b/src/polybench/POLYBENCH_ADI.cpp index c36b41050..628c609ab 100644 --- a/src/polybench/POLYBENCH_ADI.cpp +++ b/src/polybench/POLYBENCH_ADI.cpp @@ -20,8 +20,12 @@ namespace polybench POLYBENCH_ADI::POLYBENCH_ADI(const RunParams& params) : KernelBase(rajaperf::Polybench_ADI, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + Index_type n_default = 1000; - + setDefaultProblemSize( (n_default-2) * (n_default-2) ); setDefaultReps(4); @@ -39,7 +43,7 @@ POLYBENCH_ADI::POLYBENCH_ADI(const RunParams& params) setFLOPsPerRep( m_tsteps * ( (15 + 2) * (m_n-2)*(m_n-2) + (15 + 2) * (m_n-2)*(m_n-2) ) ); - checksum_scale_factor = 0.0000001 * + checksum_scale_factor = 0.0000001 * ( static_cast(getDefaultProblemSize()) / getActualProblemSize() ); @@ -91,5 +95,11 @@ void POLYBENCH_ADI::tearDown(VariantID vid) deallocData(m_Q); } +bool POLYBENCH_ADI::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_ADI.hpp b/src/polybench/POLYBENCH_ADI.hpp index bec422925..4e75fd44a 100644 --- a/src/polybench/POLYBENCH_ADI.hpp +++ b/src/polybench/POLYBENCH_ADI.hpp @@ -196,7 +196,16 @@ class POLYBENCH_ADI : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Index_type m_n; Index_type m_tsteps; diff --git a/src/polybench/POLYBENCH_ATAX-Cuda.cpp b/src/polybench/POLYBENCH_ATAX-Cuda.cpp index 58d37fb80..87f26bf95 100644 --- a/src/polybench/POLYBENCH_ATAX-Cuda.cpp +++ b/src/polybench/POLYBENCH_ATAX-Cuda.cpp @@ -21,11 +21,6 @@ namespace rajaperf namespace polybench { - // - // Define thread block size for CUDA execution - // - const 
size_t block_size = 256; - #define POLYBENCH_ATAX_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(tmp, m_tmp, N); \ allocAndInitCudaDeviceData(y, m_y, N); \ @@ -81,7 +76,8 @@ __global__ void poly_atax_lam(Index_type N, } -void POLYBENCH_ATAX::runCudaVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_ATAX::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -227,7 +223,15 @@ void POLYBENCH_ATAX::runCudaVariant(VariantID vid) } else { std::cout << "\n POLYBENCH_ATAX : Unknown Cuda variant id = " << vid << std::endl; } +} +void POLYBENCH_ATAX::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n POLYBENCH_ATAX : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_ATAX-Hip.cpp b/src/polybench/POLYBENCH_ATAX-Hip.cpp index 6d393a83b..c709f14fe 100644 --- a/src/polybench/POLYBENCH_ATAX-Hip.cpp +++ b/src/polybench/POLYBENCH_ATAX-Hip.cpp @@ -21,11 +21,6 @@ namespace rajaperf namespace polybench { - // - // Define thread block size for Hip execution - // - const size_t block_size = 256; - #define POLYBENCH_ATAX_DATA_SETUP_HIP \ allocAndInitHipDeviceData(tmp, m_tmp, N); \ allocAndInitHipDeviceData(y, m_y, N); \ @@ -81,7 +76,8 @@ __global__ void poly_atax_lam(Index_type N, } -void POLYBENCH_ATAX::runHipVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_ATAX::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -96,12 +92,12 @@ void POLYBENCH_ATAX::runHipVariant(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); - hipLaunchKernelGGL((poly_atax_1), + hipLaunchKernelGGL((poly_atax_1), dim3(grid_size), dim3(block_size), 0, 0, A, x, y, tmp, N); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_atax_2), + hipLaunchKernelGGL((poly_atax_2), dim3(grid_size), dim3(block_size), 0, 0, A, tmp, y, N); hipErrchk( hipGetLastError() ); @@ -234,7 +230,15 @@ void POLYBENCH_ATAX::runHipVariant(VariantID vid) } else { std::cout << "\n POLYBENCH_ATAX : Unknown Hip variant id = " << vid << std::endl; } +} +void POLYBENCH_ATAX::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n POLYBENCH_ATAX : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_ATAX.cpp b/src/polybench/POLYBENCH_ATAX.cpp index e06917239..de9e46fb8 100644 --- a/src/polybench/POLYBENCH_ATAX.cpp +++ b/src/polybench/POLYBENCH_ATAX.cpp @@ -21,6 +21,10 @@ namespace polybench POLYBENCH_ATAX::POLYBENCH_ATAX(const RunParams& params) : KernelBase(rajaperf::Polybench_ATAX, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + Index_type N_default = 1000; setDefaultProblemSize( N_default * N_default ); @@ -29,7 +33,7 @@ POLYBENCH_ATAX::POLYBENCH_ATAX(const RunParams& params) m_N = std::sqrt( getTargetProblemSize() )+1; - setActualProblemSize( m_N * m_N ); + setActualProblemSize( m_N * m_N ); setItsPerRep( m_N + m_N ); setKernelsPerRep(2); @@ -41,7 +45,7 @@ POLYBENCH_ATAX::POLYBENCH_ATAX(const RunParams& params) setFLOPsPerRep(2 * m_N*m_N + 2 * m_N*m_N ); - checksum_scale_factor = 0.001 * + checksum_scale_factor = 0.001 * ( static_cast(getDefaultProblemSize()) / getActualProblemSize() ); @@ -94,5 +98,11 @@ void POLYBENCH_ATAX::tearDown(VariantID vid) deallocData(m_A); } +bool POLYBENCH_ATAX::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_ATAX.hpp b/src/polybench/POLYBENCH_ATAX.hpp index d2c5ec63e..3470746c4 100644 --- a/src/polybench/POLYBENCH_ATAX.hpp +++ b/src/polybench/POLYBENCH_ATAX.hpp @@ -116,7 +116,16 @@ class POLYBENCH_ATAX : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Index_type m_N; Real_ptr m_tmp; Real_ptr m_y; diff --git a/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp b/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp index a6c67b852..70aab149c 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp @@ -22,12 +22,10 @@ namespace polybench { // - // Define thread block size for CUDA execution + // Define thread block shape for CUDA execution // - const size_t block_size = 256; - - constexpr size_t j_block_sz = 32; - constexpr size_t i_block_sz = 8; +#define j_block_sz (32) +#define i_block_sz (block_size / j_block_sz) #define FDTD_2D_THREADS_PER_BLOCK_CUDA \ dim3 nthreads_per_block234(j_block_sz, i_block_sz, 1); @@ -72,7 +70,7 @@ __global__ void poly_fdtd2d_1_lam(Index_type ny, Lambda body) } } -__global__ void poly_fdtd2d_2(Real_ptr ey, Real_ptr hz, +__global__ void poly_fdtd2d_2(Real_ptr ey, Real_ptr hz, Index_type nx, Index_type ny) { Index_type i = blockIdx.y * blockDim.y + threadIdx.y; @@ -91,7 +89,7 @@ __global__ void poly_fdtd2d_2_lam(Index_type nx, Index_type ny, Index_type j = blockIdx.x * blockDim.x + threadIdx.x; if (i > 0 && i < nx && j < ny) { - body(i, j); + body(i, j); } } @@ -142,7 +140,8 @@ __global__ void poly_fdtd2d_4_lam(Index_type nx, Index_type ny, } -void POLYBENCH_FDTD_2D::runCudaVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_FDTD_2D::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -298,7 +297,15 @@ void POLYBENCH_FDTD_2D::runCudaVariant(VariantID vid) } else { std::cout << "\n POLYBENCH_FDTD_2D : Unknown Cuda variant id = " << vid << std::endl; } +} +void POLYBENCH_FDTD_2D::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n POLYBENCH_FDTD_2D : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } } } // end namespace 
polybench diff --git a/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp b/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp index b627d84f8..b2493fc63 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp @@ -22,12 +22,10 @@ namespace polybench { // - // Define thread block size for Hip execution + // Define thread block shape for Hip execution // - const size_t block_size = 256; - - constexpr size_t j_block_sz = 32; - constexpr size_t i_block_sz = 8; +#define j_block_sz (32) +#define i_block_sz (block_size / j_block_sz) #define FDTD_2D_THREADS_PER_BLOCK_HIP \ dim3 nthreads_per_block234(j_block_sz, i_block_sz, 1); @@ -141,7 +139,8 @@ __global__ void poly_fdtd2d_4_lam(Index_type nx, Index_type ny, } -void POLYBENCH_FDTD_2D::runHipVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_FDTD_2D::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -157,25 +156,25 @@ void POLYBENCH_FDTD_2D::runHipVariant(VariantID vid) for (t = 0; t < tsteps; ++t) { const size_t grid_size1 = RAJA_DIVIDE_CEILING_INT(ny, block_size); - hipLaunchKernelGGL((poly_fdtd2d_1), - dim3(grid_size1), dim3(block_size), 0, 0, + hipLaunchKernelGGL((poly_fdtd2d_1), + dim3(grid_size1), dim3(block_size), 0, 0, ey, fict, ny, t); hipErrchk( hipGetLastError() ); FDTD_2D_THREADS_PER_BLOCK_HIP; FDTD_2D_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_fdtd2d_2), + hipLaunchKernelGGL((poly_fdtd2d_2), dim3(nblocks234), dim3(nthreads_per_block234), 0, 0, ey, hz, nx, ny); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_fdtd2d_3), + hipLaunchKernelGGL((poly_fdtd2d_3), dim3(nblocks234), dim3(nthreads_per_block234), 0, 0, ex, hz, nx, ny); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_fdtd2d_4), + hipLaunchKernelGGL((poly_fdtd2d_4), dim3(nblocks234), dim3(nthreads_per_block234), 0, 0, hz, ex, ey, nx, ny); hipErrchk( hipGetLastError() ); @@ -210,7 +209,7 @@ void POLYBENCH_FDTD_2D::runHipVariant(VariantID vid) FDTD_2D_THREADS_PER_BLOCK_HIP; FDTD_2D_NBLOCKS_HIP; - auto poly_fdtd2d_2_lambda = + auto poly_fdtd2d_2_lambda = [=] __device__ (Index_type i, Index_type j) { POLYBENCH_FDTD_2D_BODY2; }; @@ -220,7 +219,7 @@ void POLYBENCH_FDTD_2D::runHipVariant(VariantID vid) nx, ny, poly_fdtd2d_2_lambda); hipErrchk( hipGetLastError() ); - auto poly_fdtd2d_3_lambda = + auto poly_fdtd2d_3_lambda = [=] __device__ (Index_type i, Index_type j) { POLYBENCH_FDTD_2D_BODY3; }; @@ -229,8 +228,8 @@ void POLYBENCH_FDTD_2D::runHipVariant(VariantID vid) dim3(nblocks234), dim3(nthreads_per_block234), 0, 0, nx, ny, poly_fdtd2d_3_lambda); hipErrchk( hipGetLastError() ); - - auto poly_fdtd2d_4_lambda = + + auto poly_fdtd2d_4_lambda = [=] __device__ (Index_type i, Index_type j) { POLYBENCH_FDTD_2D_BODY4; }; @@ -316,7 +315,15 @@ void POLYBENCH_FDTD_2D::runHipVariant(VariantID vid) } else { std::cout << "\n POLYBENCH_FDTD_2D : Unknown Hip variant id = " << vid << std::endl; } +} +void POLYBENCH_FDTD_2D::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n POLYBENCH_FDTD_2D : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_FDTD_2D.cpp b/src/polybench/POLYBENCH_FDTD_2D.cpp index 59e03721c..4eed13c96 100644 --- a/src/polybench/POLYBENCH_FDTD_2D.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D.cpp @@ -24,10 +24,14 @@ namespace polybench 
POLYBENCH_FDTD_2D::POLYBENCH_FDTD_2D(const RunParams& params) : KernelBase(rajaperf::Polybench_FDTD_2D, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + Index_type nx_default = 1000; Index_type ny_default = 1000; - setDefaultProblemSize( std::max( (nx_default-1) * ny_default, + setDefaultProblemSize( std::max( (nx_default-1) * ny_default, nx_default * (ny_default-1) ) ); setDefaultReps(8); @@ -36,7 +40,7 @@ POLYBENCH_FDTD_2D::POLYBENCH_FDTD_2D(const RunParams& params) m_tsteps = 40; - setActualProblemSize( std::max( (m_nx-1)*m_ny, m_nx*(m_ny-1) ) ); + setActualProblemSize( std::max( (m_nx-1)*m_ny, m_nx*(m_ny-1) ) ); setItsPerRep( m_tsteps * ( m_ny + (m_nx-1)*m_ny + @@ -112,5 +116,11 @@ void POLYBENCH_FDTD_2D::tearDown(VariantID vid) deallocData(m_hz); } +bool POLYBENCH_FDTD_2D::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_FDTD_2D.hpp b/src/polybench/POLYBENCH_FDTD_2D.hpp index a1ead28b2..50e072fea 100644 --- a/src/polybench/POLYBENCH_FDTD_2D.hpp +++ b/src/polybench/POLYBENCH_FDTD_2D.hpp @@ -114,7 +114,16 @@ class POLYBENCH_FDTD_2D : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Index_type m_nx; Index_type m_ny; Index_type m_tsteps; diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp index bc4d79352..e0d52d4f6 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp @@ -20,10 +20,10 @@ namespace polybench { // -// Define thread block size for CUDA execution +// Define thread block shape for CUDA execution // -constexpr size_t i_block_sz = 8; -constexpr size_t j_block_sz = 32; +#define j_block_sz (32) +#define i_block_sz (block_size / j_block_sz) #define POLY_FLOYD_WARSHALL_THREADS_PER_BLOCK_CUDA \ dim3 nthreads_per_block(j_block_sz, i_block_sz, 1); @@ -52,13 +52,13 @@ __global__ void poly_floyd_warshall(Real_ptr pout, Real_ptr pin, Index_type i = blockIdx.y * blockDim.y + threadIdx.y; Index_type j = blockIdx.x * blockDim.x + threadIdx.x; - if ( i < N && j < N ) { + if ( i < N && j < N ) { POLYBENCH_FLOYD_WARSHALL_BODY; } } template< typename Lambda > -__global__ void poly_floyd_warshall_lam(Index_type N, +__global__ void poly_floyd_warshall_lam(Index_type N, Lambda body) { Index_type i = blockIdx.y * blockDim.y + threadIdx.y; @@ -70,7 +70,8 @@ __global__ void poly_floyd_warshall_lam(Index_type N, } -void POLYBENCH_FLOYD_WARSHALL::runCudaVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_FLOYD_WARSHALL::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -87,7 +88,7 @@ void POLYBENCH_FLOYD_WARSHALL::runCudaVariant(VariantID vid) POLY_FLOYD_WARSHALL_THREADS_PER_BLOCK_CUDA; POLY_FLOYD_WARSHALL_NBLOCKS_CUDA; - + poly_floyd_warshall<<>>(pout, pin, k, N); cudaErrchk( cudaGetLastError() ); @@ -168,7 +169,15 @@ void 
POLYBENCH_FLOYD_WARSHALL::runCudaVariant(VariantID vid) } else { std::cout << "\n POLYBENCH_FLOYD_WARSHALL : Unknown Cuda variant id = " << vid << std::endl; } +} +void POLYBENCH_FLOYD_WARSHALL::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n POLYBENCH_FLOYD_WARSHALL : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp index af451b139..7dd03d9be 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp @@ -20,10 +20,10 @@ namespace polybench { // -// Define thread block size for Hip execution +// Define thread block shape for Hip execution // -constexpr size_t i_block_sz = 8; -constexpr size_t j_block_sz = 32; +#define j_block_sz (32) +#define i_block_sz (block_size / j_block_sz) #define POLY_FLOYD_WARSHALL_THREADS_PER_BLOCK_HIP \ dim3 nthreads_per_block(j_block_sz, i_block_sz, 1); @@ -69,7 +69,8 @@ __global__ void poly_floyd_warshall_lam(Index_type N, } -void POLYBENCH_FLOYD_WARSHALL::runHipVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_FLOYD_WARSHALL::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -109,13 +110,13 @@ void POLYBENCH_FLOYD_WARSHALL::runHipVariant(VariantID vid) for (Index_type k = 0; k < N; ++k) { - auto poly_floyd_warshall_lambda = + auto poly_floyd_warshall_lambda = [=] __device__ (Index_type i, Index_type j) { POLYBENCH_FLOYD_WARSHALL_BODY; }; POLY_FLOYD_WARSHALL_THREADS_PER_BLOCK_HIP; - POLY_FLOYD_WARSHALL_NBLOCKS_HIP; + POLY_FLOYD_WARSHALL_NBLOCKS_HIP; hipLaunchKernelGGL( (poly_floyd_warshall_lam), @@ -174,7 +175,15 @@ void POLYBENCH_FLOYD_WARSHALL::runHipVariant(VariantID vid) } else { std::cout << "\n POLYBENCH_FLOYD_WARSHALL : Unknown Hip variant id = " << vid << std::endl; } +} +void POLYBENCH_FLOYD_WARSHALL::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n POLYBENCH_FLOYD_WARSHALL : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp index b3306a992..27991d45b 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp @@ -21,9 +21,13 @@ namespace polybench POLYBENCH_FLOYD_WARSHALL::POLYBENCH_FLOYD_WARSHALL(const RunParams& params) : KernelBase(rajaperf::Polybench_FLOYD_WARSHALL, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + Index_type N_default = 1000; - setDefaultProblemSize( N_default * N_default ); + setDefaultProblemSize( N_default * N_default ); setDefaultReps(8); m_N = std::sqrt( getTargetProblemSize() ) + 1; @@ -85,5 +89,11 @@ void POLYBENCH_FLOYD_WARSHALL::tearDown(VariantID vid) deallocData(m_pout); } +bool POLYBENCH_FLOYD_WARSHALL::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp index ec2bcab9f..d7c81df0a 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp @@ -77,7 +77,16 @@ class POLYBENCH_FLOYD_WARSHALL : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Index_type m_N; Real_ptr m_pin; diff --git a/src/polybench/POLYBENCH_GEMM-Cuda.cpp b/src/polybench/POLYBENCH_GEMM-Cuda.cpp index ae586d1f4..deaeb7af3 100644 --- a/src/polybench/POLYBENCH_GEMM-Cuda.cpp +++ b/src/polybench/POLYBENCH_GEMM-Cuda.cpp @@ -22,10 +22,10 @@ namespace polybench { // -// Define thread block size for CUDA execution +// Define thread block shape for CUDA execution // -constexpr size_t i_block_sz = 8; -constexpr size_t j_block_sz = 32; +#define j_block_sz (32) +#define i_block_sz (block_size / j_block_sz) #define POLY_GEMM_THREADS_PER_BLOCK_CUDA \ dim3 nthreads_per_block(j_block_sz, i_block_sz, 1); @@ -79,7 +79,8 @@ __global__ void poly_gemm_lam(Index_type ni, Index_type nj, } -void POLYBENCH_GEMM::runCudaVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_GEMM::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -93,7 +94,7 @@ void POLYBENCH_GEMM::runCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { POLY_GEMM_THREADS_PER_BLOCK_CUDA; - POLY_GEMM_NBLOCKS_CUDA; + POLY_GEMM_NBLOCKS_CUDA; poly_gemm<<>>(C, A, B, alpha, beta, @@ -194,7 +195,15 @@ void POLYBENCH_GEMM::runCudaVariant(VariantID vid) } else { std::cout << "\n POLYBENCH_GEMM : Unknown Cuda variant id = " << vid << std::endl; } +} +void POLYBENCH_GEMM::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n POLYBENCH_GEMM : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_GEMM-Hip.cpp b/src/polybench/POLYBENCH_GEMM-Hip.cpp index 2d07f0a86..9baf86628 100644 --- a/src/polybench/POLYBENCH_GEMM-Hip.cpp +++ b/src/polybench/POLYBENCH_GEMM-Hip.cpp @@ -22,10 +22,10 @@ namespace polybench { // -// Define thread block size for Hip execution +// Define thread block shape for Hip execution // -constexpr size_t i_block_sz = 8; -constexpr size_t j_block_sz = 32; +#define j_block_sz (32) +#define i_block_sz (block_size / j_block_sz) #define POLY_GEMM_THREADS_PER_BLOCK_HIP \ dim3 nthreads_per_block(j_block_sz, i_block_sz, 1); @@ -79,7 +79,8 @@ __global__ void 
poly_gemm_lam(Index_type ni, Index_type nj, } -void POLYBENCH_GEMM::runHipVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_GEMM::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -93,9 +94,9 @@ void POLYBENCH_GEMM::runHipVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { POLY_GEMM_THREADS_PER_BLOCK_HIP; - POLY_GEMM_NBLOCKS_HIP; + POLY_GEMM_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_gemm), + hipLaunchKernelGGL((poly_gemm), dim3(nblocks), dim3(nthreads_per_block), 0, 0, C, A, B, alpha, beta, ni, nj, nk); @@ -114,7 +115,7 @@ void POLYBENCH_GEMM::runHipVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { POLY_GEMM_THREADS_PER_BLOCK_HIP; - POLY_GEMM_NBLOCKS_HIP; + POLY_GEMM_NBLOCKS_HIP; auto poly_gemm_lambda = [=] __device__ (Index_type i, Index_type j) { POLYBENCH_GEMM_BODY1; @@ -125,7 +126,7 @@ void POLYBENCH_GEMM::runHipVariant(VariantID vid) POLYBENCH_GEMM_BODY4; }; - hipLaunchKernelGGL((poly_gemm_lam), + hipLaunchKernelGGL((poly_gemm_lam), dim3(nblocks), dim3(nthreads_per_block), 0, 0, ni, nj, poly_gemm_lambda); hipErrchk( hipGetLastError() ); @@ -197,7 +198,15 @@ void POLYBENCH_GEMM::runHipVariant(VariantID vid) } else { std::cout << "\n POLYBENCH_GEMM : Unknown Hip variant id = " << vid << std::endl; } +} +void POLYBENCH_GEMM::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n POLYBENCH_GEMM : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_GEMM.cpp b/src/polybench/POLYBENCH_GEMM.cpp index a50ac09da..4376ae8eb 100644 --- a/src/polybench/POLYBENCH_GEMM.cpp +++ b/src/polybench/POLYBENCH_GEMM.cpp @@ -21,6 +21,10 @@ namespace polybench POLYBENCH_GEMM::POLYBENCH_GEMM(const RunParams& params) : KernelBase(rajaperf::Polybench_GEMM, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + Index_type ni_default = 1000; Index_type nj_default = 1000; Index_type nk_default = 1200; @@ -31,7 +35,7 @@ POLYBENCH_GEMM::POLYBENCH_GEMM(const RunParams& params) m_ni = std::sqrt( getTargetProblemSize() ) + 1; m_nj = m_ni; m_nk = nk_default; - + m_alpha = 0.62; m_beta = 1.002; @@ -97,5 +101,11 @@ void POLYBENCH_GEMM::tearDown(VariantID vid) deallocData(m_C); } +bool POLYBENCH_GEMM::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_GEMM.hpp b/src/polybench/POLYBENCH_GEMM.hpp index dd9e4a5a7..65bddc907 100644 --- a/src/polybench/POLYBENCH_GEMM.hpp +++ b/src/polybench/POLYBENCH_GEMM.hpp @@ -100,7 +100,16 @@ class POLYBENCH_GEMM : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Index_type m_ni; Index_type m_nj; Index_type m_nk; diff --git a/src/polybench/POLYBENCH_GEMVER-Cuda.cpp b/src/polybench/POLYBENCH_GEMVER-Cuda.cpp index 8d2ddca87..7c7763955 100644 --- a/src/polybench/POLYBENCH_GEMVER-Cuda.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Cuda.cpp @@ -22,12 +22,10 @@ namespace polybench { // -// Define thread block size for CUDA execution +// Define thread block shape for CUDA execution // -const size_t block_size = 256; - -constexpr size_t i_block_sz = 8; -constexpr size_t j_block_sz = 32; +#define j_block_sz (32) +#define i_block_sz (block_size / j_block_sz) #define GEMVER_THREADS_PER_BLOCK_CUDA \ dim3 nthreads_per_block1(j_block_sz, i_block_sz, 1); @@ -135,7 +133,8 @@ __global__ void poly_gemmver_234_lam(Index_type n, Lambda body) } -void POLYBENCH_GEMVER::runCudaVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_GEMVER::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -250,7 +249,7 @@ void POLYBENCH_GEMVER::runCudaVariant(VariantID vid) > > > - >; + >; using EXEC_POL24 = RAJA::KernelPolicy< @@ -326,7 +325,15 @@ void POLYBENCH_GEMVER::runCudaVariant(VariantID vid) } else { std::cout << "\n POLYBENCH_GEMVER : Unknown Cuda variant id = " << vid << std::endl; } +} +void POLYBENCH_GEMVER::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n POLYBENCH_GEMVER : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_GEMVER-Hip.cpp b/src/polybench/POLYBENCH_GEMVER-Hip.cpp index 469f620a3..11e4f0c85 100644 --- a/src/polybench/POLYBENCH_GEMVER-Hip.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Hip.cpp @@ -22,12 +22,10 @@ namespace polybench { // -// Define thread block size for Hip execution +// Define thread block shape for Hip execution // -const size_t block_size = 256; - -constexpr size_t i_block_sz = 8; -constexpr size_t j_block_sz = 32; +#define j_block_sz (32) +#define i_block_sz (block_size / j_block_sz) #define GEMVER_THREADS_PER_BLOCK_HIP \ dim3 nthreads_per_block1(j_block_sz, i_block_sz, 1); @@ -135,7 +133,8 @@ 
__global__ void poly_gemmver_234_lam(Index_type n, Lambda body) } -void POLYBENCH_GEMVER::runHipVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_GEMVER::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -151,24 +150,24 @@ void POLYBENCH_GEMVER::runHipVariant(VariantID vid) GEMVER_THREADS_PER_BLOCK_HIP; GEMVER_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_gemmver_1), + hipLaunchKernelGGL((poly_gemmver_1), dim3(nblocks1), dim3(nthreads_per_block1), 0, 0, A, u1, v1, u2, v2, n); hipErrchk( hipGetLastError() ); size_t grid_size = RAJA_DIVIDE_CEILING_INT(m_n, block_size); - hipLaunchKernelGGL((poly_gemmver_2), + hipLaunchKernelGGL((poly_gemmver_2), dim3(grid_size), dim3(block_size), 0, 0, A, x, y, beta, n); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_gemmver_3), + hipLaunchKernelGGL((poly_gemmver_3), dim3(grid_size), dim3(block_size), 0, 0, x, z, n); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_gemmver_4), + hipLaunchKernelGGL((poly_gemmver_4), dim3(grid_size), dim3(block_size), 0, 0, A, x, w, alpha, n); hipErrchk( hipGetLastError() ); @@ -194,7 +193,7 @@ void POLYBENCH_GEMVER::runHipVariant(VariantID vid) hipLaunchKernelGGL(poly_gemmver_1_lam, dim3(nblocks1), dim3(nthreads_per_block1), 0, 0, - n, poly_gemmver_1_lambda); + n, poly_gemmver_1_lambda); hipErrchk( hipGetLastError() ); size_t grid_size = RAJA_DIVIDE_CEILING_INT(n, block_size); @@ -260,7 +259,7 @@ void POLYBENCH_GEMVER::runHipVariant(VariantID vid) > > > - >; + >; using EXEC_POL24 = RAJA::KernelPolicy< @@ -336,7 +335,15 @@ void POLYBENCH_GEMVER::runHipVariant(VariantID vid) } else { std::cout << "\n POLYBENCH_GEMVER : Unknown Hip variant id = " << vid << std::endl; } +} +void POLYBENCH_GEMVER::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n POLYBENCH_GEMVER : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_GEMVER.cpp b/src/polybench/POLYBENCH_GEMVER.cpp index fce83907a..817946cb6 100644 --- a/src/polybench/POLYBENCH_GEMVER.cpp +++ b/src/polybench/POLYBENCH_GEMVER.cpp @@ -21,6 +21,10 @@ namespace polybench POLYBENCH_GEMVER::POLYBENCH_GEMVER(const RunParams& params) : KernelBase(rajaperf::Polybench_GEMVER, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + Index_type n_default = 1000; setDefaultProblemSize( n_default * n_default ); @@ -119,5 +123,11 @@ void POLYBENCH_GEMVER::tearDown(VariantID vid) deallocData(m_z); } +bool POLYBENCH_GEMVER::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_GEMVER.hpp b/src/polybench/POLYBENCH_GEMVER.hpp index 919f18e5c..f56ba1b02 100644 --- a/src/polybench/POLYBENCH_GEMVER.hpp +++ b/src/polybench/POLYBENCH_GEMVER.hpp @@ -15,8 +15,8 @@ /// } /// } /// -/// Note: this part of the kernel is modified to avoid -/// excessively large checksums +/// Note: this part of the kernel is modified to avoid +/// excessively large checksums /// for (Index_type i = 0; i < N; i++) { /// Real_type dot = 0.0; /// for (Index_type j = 0; j < N; j++) { @@ -153,7 +153,16 @@ class POLYBENCH_GEMVER : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Index_type m_n; Real_type m_alpha; Real_type m_beta; diff --git a/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp b/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp index 3fdac4fd8..b53c89ed4 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp @@ -21,11 +21,6 @@ namespace rajaperf namespace polybench { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - #define POLYBENCH_GESUMMV_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(x, m_x, N); \ allocAndInitCudaDeviceData(y, m_y, N); \ @@ -58,7 +53,8 @@ __global__ void poly_gesummv(Real_ptr x, Real_ptr y, } -void POLYBENCH_GESUMMV::runCudaVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_GESUMMV::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -137,7 +133,15 @@ void POLYBENCH_GESUMMV::runCudaVariant(VariantID vid) } else { std::cout << "\n POLYBENCH_GESUMMV : Unknown Cuda variant id = " << vid << std::endl; } +} +void POLYBENCH_GESUMMV::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n POLYBENCH_GESUMMV : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_GESUMMV-Hip.cpp b/src/polybench/POLYBENCH_GESUMMV-Hip.cpp index 1fec5379b..461923cef 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Hip.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Hip.cpp @@ -21,11 +21,6 @@ namespace rajaperf namespace polybench { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - #define POLYBENCH_GESUMMV_DATA_SETUP_HIP \ allocAndInitHipDeviceData(x, m_x, N); \ allocAndInitHipDeviceData(y, m_y, N); \ @@ -58,7 +53,8 @@ __global__ void poly_gesummv(Real_ptr x, Real_ptr y, } -void POLYBENCH_GESUMMV::runHipVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_GESUMMV::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -73,7 +69,7 @@ void 
POLYBENCH_GESUMMV::runHipVariant(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); - hipLaunchKernelGGL((poly_gesummv), + hipLaunchKernelGGL((poly_gesummv), dim3(grid_size), dim3(block_size),0,0, x, y, A, B, @@ -139,7 +135,15 @@ void POLYBENCH_GESUMMV::runHipVariant(VariantID vid) } else { std::cout << "\n POLYBENCH_GESUMMV : Unknown Hip variant id = " << vid << std::endl; } +} +void POLYBENCH_GESUMMV::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n POLYBENCH_GESUMMV : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_GESUMMV.cpp b/src/polybench/POLYBENCH_GESUMMV.cpp index 39cb94510..deef5412e 100644 --- a/src/polybench/POLYBENCH_GESUMMV.cpp +++ b/src/polybench/POLYBENCH_GESUMMV.cpp @@ -21,10 +21,14 @@ namespace polybench POLYBENCH_GESUMMV::POLYBENCH_GESUMMV(const RunParams& params) : KernelBase(rajaperf::Polybench_GESUMMV, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + Index_type N_default = 1000; setDefaultProblemSize( N_default * N_default ); - setDefaultReps(120); + setDefaultReps(120); m_N = std::sqrt( getTargetProblemSize() ) + 1; @@ -88,5 +92,11 @@ void POLYBENCH_GESUMMV::tearDown(VariantID vid) deallocData(m_B); } +bool POLYBENCH_GESUMMV::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_GESUMMV.hpp b/src/polybench/POLYBENCH_GESUMMV.hpp index c8cc9e191..196572ff3 100644 --- a/src/polybench/POLYBENCH_GESUMMV.hpp +++ b/src/polybench/POLYBENCH_GESUMMV.hpp @@ -99,7 +99,16 @@ class POLYBENCH_GESUMMV : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Index_type m_N; Real_type m_alpha; diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp index 35f104444..7bdc30d91 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp @@ -21,11 +21,6 @@ namespace rajaperf namespace polybench { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - #define POLYBENCH_JACOBI_1D_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(A, m_Ainit, m_N); \ allocAndInitCudaDeviceData(B, m_Binit, m_N); @@ -57,7 +52,8 @@ __global__ void poly_jacobi_1D_2(Real_ptr A, Real_ptr B, Index_type N) } -void POLYBENCH_JACOBI_1D::runCudaVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_JACOBI_1D::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -118,7 +114,15 @@ void POLYBENCH_JACOBI_1D::runCudaVariant(VariantID vid) } else { std::cout << "\n POLYBENCH_JACOBI_1D : Unknown Cuda variant id = " << vid << std::endl; } +} +void POLYBENCH_JACOBI_1D::runCudaVariant(VariantID vid) +{ + if ( 
!gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n POLYBENCH_JACOBI_1D : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp index d566cd430..ccdf47134 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp @@ -21,11 +21,6 @@ namespace rajaperf namespace polybench { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - #define POLYBENCH_JACOBI_1D_DATA_SETUP_HIP \ allocAndInitHipDeviceData(A, m_Ainit, m_N); \ allocAndInitHipDeviceData(B, m_Binit, m_N); @@ -57,7 +52,8 @@ __global__ void poly_jacobi_1D_2(Real_ptr A, Real_ptr B, Index_type N) } -void POLYBENCH_JACOBI_1D::runHipVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_JACOBI_1D::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -120,7 +116,15 @@ void POLYBENCH_JACOBI_1D::runHipVariant(VariantID vid) } else { std::cout << "\n POLYBENCH_JACOBI_1D : Unknown Hip variant id = " << vid << std::endl; } +} +void POLYBENCH_JACOBI_1D::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n POLYBENCH_JACOBI_1D : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_JACOBI_1D.cpp b/src/polybench/POLYBENCH_JACOBI_1D.cpp index 48c064780..88204d1e1 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.cpp @@ -21,12 +21,16 @@ namespace polybench POLYBENCH_JACOBI_1D::POLYBENCH_JACOBI_1D(const RunParams& params) : KernelBase(rajaperf::Polybench_JACOBI_1D, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + Index_type N_default = 1000000; setDefaultProblemSize( N_default-2 ); setDefaultReps(100); - - m_N = getTargetProblemSize(); + + m_N = getTargetProblemSize(); m_tsteps = 16; @@ -34,13 +38,13 @@ POLYBENCH_JACOBI_1D::POLYBENCH_JACOBI_1D(const RunParams& params) setItsPerRep( m_tsteps * ( 2 * getActualProblemSize() ) ); setKernelsPerRep(m_tsteps * 2); - setBytesPerRep( m_tsteps * ( (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * - (m_N-2) + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * + setBytesPerRep( m_tsteps * ( (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * + (m_N-2) + + (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_N + - (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * + (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * (m_N-2) + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * + (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_N ) ); setFLOPsPerRep( m_tsteps * ( 3 * (m_N-2) + 3 * (m_N-2) ) ); @@ -97,5 +101,11 @@ void POLYBENCH_JACOBI_1D::tearDown(VariantID vid) deallocData(m_Binit); } +bool POLYBENCH_JACOBI_1D::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_JACOBI_1D.hpp b/src/polybench/POLYBENCH_JACOBI_1D.hpp index 290e26ce0..7ff522400 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.hpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.hpp @@ -71,7 +71,16 @@ class POLYBENCH_JACOBI_1D : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Index_type m_N; Index_type m_tsteps; diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp index a32a9cce6..d46d0ec22 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp @@ -22,10 +22,10 @@ namespace polybench { // - // Define thread block size for CUDA execution + // Define thread block shape for CUDA execution // - constexpr size_t i_block_sz = 8; - constexpr size_t j_block_sz = 32; +#define j_block_sz (32) +#define i_block_sz (block_size / j_block_sz) #define JACOBI_2D_THREADS_PER_BLOCK_CUDA \ dim3 nthreads_per_block(j_block_sz, i_block_sz, 1); @@ -80,7 +80,8 @@ __global__ void poly_jacobi_2D_lam(Index_type N, Lambda body) } -void POLYBENCH_JACOBI_2D::runCudaVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_JACOBI_2D::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -165,7 +166,7 @@ void POLYBENCH_JACOBI_2D::runCudaVariant(VariantID vid) > > > - >; + >; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -196,7 +197,15 @@ void POLYBENCH_JACOBI_2D::runCudaVariant(VariantID vid) } else { std::cout << "\n POLYBENCH_JACOBI_2D : Unknown Cuda variant id = " << vid << std::endl; } +} +void POLYBENCH_JACOBI_2D::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n POLYBENCH_JACOBI_2D : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << 
std::endl; + } } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp index dd7230205..8c0408b7e 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp @@ -22,10 +22,10 @@ namespace polybench { // - // Define thread block size for Hip execution + // Define thread block shape for Hip execution // - constexpr size_t i_block_sz = 8; - constexpr size_t j_block_sz = 32; +#define j_block_sz (32) +#define i_block_sz (block_size / j_block_sz) #define JACOBI_2D_THREADS_PER_BLOCK_HIP \ dim3 nthreads_per_block(j_block_sz, i_block_sz, 1); @@ -80,7 +80,8 @@ __global__ void poly_jacobi_2D_lam(Index_type N, Lambda body) } -void POLYBENCH_JACOBI_2D::runHipVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_JACOBI_2D::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -127,7 +128,7 @@ void POLYBENCH_JACOBI_2D::runHipVariant(VariantID vid) JACOBI_2D_THREADS_PER_BLOCK_HIP; JACOBI_2D_NBLOCKS_HIP; - auto poly_jacobi_2D_1_lambda = + auto poly_jacobi_2D_1_lambda = [=] __device__ (Index_type i, Index_type j) { POLYBENCH_JACOBI_2D_BODY1; }; @@ -137,7 +138,7 @@ void POLYBENCH_JACOBI_2D::runHipVariant(VariantID vid) N, poly_jacobi_2D_1_lambda); hipErrchk( hipGetLastError() ); - auto poly_jacobi_2D_2_lambda = + auto poly_jacobi_2D_2_lambda = [=] __device__ (Index_type i, Index_type j) { POLYBENCH_JACOBI_2D_BODY2; }; @@ -206,7 +207,15 @@ void POLYBENCH_JACOBI_2D::runHipVariant(VariantID vid) } else { std::cout << "\n POLYBENCH_JACOBI_2D : Unknown Hip variant id = " << vid << std::endl; } +} +void POLYBENCH_JACOBI_2D::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n POLYBENCH_JACOBI_2D : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_JACOBI_2D.cpp b/src/polybench/POLYBENCH_JACOBI_2D.cpp index 9e204bdab..1e9b5e396 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.cpp @@ -21,6 +21,10 @@ namespace polybench POLYBENCH_JACOBI_2D::POLYBENCH_JACOBI_2D(const RunParams& params) : KernelBase(rajaperf::Polybench_JACOBI_2D, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + Index_type N_default = 1000; setDefaultProblemSize( N_default * N_default ); @@ -99,5 +103,11 @@ void POLYBENCH_JACOBI_2D::tearDown(VariantID vid) deallocData(m_Binit); } +bool POLYBENCH_JACOBI_2D::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_JACOBI_2D.hpp b/src/polybench/POLYBENCH_JACOBI_2D.hpp index 9a57325a1..ce063836b 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.hpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.hpp @@ -91,7 +91,16 @@ class POLYBENCH_JACOBI_2D : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Index_type m_N; Index_type m_tsteps; diff --git a/src/polybench/POLYBENCH_MVT-Cuda.cpp b/src/polybench/POLYBENCH_MVT-Cuda.cpp index 2a59f018f..b8900c23c 100644 --- a/src/polybench/POLYBENCH_MVT-Cuda.cpp +++ b/src/polybench/POLYBENCH_MVT-Cuda.cpp @@ -21,11 +21,6 @@ namespace rajaperf namespace polybench { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - #define POLYBENCH_MVT_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(x1, m_x1, N); \ allocAndInitCudaDeviceData(x2, m_x2, N); \ @@ -73,7 +68,8 @@ __global__ void poly_mvt_2(Real_ptr A, Real_ptr x2, Real_ptr y2, } -void POLYBENCH_MVT::runCudaVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_MVT::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -176,7 +172,15 @@ void POLYBENCH_MVT::runCudaVariant(VariantID vid) } else { std::cout << "\n POLYBENCH_MVT : Unknown Cuda variant id = " << vid << std::endl; } +} +void POLYBENCH_MVT::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n POLYBENCH_MVT : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_MVT-Hip.cpp b/src/polybench/POLYBENCH_MVT-Hip.cpp index 00619eee5..4412c0966 100644 --- a/src/polybench/POLYBENCH_MVT-Hip.cpp +++ b/src/polybench/POLYBENCH_MVT-Hip.cpp @@ -21,11 +21,6 @@ namespace rajaperf namespace polybench { - // - // Define thread block size for Hip execution - // - const size_t block_size = 256; - #define POLYBENCH_MVT_DATA_SETUP_HIP \ allocAndInitHipDeviceData(x1, m_x1, N); \ allocAndInitHipDeviceData(x2, m_x2, N); \ @@ -73,7 +68,8 @@ __global__ void poly_mvt_2(Real_ptr A, Real_ptr x2, Real_ptr y2, } -void POLYBENCH_MVT::runHipVariant(VariantID vid) +template < size_t block_size > +void POLYBENCH_MVT::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -174,7 +170,15 @@ void POLYBENCH_MVT::runHipVariant(VariantID vid) } else { std::cout << "\n POLYBENCH_MVT : Unknown Hip variant id = " << vid << std::endl; } +} +void POLYBENCH_MVT::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n POLYBENCH_MVT : Unsupported 
Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_MVT.cpp b/src/polybench/POLYBENCH_MVT.cpp index ae2749ce5..51ca483d5 100644 --- a/src/polybench/POLYBENCH_MVT.cpp +++ b/src/polybench/POLYBENCH_MVT.cpp @@ -21,6 +21,10 @@ namespace polybench POLYBENCH_MVT::POLYBENCH_MVT(const RunParams& params) : KernelBase(rajaperf::Polybench_MVT, params) { + setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + Index_type N_default = 1000; setDefaultProblemSize( N_default * N_default ); @@ -94,5 +98,11 @@ void POLYBENCH_MVT::tearDown(VariantID vid) deallocData(m_A); } +bool POLYBENCH_MVT::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_MVT.hpp b/src/polybench/POLYBENCH_MVT.hpp index cb72784ed..b72b9b9f4 100644 --- a/src/polybench/POLYBENCH_MVT.hpp +++ b/src/polybench/POLYBENCH_MVT.hpp @@ -113,7 +113,16 @@ class POLYBENCH_MVT : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Index_type m_N; Real_ptr m_x1; Real_ptr m_x2; From 6b7a67bcc33c3f98360a591816f8708dfe19ad6d Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sat, 23 Oct 2021 15:46:48 -0700 Subject: [PATCH 149/392] Template polybench gpu kernels on block_size --- src/polybench/POLYBENCH_2MM-Cuda.cpp | 43 +++++++---- src/polybench/POLYBENCH_2MM-Hip.cpp | 37 +++++---- src/polybench/POLYBENCH_3MM-Cuda.cpp | 62 +++++++++------ src/polybench/POLYBENCH_3MM-Hip.cpp | 56 ++++++++------ src/polybench/POLYBENCH_ADI-Cuda.cpp | 21 ++++-- src/polybench/POLYBENCH_ADI-Hip.cpp | 21 ++++-- src/polybench/POLYBENCH_ATAX-Cuda.cpp | 21 ++++-- src/polybench/POLYBENCH_ATAX-Hip.cpp | 21 ++++-- src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp | 75 ++++++++++++------- src/polybench/POLYBENCH_FDTD_2D-Hip.cpp | 69 ++++++++++------- .../POLYBENCH_FLOYD_WARSHALL-Cuda.cpp | 24 ++++-- .../POLYBENCH_FLOYD_WARSHALL-Hip.cpp | 22 ++++-- src/polybench/POLYBENCH_GEMM-Cuda.cpp | 24 ++++-- src/polybench/POLYBENCH_GEMM-Hip.cpp | 14 +++- src/polybench/POLYBENCH_GEMVER-Cuda.cpp | 53 ++++++++----- src/polybench/POLYBENCH_GEMVER-Hip.cpp | 51 ++++++++----- src/polybench/POLYBENCH_GESUMMV-Cuda.cpp | 6 +- src/polybench/POLYBENCH_GESUMMV-Hip.cpp | 6 +- src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp | 12 ++- src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp | 12 ++- src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp | 36 ++++++--- src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp | 32 +++++--- src/polybench/POLYBENCH_MVT-Cuda.cpp | 12 ++- src/polybench/POLYBENCH_MVT-Hip.cpp | 12 ++- 24 files changed, 473 insertions(+), 269 deletions(-) diff --git a/src/polybench/POLYBENCH_2MM-Cuda.cpp b/src/polybench/POLYBENCH_2MM-Cuda.cpp index 1186cdb8f..eca34c373 100644 --- a/src/polybench/POLYBENCH_2MM-Cuda.cpp +++ b/src/polybench/POLYBENCH_2MM-Cuda.cpp @@ -27,8 +27,11 @@ namespace polybench #define in_block_sz (32) #define out_block_sz (block_size / 
in_block_sz) +#define POLY_2MM_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \ + in_block_sz, out_block_sz + #define POLY_2MM_THREADS_PER_BLOCK_CUDA \ - dim3 nthreads_per_block(in_block_sz, out_block_sz, 1); + dim3 nthreads_per_block(POLY_2MM_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA, 1); #define POLY_2MM_1_NBLOCKS_CUDA \ dim3 nblocks1(static_cast(RAJA_DIVIDE_CEILING_INT(nj, in_block_sz)), \ @@ -58,12 +61,14 @@ namespace polybench deallocCudaDeviceData(D); +template < size_t in_block_size, size_t out_block_size > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_2mm_1(Real_ptr tmp, Real_ptr A, Real_ptr B, Real_type alpha, Index_type ni, Index_type nj, Index_type nk) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * out_block_size + threadIdx.y; + Index_type j = blockIdx.x * in_block_size + threadIdx.x; if ( i < ni && j < nj ) { POLYBENCH_2MM_BODY1; @@ -74,24 +79,27 @@ __global__ void poly_2mm_1(Real_ptr tmp, Real_ptr A, Real_ptr B, } } -template< typename Lambda > +template < size_t in_block_size, size_t out_block_size, typename Lambda > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_2mm_1_lam(Index_type ni, Index_type nj, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * out_block_size + threadIdx.y; + Index_type j = blockIdx.x * in_block_size + threadIdx.x; if ( i < ni && j < nj ) { body(i, j); } } +template < size_t in_block_size, size_t out_block_size > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_2mm_2(Real_ptr tmp, Real_ptr C, Real_ptr D, Real_type beta, Index_type ni, Index_type nl, Index_type nj) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type l = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * out_block_size + threadIdx.y; + Index_type l = blockIdx.x * in_block_size + threadIdx.x; if ( i < ni && l < nl ) { POLYBENCH_2MM_BODY4; @@ -102,12 +110,13 @@ __global__ void poly_2mm_2(Real_ptr tmp, Real_ptr C, Real_ptr D, } } -template< typename Lambda > +template < size_t in_block_size, size_t out_block_size, typename Lambda > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_2mm_2_lam(Index_type ni, Index_type nl, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type l = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * out_block_size + threadIdx.y; + Index_type l = blockIdx.x * in_block_size + threadIdx.x; if ( i < ni && l < nl ) { body(i, l); @@ -132,12 +141,14 @@ void POLYBENCH_2MM::runCudaVariantImpl(VariantID vid) POLY_2MM_THREADS_PER_BLOCK_CUDA; POLY_2MM_1_NBLOCKS_CUDA; - poly_2mm_1<<>>(tmp, A, B, alpha, + poly_2mm_1 + <<>>(tmp, A, B, alpha, ni, nj, nk); cudaErrchk( cudaGetLastError() ); POLY_2MM_2_NBLOCKS_CUDA; - poly_2mm_2<<>>(tmp, C, D, beta, + poly_2mm_2 + <<>>(tmp, C, D, beta, ni, nl, nj); cudaErrchk( cudaGetLastError() ); @@ -156,7 +167,8 @@ void POLYBENCH_2MM::runCudaVariantImpl(VariantID vid) POLY_2MM_THREADS_PER_BLOCK_CUDA; POLY_2MM_1_NBLOCKS_CUDA; - poly_2mm_1_lam<<>>(ni, nj, + poly_2mm_1_lam + <<>>(ni, nj, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_2MM_BODY1; for (Index_type k=0; k < nk; ++k) { @@ -168,7 +180,8 @@ void POLYBENCH_2MM::runCudaVariantImpl(VariantID vid) cudaErrchk( cudaGetLastError() ); POLY_2MM_2_NBLOCKS_CUDA; - poly_2mm_2_lam<<>>(ni, nl, + poly_2mm_2_lam + <<>>(ni, nl, [=] 
__device__ (Index_type i, Index_type l) { POLYBENCH_2MM_BODY4; for (Index_type j=0; j < nj; ++j) { diff --git a/src/polybench/POLYBENCH_2MM-Hip.cpp b/src/polybench/POLYBENCH_2MM-Hip.cpp index 10b3eee61..4358b061d 100644 --- a/src/polybench/POLYBENCH_2MM-Hip.cpp +++ b/src/polybench/POLYBENCH_2MM-Hip.cpp @@ -27,6 +27,9 @@ namespace polybench #define in_block_sz (32) #define out_block_sz (block_size / in_block_sz) +#define POLY_2MM_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \ + in_block_sz, out_block_sz + #define POLY_2MM_THREADS_PER_BLOCK_HIP \ dim3 nthreads_per_block(in_block_sz, out_block_sz, 1); @@ -57,12 +60,14 @@ namespace polybench deallocHipDeviceData(C); \ deallocHipDeviceData(D); +template < size_t in_block_size, size_t out_block_size > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_2mm_1(Real_ptr tmp, Real_ptr A, Real_ptr B, Real_type alpha, Index_type ni, Index_type nj, Index_type nk) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * out_block_size + threadIdx.y; + Index_type j = blockIdx.x * in_block_size + threadIdx.x; if ( i < ni && j < nj ) { POLYBENCH_2MM_BODY1; @@ -73,24 +78,27 @@ __global__ void poly_2mm_1(Real_ptr tmp, Real_ptr A, Real_ptr B, } } -template< typename Lambda > +template < size_t in_block_size, size_t out_block_size, typename Lambda > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_2mm_1_lam(Index_type ni, Index_type nj, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * out_block_size + threadIdx.y; + Index_type j = blockIdx.x * in_block_size + threadIdx.x; if ( i < ni && j < nj ) { body(i, j); } } +template < size_t in_block_size, size_t out_block_size > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_2mm_2(Real_ptr tmp, Real_ptr C, Real_ptr D, Real_type beta, Index_type ni, Index_type nl, Index_type nj) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type l = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * out_block_size + threadIdx.y; + Index_type l = blockIdx.x * in_block_size + threadIdx.x; if ( i < ni && l < nl ) { POLYBENCH_2MM_BODY4; @@ -101,12 +109,13 @@ __global__ void poly_2mm_2(Real_ptr tmp, Real_ptr C, Real_ptr D, } } -template< typename Lambda > +template < size_t in_block_size, size_t out_block_size, typename Lambda > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_2mm_2_lam(Index_type ni, Index_type nl, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type l = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * out_block_size + threadIdx.y; + Index_type l = blockIdx.x * in_block_size + threadIdx.x; if ( i < ni && l < nl ) { body(i, l); @@ -131,14 +140,14 @@ void POLYBENCH_2MM::runHipVariantImpl(VariantID vid) POLY_2MM_THREADS_PER_BLOCK_HIP; POLY_2MM_1_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_2mm_1), + hipLaunchKernelGGL((poly_2mm_1), dim3(nblocks1), dim3(nthreads_per_block), 0, 0, tmp, A, B, alpha, ni, nj, nk); hipErrchk( hipGetLastError() ); POLY_2MM_2_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_2mm_2), + hipLaunchKernelGGL((poly_2mm_2), dim3(nblocks2), dim3(nthreads_per_block), 0, 0, tmp, C, D, beta, ni, nl, nj); @@ -167,7 +176,7 @@ void POLYBENCH_2MM::runHipVariantImpl(VariantID vid) }; POLY_2MM_1_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_2mm_1_lam), + 
hipLaunchKernelGGL((poly_2mm_1_lam), dim3(nblocks1), dim3(nthreads_per_block), 0, 0, ni, nj, poly_2mm_1_lambda); hipErrchk( hipGetLastError() ); @@ -181,7 +190,7 @@ void POLYBENCH_2MM::runHipVariantImpl(VariantID vid) }; POLY_2MM_2_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_2mm_2_lam), + hipLaunchKernelGGL((poly_2mm_2_lam), dim3(nblocks2), dim3(nthreads_per_block), 0, 0, ni, nl, poly_2mm_2_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/polybench/POLYBENCH_3MM-Cuda.cpp b/src/polybench/POLYBENCH_3MM-Cuda.cpp index e9df41d90..4726b98f3 100644 --- a/src/polybench/POLYBENCH_3MM-Cuda.cpp +++ b/src/polybench/POLYBENCH_3MM-Cuda.cpp @@ -27,8 +27,11 @@ namespace polybench #define in_block_sz (32) #define out_block_sz (block_size / in_block_sz) +#define POLY_3MM_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \ + in_block_sz, out_block_sz + #define POLY_3MM_THREADS_PER_BLOCK_CUDA \ - dim3 nthreads_per_block(in_block_sz, out_block_sz, 1); + dim3 nthreads_per_block(POLY_3MM_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA, 1); #define POLY_3MM_1_NBLOCKS_CUDA \ dim3 nblocks1(static_cast(RAJA_DIVIDE_CEILING_INT(nj, in_block_sz)), \ @@ -66,11 +69,13 @@ namespace polybench deallocCudaDeviceData(F); \ deallocCudaDeviceData(G); +template < size_t in_block_size, size_t out_block_size > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_3mm_1(Real_ptr E, Real_ptr A, Real_ptr B, Index_type ni, Index_type nj, Index_type nk) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * out_block_size + threadIdx.y; + Index_type j = blockIdx.x * in_block_size + threadIdx.x; if ( i < ni && j < nj ) { POLYBENCH_3MM_BODY1; @@ -81,23 +86,26 @@ __global__ void poly_3mm_1(Real_ptr E, Real_ptr A, Real_ptr B, } } -template< typename Lambda > +template < size_t in_block_size, size_t out_block_size, typename Lambda > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_3mm_1_lam(Index_type ni, Index_type nj, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * out_block_size + threadIdx.y; + Index_type j = blockIdx.x * in_block_size + threadIdx.x; if ( i < ni && j < nj ) { body(i, j); } } +template < size_t in_block_size, size_t out_block_size > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_3mm_2(Real_ptr F, Real_ptr C, Real_ptr D, Index_type nj, Index_type nl, Index_type nm) { - Index_type j = blockIdx.y * blockDim.y + threadIdx.y; - Index_type l = blockIdx.x * blockDim.x + threadIdx.x; + Index_type j = blockIdx.y * out_block_size + threadIdx.y; + Index_type l = blockIdx.x * in_block_size + threadIdx.x; if ( j < nj && l < nl ) { POLYBENCH_3MM_BODY4; @@ -108,23 +116,26 @@ __global__ void poly_3mm_2(Real_ptr F, Real_ptr C, Real_ptr D, } } -template< typename Lambda > +template < size_t in_block_size, size_t out_block_size, typename Lambda > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_3mm_2_lam(Index_type nj, Index_type nl, Lambda body) { - Index_type j = blockIdx.y * blockDim.y + threadIdx.y; - Index_type l = blockIdx.x * blockDim.x + threadIdx.x; + Index_type j = blockIdx.y * out_block_size + threadIdx.y; + Index_type l = blockIdx.x * in_block_size + threadIdx.x; if ( j < nj && l < nl ) { body(j, l); } } +template < size_t in_block_size, size_t out_block_size > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_3mm_3(Real_ptr G, Real_ptr E, Real_ptr 
F, Index_type ni, Index_type nl, Index_type nj) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type l = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * out_block_size + threadIdx.y; + Index_type l = blockIdx.x * in_block_size + threadIdx.x; if ( i < ni && l < nl ) { POLYBENCH_3MM_BODY7; @@ -135,12 +146,13 @@ __global__ void poly_3mm_3(Real_ptr G, Real_ptr E, Real_ptr F, } } -template< typename Lambda > +template < size_t in_block_size, size_t out_block_size, typename Lambda > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_3mm_3_lam(Index_type ni, Index_type nl, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type l = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * out_block_size + threadIdx.y; + Index_type l = blockIdx.x * in_block_size + threadIdx.x; if ( i < ni && l < nl ) { body(i, l); @@ -166,17 +178,20 @@ void POLYBENCH_3MM::runCudaVariantImpl(VariantID vid) POLY_3MM_THREADS_PER_BLOCK_CUDA; POLY_3MM_1_NBLOCKS_CUDA; - poly_3mm_1<<>>(E, A, B, + poly_3mm_1 + <<>>(E, A, B, ni, nj, nk); cudaErrchk( cudaGetLastError() ); POLY_3MM_2_NBLOCKS_CUDA; - poly_3mm_2<<>>(F, C, D, + poly_3mm_2 + <<>>(F, C, D, nj, nl, nm); cudaErrchk( cudaGetLastError() ); POLY_3MM_3_NBLOCKS_CUDA; - poly_3mm_3<<>>(G, E, F, + poly_3mm_3 + <<>>(G, E, F, ni, nl, nj); cudaErrchk( cudaGetLastError() ); @@ -195,7 +210,8 @@ void POLYBENCH_3MM::runCudaVariantImpl(VariantID vid) POLY_3MM_THREADS_PER_BLOCK_CUDA; POLY_3MM_1_NBLOCKS_CUDA; - poly_3mm_1_lam<<>>(ni, nj, + poly_3mm_1_lam + <<>>(ni, nj, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_3MM_BODY1; for (Index_type k=0; k < nk; ++k) { @@ -207,7 +223,8 @@ void POLYBENCH_3MM::runCudaVariantImpl(VariantID vid) cudaErrchk( cudaGetLastError() ); POLY_3MM_2_NBLOCKS_CUDA; - poly_3mm_2_lam<<>>(nj, nl, + poly_3mm_2_lam + <<>>(nj, nl, [=] __device__ (Index_type j, Index_type l) { POLYBENCH_3MM_BODY4; for (Index_type m=0; m < nm; ++m) { @@ -219,7 +236,8 @@ void POLYBENCH_3MM::runCudaVariantImpl(VariantID vid) cudaErrchk( cudaGetLastError() ); POLY_3MM_3_NBLOCKS_CUDA; - poly_3mm_3_lam<<>>(ni, nl, + poly_3mm_3_lam + <<>>(ni, nl, [=] __device__ (Index_type i, Index_type l) { POLYBENCH_3MM_BODY7; for (Index_type j=0; j < nj; ++j) { diff --git a/src/polybench/POLYBENCH_3MM-Hip.cpp b/src/polybench/POLYBENCH_3MM-Hip.cpp index db3df8404..6df41cc52 100644 --- a/src/polybench/POLYBENCH_3MM-Hip.cpp +++ b/src/polybench/POLYBENCH_3MM-Hip.cpp @@ -27,8 +27,11 @@ namespace polybench #define in_block_sz (32) #define out_block_sz (block_size / in_block_sz) +#define POLY_3MM_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \ + in_block_sz, out_block_sz + #define POLY_3MM_THREADS_PER_BLOCK_HIP \ - dim3 nthreads_per_block(in_block_sz, out_block_sz, 1); + dim3 nthreads_per_block(POLY_3MM_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP, 1); #define POLY_3MM_1_NBLOCKS_HIP \ dim3 nblocks1(static_cast(RAJA_DIVIDE_CEILING_INT(nj, in_block_sz)), \ @@ -66,11 +69,13 @@ namespace polybench deallocHipDeviceData(F); \ deallocHipDeviceData(G); +template < size_t in_block_size, size_t out_block_size > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_3mm_1(Real_ptr E, Real_ptr A, Real_ptr B, Index_type ni, Index_type nj, Index_type nk) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * out_block_size + threadIdx.y; + Index_type j = blockIdx.x * in_block_size + threadIdx.x; if ( i < ni && j < nj ) { 
POLYBENCH_3MM_BODY1; @@ -81,23 +86,26 @@ __global__ void poly_3mm_1(Real_ptr E, Real_ptr A, Real_ptr B, } } -template< typename Lambda > +template < size_t in_block_size, size_t out_block_size, typename Lambda > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_3mm_1_lam(Index_type ni, Index_type nj, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * out_block_size + threadIdx.y; + Index_type j = blockIdx.x * in_block_size + threadIdx.x; if ( i < ni && j < nj ) { body(i, j); } } +template < size_t in_block_size, size_t out_block_size > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_3mm_2(Real_ptr F, Real_ptr C, Real_ptr D, Index_type nj, Index_type nl, Index_type nm) { - Index_type j = blockIdx.y * blockDim.y + threadIdx.y; - Index_type l = blockIdx.x * blockDim.x + threadIdx.x; + Index_type j = blockIdx.y * out_block_size + threadIdx.y; + Index_type l = blockIdx.x * in_block_size + threadIdx.x; if ( j < nj && l < nl ) { POLYBENCH_3MM_BODY4; @@ -108,23 +116,26 @@ __global__ void poly_3mm_2(Real_ptr F, Real_ptr C, Real_ptr D, } } -template< typename Lambda > +template < size_t in_block_size, size_t out_block_size, typename Lambda > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_3mm_2_lam(Index_type nj, Index_type nl, Lambda body) { - Index_type j = blockIdx.y * blockDim.y + threadIdx.y; - Index_type l = blockIdx.x * blockDim.x + threadIdx.x; + Index_type j = blockIdx.y * out_block_size + threadIdx.y; + Index_type l = blockIdx.x * in_block_size + threadIdx.x; if ( j < nj && l < nl ) { body(j, l); } } +template < size_t in_block_size, size_t out_block_size > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_3mm_3(Real_ptr G, Real_ptr E, Real_ptr F, Index_type ni, Index_type nl, Index_type nj) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type l = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * out_block_size + threadIdx.y; + Index_type l = blockIdx.x * in_block_size + threadIdx.x; if ( i < ni && l < nl ) { POLYBENCH_3MM_BODY7; @@ -135,12 +146,13 @@ __global__ void poly_3mm_3(Real_ptr G, Real_ptr E, Real_ptr F, } } -template< typename Lambda > +template < size_t in_block_size, size_t out_block_size, typename Lambda > +__launch_bounds__(in_block_size*out_block_size) __global__ void poly_3mm_3_lam(Index_type ni, Index_type nl, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type l = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * out_block_size + threadIdx.y; + Index_type l = blockIdx.x * in_block_size + threadIdx.x; if ( i < ni && l < nl ) { body(i, l); @@ -165,21 +177,21 @@ void POLYBENCH_3MM::runHipVariantImpl(VariantID vid) POLY_3MM_THREADS_PER_BLOCK_HIP; POLY_3MM_1_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_3mm_1), + hipLaunchKernelGGL((poly_3mm_1), dim3(nblocks1) , dim3(nthreads_per_block), 0, 0, E, A, B, ni, nj, nk); hipErrchk( hipGetLastError() ); POLY_3MM_2_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_3mm_2), + hipLaunchKernelGGL((poly_3mm_2), dim3(nblocks2), dim3(nthreads_per_block), 0, 0, F, C, D, nj, nl, nm); hipErrchk( hipGetLastError() ); POLY_3MM_3_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_3mm_3), + hipLaunchKernelGGL((poly_3mm_3), dim3(nblocks3), dim3(nthreads_per_block), 0, 0, G, E, F, ni, nl, nj); @@ -208,7 +220,7 @@ void POLYBENCH_3MM::runHipVariantImpl(VariantID vid) }; POLY_3MM_1_NBLOCKS_HIP; - 
hipLaunchKernelGGL((poly_3mm_1_lam), + hipLaunchKernelGGL((poly_3mm_1_lam), dim3(nblocks1), dim3(nthreads_per_block), 0, 0, ni, nj, poly_3mm_1_lambda); hipErrchk( hipGetLastError() ); @@ -222,7 +234,7 @@ void POLYBENCH_3MM::runHipVariantImpl(VariantID vid) }; POLY_3MM_2_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_3mm_2_lam), + hipLaunchKernelGGL((poly_3mm_2_lam), dim3(nblocks2), dim3(nthreads_per_block), 0, 0, nj, nl, poly_3mm_2_lambda); hipErrchk( hipGetLastError() ); @@ -236,7 +248,7 @@ void POLYBENCH_3MM::runHipVariantImpl(VariantID vid) }; POLY_3MM_3_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_3mm_3_lam), + hipLaunchKernelGGL((poly_3mm_3_lam), dim3(nblocks3), dim3(nthreads_per_block), 0, 0, ni, nl, poly_3mm_3_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/polybench/POLYBENCH_ADI-Cuda.cpp b/src/polybench/POLYBENCH_ADI-Cuda.cpp index e6c46aed1..8b6ef76fa 100644 --- a/src/polybench/POLYBENCH_ADI-Cuda.cpp +++ b/src/polybench/POLYBENCH_ADI-Cuda.cpp @@ -35,12 +35,14 @@ namespace polybench deallocCudaDeviceData(Q); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void adi1(const Index_type n, const Real_type a, const Real_type b, const Real_type c, const Real_type d, const Real_type f, Real_ptr P, Real_ptr Q, Real_ptr U, Real_ptr V) { - Index_type i = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = 1 + blockIdx.x * block_size + threadIdx.x; if (i < n-1) { POLYBENCH_ADI_BODY2; for (Index_type j = 1; j < n-1; ++j) { @@ -53,12 +55,14 @@ __global__ void adi1(const Index_type n, } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void adi2(const Index_type n, const Real_type a, const Real_type c, const Real_type d, const Real_type e, const Real_type f, Real_ptr P, Real_ptr Q, Real_ptr U, Real_ptr V) { - Index_type i = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = 1 + blockIdx.x * block_size + threadIdx.x; if (i < n-1) { POLYBENCH_ADI_BODY6; for (Index_type j = 1; j < n-1; ++j) { @@ -71,11 +75,12 @@ __global__ void adi2(const Index_type n, } } -template< typename Lambda > +template < size_t block_size, typename Lambda > +__launch_bounds__(block_size) __global__ void adi_lam(const Index_type n, Lambda body) { - Index_type i = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = 1 + blockIdx.x * block_size + threadIdx.x; if (i < n-1) { body(i); } @@ -100,12 +105,12 @@ void POLYBENCH_ADI::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(n-2, block_size); - adi1<<>>(n, + adi1<<>>(n, a, b, c, d, f, P, Q, U, V); cudaErrchk( cudaGetLastError() ); - adi2<<>>(n, + adi2<<>>(n, a, c, d, e, f, P, Q, U, V); cudaErrchk( cudaGetLastError() ); @@ -128,7 +133,7 @@ void POLYBENCH_ADI::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(n-2, block_size); - adi_lam<<>>(n, + adi_lam<<>>(n, [=] __device__ (Index_type i) { POLYBENCH_ADI_BODY2; for (Index_type j = 1; j < n-1; ++j) { @@ -142,7 +147,7 @@ void POLYBENCH_ADI::runCudaVariantImpl(VariantID vid) ); cudaErrchk( cudaGetLastError() ); - adi_lam<<>>(n, + adi_lam<<>>(n, [=] __device__ (Index_type i) { POLYBENCH_ADI_BODY6; for (Index_type j = 1; j < n-1; ++j) { diff --git a/src/polybench/POLYBENCH_ADI-Hip.cpp b/src/polybench/POLYBENCH_ADI-Hip.cpp index 8d0deceef..44fddf3bd 100644 --- a/src/polybench/POLYBENCH_ADI-Hip.cpp +++ b/src/polybench/POLYBENCH_ADI-Hip.cpp @@ -36,12 +36,14 @@ namespace polybench deallocHipDeviceData(Q); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void adi1(const 
Index_type n, const Real_type a, const Real_type b, const Real_type c, const Real_type d, const Real_type f, Real_ptr P, Real_ptr Q, Real_ptr U, Real_ptr V) { - Index_type i = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = 1 + blockIdx.x * block_size + threadIdx.x; if (i < n-1) { POLYBENCH_ADI_BODY2; for (Index_type j = 1; j < n-1; ++j) { @@ -54,12 +56,14 @@ __global__ void adi1(const Index_type n, } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void adi2(const Index_type n, const Real_type a, const Real_type c, const Real_type d, const Real_type e, const Real_type f, Real_ptr P, Real_ptr Q, Real_ptr U, Real_ptr V) { - Index_type i = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = 1 + blockIdx.x * block_size + threadIdx.x; if (i < n-1) { POLYBENCH_ADI_BODY6; for (Index_type j = 1; j < n-1; ++j) { @@ -72,11 +76,12 @@ __global__ void adi2(const Index_type n, } } -template< typename Lambda > +template < size_t block_size, typename Lambda > +__launch_bounds__(block_size) __global__ void adi_lam(const Index_type n, Lambda body) { - Index_type i = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = 1 + blockIdx.x * block_size + threadIdx.x; if (i < n-1) { body(i); } @@ -101,14 +106,14 @@ void POLYBENCH_ADI::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(n-2, block_size); - hipLaunchKernelGGL((adi1), + hipLaunchKernelGGL((adi1), dim3(grid_size), dim3(block_size), 0, 0, n, a, b, c, d, f, P, Q, U, V); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((adi2), + hipLaunchKernelGGL((adi2), dim3(grid_size), dim3(block_size), 0, 0, n, a, c, d, e, f, @@ -144,7 +149,7 @@ void POLYBENCH_ADI::runHipVariantImpl(VariantID vid) } }; - hipLaunchKernelGGL((adi_lam), + hipLaunchKernelGGL((adi_lam), dim3(grid_size), dim3(block_size), 0, 0, n, adi1_lamda); hipErrchk( hipGetLastError() ); @@ -160,7 +165,7 @@ void POLYBENCH_ADI::runHipVariantImpl(VariantID vid) } }; - hipLaunchKernelGGL((adi_lam), + hipLaunchKernelGGL((adi_lam), dim3(grid_size), dim3(block_size), 0, 0, n, adi2_lamda); hipErrchk( hipGetLastError() ); diff --git a/src/polybench/POLYBENCH_ATAX-Cuda.cpp b/src/polybench/POLYBENCH_ATAX-Cuda.cpp index 87f26bf95..107401144 100644 --- a/src/polybench/POLYBENCH_ATAX-Cuda.cpp +++ b/src/polybench/POLYBENCH_ATAX-Cuda.cpp @@ -36,10 +36,12 @@ namespace polybench deallocCudaDeviceData(A); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_atax_1(Real_ptr A, Real_ptr x, Real_ptr y, Real_ptr tmp, Index_type N) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < N) { POLYBENCH_ATAX_BODY1; @@ -50,10 +52,12 @@ __global__ void poly_atax_1(Real_ptr A, Real_ptr x, Real_ptr y, Real_ptr tmp, } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_atax_2(Real_ptr A, Real_ptr tmp, Real_ptr y, Index_type N) { - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type j = blockIdx.x * block_size + threadIdx.x; if (j < N) { POLYBENCH_ATAX_BODY4; @@ -64,11 +68,12 @@ __global__ void poly_atax_2(Real_ptr A, Real_ptr tmp, Real_ptr y, } } -template< typename Lambda > +template < size_t block_size, typename Lambda > +__launch_bounds__(block_size) __global__ void poly_atax_lam(Index_type N, Lambda body) { - Index_type ti = blockIdx.x * blockDim.x + threadIdx.x; + Index_type ti = blockIdx.x * block_size + threadIdx.x; if (ti < N) { body(ti); @@ -92,10 +97,10 @@ void 
POLYBENCH_ATAX::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); - poly_atax_1<<>>(A, x, y, tmp, N); + poly_atax_1<<>>(A, x, y, tmp, N); cudaErrchk( cudaGetLastError() ); - poly_atax_2<<>>(A, tmp, y, N); + poly_atax_2<<>>(A, tmp, y, N); cudaErrchk( cudaGetLastError() ); } @@ -112,7 +117,7 @@ void POLYBENCH_ATAX::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); - poly_atax_lam<<>>(N, + poly_atax_lam<<>>(N, [=] __device__ (Index_type i) { POLYBENCH_ATAX_BODY1; for (Index_type j = 0; j < N; ++j ) { @@ -123,7 +128,7 @@ void POLYBENCH_ATAX::runCudaVariantImpl(VariantID vid) ); cudaErrchk( cudaGetLastError() ); - poly_atax_lam<<>>(N, + poly_atax_lam<<>>(N, [=] __device__ (Index_type j) { POLYBENCH_ATAX_BODY4; for (Index_type i = 0; i < N; ++i ) { diff --git a/src/polybench/POLYBENCH_ATAX-Hip.cpp b/src/polybench/POLYBENCH_ATAX-Hip.cpp index c709f14fe..82e50ce88 100644 --- a/src/polybench/POLYBENCH_ATAX-Hip.cpp +++ b/src/polybench/POLYBENCH_ATAX-Hip.cpp @@ -36,10 +36,12 @@ namespace polybench deallocHipDeviceData(A); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_atax_1(Real_ptr A, Real_ptr x, Real_ptr y, Real_ptr tmp, Index_type N) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < N) { POLYBENCH_ATAX_BODY1; @@ -50,10 +52,12 @@ __global__ void poly_atax_1(Real_ptr A, Real_ptr x, Real_ptr y, Real_ptr tmp, } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_atax_2(Real_ptr A, Real_ptr tmp, Real_ptr y, Index_type N) { - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type j = blockIdx.x * block_size + threadIdx.x; if (j < N) { POLYBENCH_ATAX_BODY4; @@ -64,11 +68,12 @@ __global__ void poly_atax_2(Real_ptr A, Real_ptr tmp, Real_ptr y, } } -template< typename Lambda > +template < size_t block_size, typename Lambda > +__launch_bounds__(block_size) __global__ void poly_atax_lam(Index_type N, Lambda body) { - Index_type ti = blockIdx.x * blockDim.x + threadIdx.x; + Index_type ti = blockIdx.x * block_size + threadIdx.x; if (ti < N) { body(ti); @@ -92,12 +97,12 @@ void POLYBENCH_ATAX::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); - hipLaunchKernelGGL((poly_atax_1), + hipLaunchKernelGGL((poly_atax_1), dim3(grid_size), dim3(block_size), 0, 0, A, x, y, tmp, N); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_atax_2), + hipLaunchKernelGGL((poly_atax_2), dim3(grid_size), dim3(block_size), 0, 0, A, tmp, y, N); hipErrchk( hipGetLastError() ); @@ -124,7 +129,7 @@ void POLYBENCH_ATAX::runHipVariantImpl(VariantID vid) POLYBENCH_ATAX_BODY3; }; - hipLaunchKernelGGL((poly_atax_lam), + hipLaunchKernelGGL((poly_atax_lam), dim3(grid_size), dim3(block_size), 0, 0, N, poly_atax_1_lambda); hipErrchk( hipGetLastError() ); @@ -137,7 +142,7 @@ void POLYBENCH_ATAX::runHipVariantImpl(VariantID vid) POLYBENCH_ATAX_BODY6; }; - hipLaunchKernelGGL((poly_atax_lam), + hipLaunchKernelGGL((poly_atax_lam), dim3(grid_size), dim3(block_size), 0, 0, N, poly_atax_2_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp b/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp index 70aab149c..653e5dfd5 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp @@ -27,8 +27,11 @@ namespace polybench #define j_block_sz (32) #define i_block_sz (block_size / 
j_block_sz) +#define FDTD_2D_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \ + j_block_sz, i_block_sz + #define FDTD_2D_THREADS_PER_BLOCK_CUDA \ - dim3 nthreads_per_block234(j_block_sz, i_block_sz, 1); + dim3 nthreads_per_block234(FDTD_2D_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA, 1); #define FDTD_2D_NBLOCKS_CUDA \ dim3 nblocks234(static_cast(RAJA_DIVIDE_CEILING_INT(ny, j_block_sz)), \ @@ -50,89 +53,101 @@ namespace polybench deallocCudaDeviceData(fict); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_fdtd2d_1(Real_ptr ey, Real_ptr fict, Index_type ny, Index_type t) { - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type j = blockIdx.x * block_size + threadIdx.x; if (j < ny) { POLYBENCH_FDTD_2D_BODY1; } } -template< typename Lambda > +template < size_t block_size, typename Lambda > +__launch_bounds__(block_size) __global__ void poly_fdtd2d_1_lam(Index_type ny, Lambda body) { - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type j = blockIdx.x * block_size + threadIdx.x; if (j < ny) { body(j); } } +template < size_t j_block_size, size_t i_block_size > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_fdtd2d_2(Real_ptr ey, Real_ptr hz, Index_type nx, Index_type ny) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if (i > 0 && i < nx && j < ny) { POLYBENCH_FDTD_2D_BODY2; } } -template< typename Lambda > +template < size_t j_block_size, size_t i_block_size, typename Lambda > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_fdtd2d_2_lam(Index_type nx, Index_type ny, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if (i > 0 && i < nx && j < ny) { body(i, j); } } +template < size_t j_block_size, size_t i_block_size > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_fdtd2d_3(Real_ptr ex, Real_ptr hz, Index_type nx, Index_type ny) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if (i < nx && j > 0 && j < ny) { POLYBENCH_FDTD_2D_BODY3; } } -template< typename Lambda > +template < size_t j_block_size, size_t i_block_size, typename Lambda > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_fdtd2d_3_lam(Index_type nx, Index_type ny, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if (i < nx && j > 0 && j < ny) { body(i, j); } } +template < size_t j_block_size, size_t i_block_size > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_fdtd2d_4(Real_ptr hz, Real_ptr ex, Real_ptr ey, Index_type nx, Index_type ny) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if (i < nx-1 && j < ny-1) { POLYBENCH_FDTD_2D_BODY4; } } -template< typename Lambda > 
+template < size_t j_block_size, size_t i_block_size, typename Lambda > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_fdtd2d_4_lam(Index_type nx, Index_type ny, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if (i < nx-1 && j < ny-1) { body(i, j); @@ -158,19 +173,22 @@ void POLYBENCH_FDTD_2D::runCudaVariantImpl(VariantID vid) const size_t grid_size1 = RAJA_DIVIDE_CEILING_INT(ny, block_size); - poly_fdtd2d_1<<>>(ey, fict, ny, t); + poly_fdtd2d_1<<>>(ey, fict, ny, t); cudaErrchk( cudaGetLastError() ); FDTD_2D_THREADS_PER_BLOCK_CUDA; FDTD_2D_NBLOCKS_CUDA; - poly_fdtd2d_2<<>>(ey, hz, nx, ny); + poly_fdtd2d_2 + <<>>(ey, hz, nx, ny); cudaErrchk( cudaGetLastError() ); - poly_fdtd2d_3<<>>(ex, hz, nx, ny); + poly_fdtd2d_3 + <<>>(ex, hz, nx, ny); cudaErrchk( cudaGetLastError() ); - poly_fdtd2d_4<<>>(hz, ex, ey, nx, ny); + poly_fdtd2d_4 + <<>>(hz, ex, ey, nx, ny); cudaErrchk( cudaGetLastError() ); } // tstep loop @@ -191,7 +209,7 @@ void POLYBENCH_FDTD_2D::runCudaVariantImpl(VariantID vid) const size_t grid_size1 = RAJA_DIVIDE_CEILING_INT(ny, block_size); - poly_fdtd2d_1_lam<<>>(ny, + poly_fdtd2d_1_lam<<>>(ny, [=] __device__ (Index_type j) { POLYBENCH_FDTD_2D_BODY1; } @@ -200,21 +218,24 @@ void POLYBENCH_FDTD_2D::runCudaVariantImpl(VariantID vid) FDTD_2D_THREADS_PER_BLOCK_CUDA; FDTD_2D_NBLOCKS_CUDA; - poly_fdtd2d_2_lam<<>>(nx, ny, + poly_fdtd2d_2_lam + <<>>(nx, ny, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_FDTD_2D_BODY2; } ); cudaErrchk( cudaGetLastError() ); - poly_fdtd2d_3_lam<<>>(nx, ny, + poly_fdtd2d_3_lam + <<>>(nx, ny, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_FDTD_2D_BODY3; } ); cudaErrchk( cudaGetLastError() ); - poly_fdtd2d_4_lam<<>>(nx, ny, + poly_fdtd2d_4_lam + <<>>(nx, ny, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_FDTD_2D_BODY4; } diff --git a/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp b/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp index b2493fc63..2685fe5e2 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp @@ -27,8 +27,11 @@ namespace polybench #define j_block_sz (32) #define i_block_sz (block_size / j_block_sz) +#define FDTD_2D_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \ + j_block_sz, i_block_sz + #define FDTD_2D_THREADS_PER_BLOCK_HIP \ - dim3 nthreads_per_block234(j_block_sz, i_block_sz, 1); + dim3 nthreads_per_block234(FDTD_2D_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP, 1); #define FDTD_2D_NBLOCKS_HIP \ dim3 nblocks234(static_cast(RAJA_DIVIDE_CEILING_INT(ny, j_block_sz)), \ @@ -49,89 +52,101 @@ namespace polybench deallocHipDeviceData(fict); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_fdtd2d_1(Real_ptr ey, Real_ptr fict, Index_type ny, Index_type t) { - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type j = blockIdx.x * block_size + threadIdx.x; if (j < ny) { POLYBENCH_FDTD_2D_BODY1; } } -template< typename Lambda > +template < size_t block_size, typename Lambda > +__launch_bounds__(block_size) __global__ void poly_fdtd2d_1_lam(Index_type ny, Lambda body) { - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type j = blockIdx.x * block_size + threadIdx.x; if (j < ny) { body(j); } } +template < size_t j_block_size, size_t i_block_size > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_fdtd2d_2(Real_ptr ey, Real_ptr 
hz, Index_type nx, Index_type ny) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if (i > 0 && i < nx && j < ny) { POLYBENCH_FDTD_2D_BODY2; } } -template< typename Lambda > +template < size_t j_block_size, size_t i_block_size, typename Lambda > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_fdtd2d_2_lam(Index_type nx, Index_type ny, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if (i > 0 && i < nx && j < ny) { body(i, j); } } +template < size_t j_block_size, size_t i_block_size > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_fdtd2d_3(Real_ptr ex, Real_ptr hz, Index_type nx, Index_type ny) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if (i < nx && j > 0 && j < ny) { POLYBENCH_FDTD_2D_BODY3; } } -template< typename Lambda > +template < size_t j_block_size, size_t i_block_size, typename Lambda > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_fdtd2d_3_lam(Index_type nx, Index_type ny, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if (i < nx && j > 0 && j < ny) { body(i, j); } } +template < size_t j_block_size, size_t i_block_size > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_fdtd2d_4(Real_ptr hz, Real_ptr ex, Real_ptr ey, Index_type nx, Index_type ny) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if (i < nx-1 && j < ny-1) { POLYBENCH_FDTD_2D_BODY4; } } -template< typename Lambda > +template < size_t j_block_size, size_t i_block_size, typename Lambda > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_fdtd2d_4_lam(Index_type nx, Index_type ny, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if (i < nx-1 && j < ny-1) { body(i, j); @@ -156,7 +171,7 @@ void POLYBENCH_FDTD_2D::runHipVariantImpl(VariantID vid) for (t = 0; t < tsteps; ++t) { const size_t grid_size1 = RAJA_DIVIDE_CEILING_INT(ny, block_size); - hipLaunchKernelGGL((poly_fdtd2d_1), + hipLaunchKernelGGL((poly_fdtd2d_1), dim3(grid_size1), dim3(block_size), 0, 0, ey, fict, ny, t); hipErrchk( hipGetLastError() ); @@ -164,17 +179,17 @@ void POLYBENCH_FDTD_2D::runHipVariantImpl(VariantID vid) FDTD_2D_THREADS_PER_BLOCK_HIP; FDTD_2D_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_fdtd2d_2), + hipLaunchKernelGGL((poly_fdtd2d_2), dim3(nblocks234), dim3(nthreads_per_block234), 0, 0, ey, hz, nx, ny); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_fdtd2d_3), + hipLaunchKernelGGL((poly_fdtd2d_3), dim3(nblocks234), 
dim3(nthreads_per_block234), 0, 0, ex, hz, nx, ny); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_fdtd2d_4), + hipLaunchKernelGGL((poly_fdtd2d_4), dim3(nblocks234), dim3(nthreads_per_block234), 0, 0, hz, ex, ey, nx, ny); hipErrchk( hipGetLastError() ); @@ -201,7 +216,7 @@ void POLYBENCH_FDTD_2D::runHipVariantImpl(VariantID vid) POLYBENCH_FDTD_2D_BODY1; }; - hipLaunchKernelGGL(poly_fdtd2d_1_lam, + hipLaunchKernelGGL((poly_fdtd2d_1_lam), dim3(grid_size1), dim3(block_size), 0, 0, ny, poly_fdtd2d_1_lambda); hipErrchk( hipGetLastError() ); @@ -214,7 +229,7 @@ void POLYBENCH_FDTD_2D::runHipVariantImpl(VariantID vid) POLYBENCH_FDTD_2D_BODY2; }; - hipLaunchKernelGGL((poly_fdtd2d_2_lam), + hipLaunchKernelGGL((poly_fdtd2d_2_lam), dim3(nblocks234), dim3(nthreads_per_block234), 0, 0, nx, ny, poly_fdtd2d_2_lambda); hipErrchk( hipGetLastError() ); @@ -224,7 +239,7 @@ void POLYBENCH_FDTD_2D::runHipVariantImpl(VariantID vid) POLYBENCH_FDTD_2D_BODY3; }; - hipLaunchKernelGGL((poly_fdtd2d_3_lam), + hipLaunchKernelGGL((poly_fdtd2d_3_lam), dim3(nblocks234), dim3(nthreads_per_block234), 0, 0, nx, ny, poly_fdtd2d_3_lambda); hipErrchk( hipGetLastError() ); @@ -234,7 +249,7 @@ void POLYBENCH_FDTD_2D::runHipVariantImpl(VariantID vid) POLYBENCH_FDTD_2D_BODY4; }; - hipLaunchKernelGGL((poly_fdtd2d_4_lam), + hipLaunchKernelGGL((poly_fdtd2d_4_lam), dim3(nblocks234), dim3(nthreads_per_block234), 0, 0, nx, ny, poly_fdtd2d_4_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp index e0d52d4f6..cf1af3ec2 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp @@ -25,8 +25,11 @@ namespace polybench #define j_block_sz (32) #define i_block_sz (block_size / j_block_sz) +#define POLY_FLOYD_WARSHALL_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \ + j_block_sz, i_block_sz + #define POLY_FLOYD_WARSHALL_THREADS_PER_BLOCK_CUDA \ - dim3 nthreads_per_block(j_block_sz, i_block_sz, 1); + dim3 nthreads_per_block(POLY_FLOYD_WARSHALL_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA, 1); #define POLY_FLOYD_WARSHALL_NBLOCKS_CUDA \ dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(N, j_block_sz)), \ @@ -45,24 +48,27 @@ namespace polybench deallocCudaDeviceData(pout); +template < size_t j_block_size, size_t i_block_size > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_floyd_warshall(Real_ptr pout, Real_ptr pin, Index_type k, Index_type N) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if ( i < N && j < N ) { POLYBENCH_FLOYD_WARSHALL_BODY; } } -template< typename Lambda > +template < size_t j_block_size, size_t i_block_size, typename Lambda > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_floyd_warshall_lam(Index_type N, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if ( i < N && j < N ) { body(i, j); @@ -89,7 +95,8 @@ void POLYBENCH_FLOYD_WARSHALL::runCudaVariantImpl(VariantID vid) POLY_FLOYD_WARSHALL_THREADS_PER_BLOCK_CUDA; POLY_FLOYD_WARSHALL_NBLOCKS_CUDA; - poly_floyd_warshall<<>>(pout, pin, + poly_floyd_warshall + <<>>(pout, pin, k, N); cudaErrchk( cudaGetLastError() ); @@ 
-112,7 +119,8 @@ void POLYBENCH_FLOYD_WARSHALL::runCudaVariantImpl(VariantID vid) POLY_FLOYD_WARSHALL_THREADS_PER_BLOCK_CUDA; POLY_FLOYD_WARSHALL_NBLOCKS_CUDA; - poly_floyd_warshall_lam<<>>(N, + poly_floyd_warshall_lam + <<>>(N, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_FLOYD_WARSHALL_BODY; } diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp index 7dd03d9be..25538abcf 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp @@ -25,8 +25,11 @@ namespace polybench #define j_block_sz (32) #define i_block_sz (block_size / j_block_sz) +#define POLY_FLOYD_WARSHALL_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \ + j_block_sz, i_block_sz + #define POLY_FLOYD_WARSHALL_THREADS_PER_BLOCK_HIP \ - dim3 nthreads_per_block(j_block_sz, i_block_sz, 1); + dim3 nthreads_per_block(POLY_FLOYD_WARSHALL_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP, 1); #define POLY_FLOYD_WARSHALL_NBLOCKS_HIP \ dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(N, j_block_sz)), \ @@ -44,24 +47,27 @@ namespace polybench deallocHipDeviceData(pin); \ deallocHipDeviceData(pout); +template < size_t j_block_size, size_t i_block_size > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_floyd_warshall(Real_ptr pout, Real_ptr pin, Index_type k, Index_type N) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if ( i < N && j < N ) { POLYBENCH_FLOYD_WARSHALL_BODY; } } -template< typename Lambda > +template < size_t j_block_size, size_t i_block_size, typename Lambda > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_floyd_warshall_lam(Index_type N, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if ( i < N && j < N ) { body(i, j); @@ -88,7 +94,7 @@ void POLYBENCH_FLOYD_WARSHALL::runHipVariantImpl(VariantID vid) POLY_FLOYD_WARSHALL_THREADS_PER_BLOCK_HIP; POLY_FLOYD_WARSHALL_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_floyd_warshall), + hipLaunchKernelGGL((poly_floyd_warshall), dim3(nblocks), dim3(nthreads_per_block), 0, 0, pout, pin, k, N); @@ -119,7 +125,7 @@ void POLYBENCH_FLOYD_WARSHALL::runHipVariantImpl(VariantID vid) POLY_FLOYD_WARSHALL_NBLOCKS_HIP; hipLaunchKernelGGL( - (poly_floyd_warshall_lam), + (poly_floyd_warshall_lam), dim3(nblocks), dim3(nthreads_per_block), 0, 0, N, poly_floyd_warshall_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/polybench/POLYBENCH_GEMM-Cuda.cpp b/src/polybench/POLYBENCH_GEMM-Cuda.cpp index deaeb7af3..a87425ebb 100644 --- a/src/polybench/POLYBENCH_GEMM-Cuda.cpp +++ b/src/polybench/POLYBENCH_GEMM-Cuda.cpp @@ -27,8 +27,11 @@ namespace polybench #define j_block_sz (32) #define i_block_sz (block_size / j_block_sz) +#define POLY_GEMM_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \ + j_block_sz, i_block_sz + #define POLY_GEMM_THREADS_PER_BLOCK_CUDA \ - dim3 nthreads_per_block(j_block_sz, i_block_sz, 1); + dim3 nthreads_per_block(POLY_GEMM_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA, 1); #define POLY_GEMM_NBLOCKS_CUDA \ dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(nj, j_block_sz)), \ @@ -49,12 +52,14 @@ namespace polybench deallocCudaDeviceData(C); +template < size_t j_block_size, size_t i_block_size > 
+__launch_bounds__(j_block_size*i_block_size) __global__ void poly_gemm(Real_ptr C, Real_ptr A, Real_ptr B, Real_type alpha, Real_type beta, Index_type ni, Index_type nj, Index_type nk) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if ( i < ni && j < nj ) { POLYBENCH_GEMM_BODY1; @@ -66,12 +71,13 @@ __global__ void poly_gemm(Real_ptr C, Real_ptr A, Real_ptr B, } } -template< typename Lambda > +template < size_t j_block_size, size_t i_block_size, typename Lambda > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_gemm_lam(Index_type ni, Index_type nj, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if ( i < ni && j < nj ) { body(i, j); @@ -96,7 +102,8 @@ void POLYBENCH_GEMM::runCudaVariantImpl(VariantID vid) POLY_GEMM_THREADS_PER_BLOCK_CUDA; POLY_GEMM_NBLOCKS_CUDA; - poly_gemm<<>>(C, A, B, + poly_gemm + <<>>(C, A, B, alpha, beta, ni, nj, nk); cudaErrchk( cudaGetLastError() ); @@ -116,7 +123,8 @@ void POLYBENCH_GEMM::runCudaVariantImpl(VariantID vid) POLY_GEMM_THREADS_PER_BLOCK_CUDA; POLY_GEMM_NBLOCKS_CUDA; - poly_gemm_lam<<>>(ni, nj, + poly_gemm_lam + <<>>(ni, nj, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_GEMM_BODY1; POLYBENCH_GEMM_BODY2; diff --git a/src/polybench/POLYBENCH_GEMM-Hip.cpp b/src/polybench/POLYBENCH_GEMM-Hip.cpp index 9baf86628..e147188e4 100644 --- a/src/polybench/POLYBENCH_GEMM-Hip.cpp +++ b/src/polybench/POLYBENCH_GEMM-Hip.cpp @@ -27,8 +27,11 @@ namespace polybench #define j_block_sz (32) #define i_block_sz (block_size / j_block_sz) +#define POLY_GEMM_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \ + j_block_sz, i_block_sz + #define POLY_GEMM_THREADS_PER_BLOCK_HIP \ - dim3 nthreads_per_block(j_block_sz, i_block_sz, 1); + dim3 nthreads_per_block(POLY_GEMM_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP, 1); #define POLY_GEMM_NBLOCKS_HIP \ dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(nj, j_block_sz)), \ @@ -49,6 +52,8 @@ namespace polybench deallocHipDeviceData(C); +template < size_t j_block_size, size_t i_block_size > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_gemm(Real_ptr C, Real_ptr A, Real_ptr B, Real_type alpha, Real_type beta, Index_type ni, Index_type nj, Index_type nk) @@ -66,7 +71,8 @@ __global__ void poly_gemm(Real_ptr C, Real_ptr A, Real_ptr B, } } -template< typename Lambda > +template < size_t j_block_size, size_t i_block_size, typename Lambda > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_gemm_lam(Index_type ni, Index_type nj, Lambda body) { @@ -96,7 +102,7 @@ void POLYBENCH_GEMM::runHipVariantImpl(VariantID vid) POLY_GEMM_THREADS_PER_BLOCK_HIP; POLY_GEMM_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_gemm), + hipLaunchKernelGGL((poly_gemm), dim3(nblocks), dim3(nthreads_per_block), 0, 0, C, A, B, alpha, beta, ni, nj, nk); @@ -126,7 +132,7 @@ void POLYBENCH_GEMM::runHipVariantImpl(VariantID vid) POLYBENCH_GEMM_BODY4; }; - hipLaunchKernelGGL((poly_gemm_lam), + hipLaunchKernelGGL((poly_gemm_lam), dim3(nblocks), dim3(nthreads_per_block), 0, 0, ni, nj, poly_gemm_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/polybench/POLYBENCH_GEMVER-Cuda.cpp b/src/polybench/POLYBENCH_GEMVER-Cuda.cpp index 7c7763955..edaef8e08 100644 --- 
a/src/polybench/POLYBENCH_GEMVER-Cuda.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Cuda.cpp @@ -27,8 +27,11 @@ namespace polybench #define j_block_sz (32) #define i_block_sz (block_size / j_block_sz) +#define GEMVER_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \ + j_block_sz, i_block_sz + #define GEMVER_THREADS_PER_BLOCK_CUDA \ - dim3 nthreads_per_block1(j_block_sz, i_block_sz, 1); + dim3 nthreads_per_block1(GEMVER_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA, 1); #define GEMVER_NBLOCKS_CUDA \ dim3 nblocks1(static_cast(RAJA_DIVIDE_CEILING_INT(n, j_block_sz)), \ @@ -60,36 +63,41 @@ namespace polybench deallocCudaDeviceData(y); \ deallocCudaDeviceData(z); +template < size_t j_block_size, size_t i_block_size > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_gemmver_1(Real_ptr A, Real_ptr u1, Real_ptr v1, Real_ptr u2, Real_ptr v2, Index_type n) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if (i < n && j < n) { POLYBENCH_GEMVER_BODY1; } } -template< typename Lambda > +template < size_t j_block_size, size_t i_block_size, typename Lambda > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_gemmver_1_lam(Index_type n, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if (i < n && j < n) { body(i, j); } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_gemmver_2(Real_ptr A, Real_ptr x, Real_ptr y, Real_type beta, Index_type n) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { POLYBENCH_GEMVER_BODY2; for (Index_type j = 0; j < n; ++j) { @@ -99,21 +107,25 @@ __global__ void poly_gemmver_2(Real_ptr A, } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_gemmver_3(Real_ptr x, Real_ptr z, Index_type n) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { POLYBENCH_GEMVER_BODY5; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_gemmver_4(Real_ptr A, Real_ptr x, Real_ptr w, Real_type alpha, Index_type n) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { POLYBENCH_GEMVER_BODY6; for (Index_type j = 0; j < n; ++j) { @@ -123,10 +135,11 @@ __global__ void poly_gemmver_4(Real_ptr A, } } -template< typename Lambda > +template < size_t block_size, typename Lambda > +__launch_bounds__(block_size) __global__ void poly_gemmver_234_lam(Index_type n, Lambda body) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { body(i); } @@ -150,22 +163,23 @@ void POLYBENCH_GEMVER::runCudaVariantImpl(VariantID vid) GEMVER_THREADS_PER_BLOCK_CUDA; GEMVER_NBLOCKS_CUDA; - poly_gemmver_1<<>>(A, u1, v1, u2, v2, + poly_gemmver_1 + <<>>(A, u1, v1, u2, v2, n); cudaErrchk( cudaGetLastError() ); size_t grid_size = RAJA_DIVIDE_CEILING_INT(n, block_size); - poly_gemmver_2<<>>(A, x, y, + poly_gemmver_2<<>>(A, x, y, beta, n); cudaErrchk( cudaGetLastError() ); - poly_gemmver_3<<>>(x, z, + poly_gemmver_3<<>>(x, z, n); cudaErrchk( cudaGetLastError() ); - 
poly_gemmver_4<<>>(A, x, w, + poly_gemmver_4<<>>(A, x, w, alpha, n); cudaErrchk( cudaGetLastError() ); @@ -185,7 +199,8 @@ void POLYBENCH_GEMVER::runCudaVariantImpl(VariantID vid) GEMVER_THREADS_PER_BLOCK_CUDA; GEMVER_NBLOCKS_CUDA; - poly_gemmver_1_lam<<>>(n, + poly_gemmver_1_lam + <<>>(n, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_GEMVER_BODY1; } @@ -194,7 +209,7 @@ void POLYBENCH_GEMVER::runCudaVariantImpl(VariantID vid) size_t grid_size = RAJA_DIVIDE_CEILING_INT(n, block_size); - poly_gemmver_234_lam<<>>(n, + poly_gemmver_234_lam<<>>(n, [=] __device__ (Index_type i) { POLYBENCH_GEMVER_BODY2; for (Index_type j = 0; j < n; ++j) { @@ -205,14 +220,14 @@ void POLYBENCH_GEMVER::runCudaVariantImpl(VariantID vid) ); cudaErrchk( cudaGetLastError() ); - poly_gemmver_234_lam<<>>(n, + poly_gemmver_234_lam<<>>(n, [=] __device__ (Index_type i) { POLYBENCH_GEMVER_BODY5; } ); cudaErrchk( cudaGetLastError() ); - poly_gemmver_234_lam<<>>(n, + poly_gemmver_234_lam<<>>(n, [=] __device__ (Index_type i) { POLYBENCH_GEMVER_BODY6; for (Index_type j = 0; j < n; ++j) { diff --git a/src/polybench/POLYBENCH_GEMVER-Hip.cpp b/src/polybench/POLYBENCH_GEMVER-Hip.cpp index 11e4f0c85..f9236578e 100644 --- a/src/polybench/POLYBENCH_GEMVER-Hip.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Hip.cpp @@ -27,8 +27,11 @@ namespace polybench #define j_block_sz (32) #define i_block_sz (block_size / j_block_sz) +#define GEMVER_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \ + j_block_sz, i_block_sz + #define GEMVER_THREADS_PER_BLOCK_HIP \ - dim3 nthreads_per_block1(j_block_sz, i_block_sz, 1); + dim3 nthreads_per_block1(GEMVER_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP, 1); #define GEMVER_NBLOCKS_HIP \ dim3 nblocks1(static_cast(RAJA_DIVIDE_CEILING_INT(n, j_block_sz)), \ @@ -60,36 +63,41 @@ namespace polybench deallocHipDeviceData(y); \ deallocHipDeviceData(z); +template < size_t j_block_size, size_t i_block_size > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_gemmver_1(Real_ptr A, Real_ptr u1, Real_ptr v1, Real_ptr u2, Real_ptr v2, Index_type n) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if (i < n && j < n) { POLYBENCH_GEMVER_BODY1; } } -template< typename Lambda > +template < size_t j_block_size, size_t i_block_size, typename Lambda > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_gemmver_1_lam(Index_type n, Lambda body) { - Index_type i = blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.y * i_block_size + threadIdx.y; + Index_type j = blockIdx.x * j_block_size + threadIdx.x; if (i < n && j < n) { body(i, j); } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_gemmver_2(Real_ptr A, Real_ptr x, Real_ptr y, Real_type beta, Index_type n) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { POLYBENCH_GEMVER_BODY2; for (Index_type j = 0; j < n; ++j) { @@ -99,21 +107,25 @@ __global__ void poly_gemmver_2(Real_ptr A, } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_gemmver_3(Real_ptr x, Real_ptr z, Index_type n) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { POLYBENCH_GEMVER_BODY5; } } +template < size_t block_size > 
+__launch_bounds__(block_size) __global__ void poly_gemmver_4(Real_ptr A, Real_ptr x, Real_ptr w, Real_type alpha, Index_type n) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { POLYBENCH_GEMVER_BODY6; for (Index_type j = 0; j < n; ++j) { @@ -123,10 +135,11 @@ __global__ void poly_gemmver_4(Real_ptr A, } } -template< typename Lambda > +template < size_t block_size, typename Lambda > +__launch_bounds__(block_size) __global__ void poly_gemmver_234_lam(Index_type n, Lambda body) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { body(i); } @@ -150,24 +163,24 @@ void POLYBENCH_GEMVER::runHipVariantImpl(VariantID vid) GEMVER_THREADS_PER_BLOCK_HIP; GEMVER_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_gemmver_1), + hipLaunchKernelGGL((poly_gemmver_1), dim3(nblocks1), dim3(nthreads_per_block1), 0, 0, A, u1, v1, u2, v2, n); hipErrchk( hipGetLastError() ); size_t grid_size = RAJA_DIVIDE_CEILING_INT(m_n, block_size); - hipLaunchKernelGGL((poly_gemmver_2), + hipLaunchKernelGGL((poly_gemmver_2), dim3(grid_size), dim3(block_size), 0, 0, A, x, y, beta, n); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_gemmver_3), + hipLaunchKernelGGL((poly_gemmver_3), dim3(grid_size), dim3(block_size), 0, 0, x, z, n); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_gemmver_4), + hipLaunchKernelGGL((poly_gemmver_4), dim3(grid_size), dim3(block_size), 0, 0, A, x, w, alpha, n); hipErrchk( hipGetLastError() ); @@ -191,7 +204,7 @@ void POLYBENCH_GEMVER::runHipVariantImpl(VariantID vid) POLYBENCH_GEMVER_BODY1; }; - hipLaunchKernelGGL(poly_gemmver_1_lam, + hipLaunchKernelGGL((poly_gemmver_1_lam), dim3(nblocks1), dim3(nthreads_per_block1), 0, 0, n, poly_gemmver_1_lambda); hipErrchk( hipGetLastError() ); @@ -206,7 +219,7 @@ void POLYBENCH_GEMVER::runHipVariantImpl(VariantID vid) POLYBENCH_GEMVER_BODY4; }; - hipLaunchKernelGGL(poly_gemmver_234_lam, + hipLaunchKernelGGL((poly_gemmver_234_lam), dim3(grid_size), dim3(block_size), 0, 0, n, poly_gemmver_2_lambda); hipErrchk( hipGetLastError() ); @@ -215,7 +228,7 @@ void POLYBENCH_GEMVER::runHipVariantImpl(VariantID vid) POLYBENCH_GEMVER_BODY5; }; - hipLaunchKernelGGL(poly_gemmver_234_lam, + hipLaunchKernelGGL((poly_gemmver_234_lam), dim3(grid_size), dim3(block_size), 0, 0, n, poly_gemmver_3_lambda); hipErrchk( hipGetLastError() ); @@ -228,7 +241,7 @@ void POLYBENCH_GEMVER::runHipVariantImpl(VariantID vid) POLYBENCH_GEMVER_BODY8; }; - hipLaunchKernelGGL(poly_gemmver_234_lam, + hipLaunchKernelGGL((poly_gemmver_234_lam), dim3(grid_size), dim3(block_size), 0, 0, n, poly_gemmver_4_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp b/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp index b53c89ed4..69063cdda 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp @@ -36,12 +36,14 @@ namespace polybench deallocCudaDeviceData(B); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_gesummv(Real_ptr x, Real_ptr y, Real_ptr A, Real_ptr B, Real_type alpha, Real_type beta, Index_type N) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < N) { POLYBENCH_GESUMMV_BODY1; @@ -69,7 +71,7 @@ void POLYBENCH_GESUMMV::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); - poly_gesummv<<>>(x, y, + 
poly_gesummv<<>>(x, y, A, B, alpha, beta, N); diff --git a/src/polybench/POLYBENCH_GESUMMV-Hip.cpp b/src/polybench/POLYBENCH_GESUMMV-Hip.cpp index 461923cef..4c47bb01b 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Hip.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Hip.cpp @@ -36,12 +36,14 @@ namespace polybench deallocHipDeviceData(B); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_gesummv(Real_ptr x, Real_ptr y, Real_ptr A, Real_ptr B, Real_type alpha, Real_type beta, Index_type N) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < N) { POLYBENCH_GESUMMV_BODY1; @@ -69,7 +71,7 @@ void POLYBENCH_GESUMMV::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); - hipLaunchKernelGGL((poly_gesummv), + hipLaunchKernelGGL((poly_gesummv), dim3(grid_size), dim3(block_size),0,0, x, y, A, B, diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp index 7bdc30d91..61238900a 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp @@ -33,18 +33,22 @@ namespace polybench deallocCudaDeviceData(B); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_jacobi_1D_1(Real_ptr A, Real_ptr B, Index_type N) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i > 0 && i < N-1) { POLYBENCH_JACOBI_1D_BODY1; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_jacobi_1D_2(Real_ptr A, Real_ptr B, Index_type N) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i > 0 && i < N-1) { POLYBENCH_JACOBI_1D_BODY2; @@ -70,10 +74,10 @@ void POLYBENCH_JACOBI_1D::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); - poly_jacobi_1D_1<<>>(A, B, N); + poly_jacobi_1D_1<<>>(A, B, N); cudaErrchk( cudaGetLastError() ); - poly_jacobi_1D_2<<>>(A, B, N); + poly_jacobi_1D_2<<>>(A, B, N); cudaErrchk( cudaGetLastError() ); } diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp index ccdf47134..c3414561d 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp @@ -33,18 +33,22 @@ namespace polybench deallocHipDeviceData(B); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_jacobi_1D_1(Real_ptr A, Real_ptr B, Index_type N) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i > 0 && i < N-1) { POLYBENCH_JACOBI_1D_BODY1; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_jacobi_1D_2(Real_ptr A, Real_ptr B, Index_type N) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i > 0 && i < N-1) { POLYBENCH_JACOBI_1D_BODY2; @@ -70,11 +74,11 @@ void POLYBENCH_JACOBI_1D::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); - hipLaunchKernelGGL((poly_jacobi_1D_1), dim3(grid_size), dim3(block_size), 0, 0, + hipLaunchKernelGGL((poly_jacobi_1D_1), dim3(grid_size), dim3(block_size), 0, 0, A, B, N); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_jacobi_1D_2), dim3(grid_size), dim3(block_size), 0, 0, + hipLaunchKernelGGL((poly_jacobi_1D_2), 
dim3(grid_size), dim3(block_size), 0, 0, A, B, N); hipErrchk( hipGetLastError() ); diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp index d46d0ec22..ea85e2222 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp @@ -27,8 +27,11 @@ namespace polybench #define j_block_sz (32) #define i_block_sz (block_size / j_block_sz) +#define JACOBI_2D_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \ + j_block_sz, i_block_sz + #define JACOBI_2D_THREADS_PER_BLOCK_CUDA \ - dim3 nthreads_per_block(j_block_sz, i_block_sz, 1); + dim3 nthreads_per_block(JACOBI_2D_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA, 1); #define JACOBI_2D_NBLOCKS_CUDA \ dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(N-2, j_block_sz)), \ @@ -48,31 +51,36 @@ namespace polybench deallocCudaDeviceData(B); +template < size_t j_block_size, size_t i_block_size > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_jacobi_2D_1(Real_ptr A, Real_ptr B, Index_type N) { - Index_type i = 1 + blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = 1 + blockIdx.y * i_block_size + threadIdx.y; + Index_type j = 1 + blockIdx.x * j_block_size + threadIdx.x; if ( i < N-1 && j < N-1 ) { POLYBENCH_JACOBI_2D_BODY1; } } +template < size_t j_block_size, size_t i_block_size > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_jacobi_2D_2(Real_ptr A, Real_ptr B, Index_type N) { - Index_type i = 1 + blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = 1 + blockIdx.y * i_block_size + threadIdx.y; + Index_type j = 1 + blockIdx.x * j_block_size + threadIdx.x; if ( i < N-1 && j < N-1 ) { POLYBENCH_JACOBI_2D_BODY2; } } -template< typename Lambda > +template < size_t j_block_size, size_t i_block_size, typename Lambda > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_jacobi_2D_lam(Index_type N, Lambda body) { - Index_type i = 1 + blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = 1 + blockIdx.y * i_block_size + threadIdx.y; + Index_type j = 1 + blockIdx.x * j_block_size + threadIdx.x; if ( i < N-1 && j < N-1 ) { body(i, j); @@ -99,10 +107,12 @@ void POLYBENCH_JACOBI_2D::runCudaVariantImpl(VariantID vid) JACOBI_2D_THREADS_PER_BLOCK_CUDA; JACOBI_2D_NBLOCKS_CUDA; - poly_jacobi_2D_1<<>>(A, B, N); + poly_jacobi_2D_1 + <<>>(A, B, N); cudaErrchk( cudaGetLastError() ); - poly_jacobi_2D_2<<>>(A, B, N); + poly_jacobi_2D_2 + <<>>(A, B, N); cudaErrchk( cudaGetLastError() ); } @@ -124,14 +134,16 @@ void POLYBENCH_JACOBI_2D::runCudaVariantImpl(VariantID vid) JACOBI_2D_THREADS_PER_BLOCK_CUDA; JACOBI_2D_NBLOCKS_CUDA; - poly_jacobi_2D_lam<<>>(N, + poly_jacobi_2D_lam + <<>>(N, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_JACOBI_2D_BODY1; } ); cudaErrchk( cudaGetLastError() ); - poly_jacobi_2D_lam<<>>(N, + poly_jacobi_2D_lam + <<>>(N, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_JACOBI_2D_BODY2; } diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp index 8c0408b7e..ffa45f562 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp @@ -27,8 +27,11 @@ namespace polybench #define j_block_sz (32) #define i_block_sz (block_size / j_block_sz) +#define JACOBI_2D_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \ + j_block_sz, i_block_sz + #define 
JACOBI_2D_THREADS_PER_BLOCK_HIP \ - dim3 nthreads_per_block(j_block_sz, i_block_sz, 1); + dim3 nthreads_per_block(JACOBI_2D_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP, 1); #define JACOBI_2D_NBLOCKS_HIP \ dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(N-2, j_block_sz)), \ @@ -48,31 +51,36 @@ namespace polybench deallocHipDeviceData(B); +template < size_t j_block_size, size_t i_block_size > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_jacobi_2D_1(Real_ptr A, Real_ptr B, Index_type N) { - Index_type i = 1 + blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = 1 + blockIdx.y * i_block_size + threadIdx.y; + Index_type j = 1 + blockIdx.x * j_block_size + threadIdx.x; if ( i < N-1 && j < N-1 ) { POLYBENCH_JACOBI_2D_BODY1; } } +template < size_t j_block_size, size_t i_block_size > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_jacobi_2D_2(Real_ptr A, Real_ptr B, Index_type N) { - Index_type i = 1 + blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = 1 + blockIdx.y * i_block_size + threadIdx.y; + Index_type j = 1 + blockIdx.x * j_block_size + threadIdx.x; if ( i < N-1 && j < N-1 ) { POLYBENCH_JACOBI_2D_BODY2; } } -template< typename Lambda > +template < size_t j_block_size, size_t i_block_size, typename Lambda > +__launch_bounds__(j_block_size*i_block_size) __global__ void poly_jacobi_2D_lam(Index_type N, Lambda body) { - Index_type i = 1 + blockIdx.y * blockDim.y + threadIdx.y; - Index_type j = 1 + blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = 1 + blockIdx.y * i_block_size + threadIdx.y; + Index_type j = 1 + blockIdx.x * j_block_size + threadIdx.x; if ( i < N-1 && j < N-1 ) { body(i, j); @@ -99,12 +107,12 @@ void POLYBENCH_JACOBI_2D::runHipVariantImpl(VariantID vid) JACOBI_2D_THREADS_PER_BLOCK_HIP; JACOBI_2D_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_jacobi_2D_1), + hipLaunchKernelGGL((poly_jacobi_2D_1), dim3(nblocks), dim3(nthreads_per_block), 0, 0, A, B, N); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_jacobi_2D_2), + hipLaunchKernelGGL((poly_jacobi_2D_2), dim3(nblocks), dim3(nthreads_per_block), 0, 0, A, B, N); hipErrchk( hipGetLastError() ); @@ -133,7 +141,7 @@ void POLYBENCH_JACOBI_2D::runHipVariantImpl(VariantID vid) POLYBENCH_JACOBI_2D_BODY1; }; - hipLaunchKernelGGL((poly_jacobi_2D_lam), + hipLaunchKernelGGL((poly_jacobi_2D_lam), dim3(nblocks), dim3(nthreads_per_block), 0, 0, N, poly_jacobi_2D_1_lambda); hipErrchk( hipGetLastError() ); @@ -143,7 +151,7 @@ void POLYBENCH_JACOBI_2D::runHipVariantImpl(VariantID vid) POLYBENCH_JACOBI_2D_BODY2; }; - hipLaunchKernelGGL((poly_jacobi_2D_lam), + hipLaunchKernelGGL((poly_jacobi_2D_lam), dim3(nblocks), dim3(nthreads_per_block), 0, 0, N, poly_jacobi_2D_2_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/polybench/POLYBENCH_MVT-Cuda.cpp b/src/polybench/POLYBENCH_MVT-Cuda.cpp index b8900c23c..169444a27 100644 --- a/src/polybench/POLYBENCH_MVT-Cuda.cpp +++ b/src/polybench/POLYBENCH_MVT-Cuda.cpp @@ -39,10 +39,12 @@ namespace polybench deallocCudaDeviceData(A); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_mvt_1(Real_ptr A, Real_ptr x1, Real_ptr y1, Index_type N) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < N) { POLYBENCH_MVT_BODY1; @@ -53,10 +55,12 @@ __global__ void poly_mvt_1(Real_ptr A, Real_ptr x1, Real_ptr y1, } } +template < size_t block_size > 
+__launch_bounds__(block_size) __global__ void poly_mvt_2(Real_ptr A, Real_ptr x2, Real_ptr y2, Index_type N) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < N) { POLYBENCH_MVT_BODY4; @@ -84,10 +88,10 @@ void POLYBENCH_MVT::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); - poly_mvt_1<<>>(A, x1, y1, N); + poly_mvt_1<<>>(A, x1, y1, N); cudaErrchk( cudaGetLastError() ); - poly_mvt_2<<>>(A, x2, y2, N); + poly_mvt_2<<>>(A, x2, y2, N); cudaErrchk( cudaGetLastError() ); } diff --git a/src/polybench/POLYBENCH_MVT-Hip.cpp b/src/polybench/POLYBENCH_MVT-Hip.cpp index 4412c0966..37d92a69f 100644 --- a/src/polybench/POLYBENCH_MVT-Hip.cpp +++ b/src/polybench/POLYBENCH_MVT-Hip.cpp @@ -39,10 +39,12 @@ namespace polybench deallocHipDeviceData(A); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_mvt_1(Real_ptr A, Real_ptr x1, Real_ptr y1, Index_type N) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < N) { POLYBENCH_MVT_BODY1; @@ -53,10 +55,12 @@ __global__ void poly_mvt_1(Real_ptr A, Real_ptr x1, Real_ptr y1, } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void poly_mvt_2(Real_ptr A, Real_ptr x2, Real_ptr y2, Index_type N) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < N) { POLYBENCH_MVT_BODY4; @@ -84,12 +88,12 @@ void POLYBENCH_MVT::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); - hipLaunchKernelGGL((poly_mvt_1), + hipLaunchKernelGGL((poly_mvt_1), dim3(grid_size), dim3(block_size), 0, 0, A, x1, y1, N); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_mvt_2), + hipLaunchKernelGGL((poly_mvt_2), dim3(grid_size), dim3(block_size), 0, 0, A, x2, y2, N); hipErrchk( hipGetLastError() ); From 5bf2e240de580edad82e8f874b1e53e43acdc9ff Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 25 Oct 2021 13:15:28 -0700 Subject: [PATCH 150/392] Use per kernel default block size instead of first Use the default block size for each kernel if available instead of the first in the list of block sizes --- src/apps/DEL_DOT_VEC_2D.cpp | 2 +- src/apps/ENERGY.cpp | 2 +- src/apps/FIR.cpp | 2 +- src/apps/HALOEXCHANGE.cpp | 2 +- src/apps/HALOEXCHANGE_FUSED.cpp | 2 +- src/apps/LTIMES.cpp | 2 +- src/apps/LTIMES_NOVIEW.cpp | 2 +- src/apps/PRESSURE.cpp | 2 +- src/apps/VOL3D.cpp | 2 +- src/basic/DAXPY.cpp | 2 +- src/basic/IF_QUAD.cpp | 2 +- src/basic/INIT3.cpp | 2 +- src/basic/INIT_VIEW1D.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET.cpp | 2 +- src/basic/MULADDSUB.cpp | 2 +- src/basic/NESTED_INIT.cpp | 2 +- src/basic/PI_ATOMIC.cpp | 2 +- src/basic/PI_REDUCE.cpp | 2 +- src/basic/REDUCE3_INT.cpp | 2 +- src/basic/TRAP_INT.cpp | 2 +- src/common/GPUUtils.hpp | 7 +++++-- src/lcals/DIFF_PREDICT.cpp | 2 +- src/lcals/EOS.cpp | 2 +- src/lcals/FIRST_DIFF.cpp | 2 +- src/lcals/FIRST_MIN.cpp | 2 +- src/lcals/FIRST_SUM.cpp | 2 +- src/lcals/GEN_LIN_RECUR.cpp | 2 +- src/lcals/HYDRO_1D.cpp | 2 +- src/lcals/HYDRO_2D.cpp | 2 +- src/lcals/INT_PREDICT.cpp | 2 +- src/lcals/PLANCKIAN.cpp | 2 +- src/lcals/TRIDIAG_ELIM.cpp | 2 +- src/polybench/POLYBENCH_2MM.cpp | 2 +- src/polybench/POLYBENCH_3MM.cpp | 2 +- src/polybench/POLYBENCH_ADI.cpp | 2 +- src/polybench/POLYBENCH_ATAX.cpp | 2 +- src/polybench/POLYBENCH_FDTD_2D.cpp | 2 +- src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp | 2 +- 
src/polybench/POLYBENCH_GEMM.cpp | 2 +- src/polybench/POLYBENCH_GEMVER.cpp | 2 +- src/polybench/POLYBENCH_GESUMMV.cpp | 2 +- src/polybench/POLYBENCH_HEAT_3D.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_1D.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_2D.cpp | 2 +- src/polybench/POLYBENCH_MVT.cpp | 2 +- src/stream/ADD.cpp | 2 +- src/stream/COPY.cpp | 2 +- src/stream/DOT.cpp | 2 +- src/stream/MUL.cpp | 2 +- src/stream/TRIAD.cpp | 2 +- 50 files changed, 54 insertions(+), 51 deletions(-) diff --git a/src/apps/DEL_DOT_VEC_2D.cpp b/src/apps/DEL_DOT_VEC_2D.cpp index 992482b06..2c332f7ad 100644 --- a/src/apps/DEL_DOT_VEC_2D.cpp +++ b/src/apps/DEL_DOT_VEC_2D.cpp @@ -25,7 +25,7 @@ namespace apps DEL_DOT_VEC_2D::DEL_DOT_VEC_2D(const RunParams& params) : KernelBase(rajaperf::Apps_DEL_DOT_VEC_2D, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/apps/ENERGY.cpp b/src/apps/ENERGY.cpp index 82ba4918e..7d481f1ca 100644 --- a/src/apps/ENERGY.cpp +++ b/src/apps/ENERGY.cpp @@ -21,7 +21,7 @@ namespace apps ENERGY::ENERGY(const RunParams& params) : KernelBase(rajaperf::Apps_ENERGY, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/apps/FIR.cpp b/src/apps/FIR.cpp index f7976b0f0..d28e3c587 100644 --- a/src/apps/FIR.cpp +++ b/src/apps/FIR.cpp @@ -21,7 +21,7 @@ namespace apps FIR::FIR(const RunParams& params) : KernelBase(rajaperf::Apps_FIR, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/apps/HALOEXCHANGE.cpp b/src/apps/HALOEXCHANGE.cpp index 55109ad65..be1f12a3b 100644 --- a/src/apps/HALOEXCHANGE.cpp +++ b/src/apps/HALOEXCHANGE.cpp @@ -44,7 +44,7 @@ void destroy_unpack_lists(std::vector& unpack_index_lists, HALOEXCHANGE::HALOEXCHANGE(const RunParams& params) : KernelBase(rajaperf::Apps_HALOEXCHANGE, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/apps/HALOEXCHANGE_FUSED.cpp b/src/apps/HALOEXCHANGE_FUSED.cpp index 71ad45b49..89fd84a38 100644 --- a/src/apps/HALOEXCHANGE_FUSED.cpp +++ b/src/apps/HALOEXCHANGE_FUSED.cpp @@ -44,7 +44,7 @@ void destroy_unpack_lists(std::vector& unpack_index_lists, HALOEXCHANGE_FUSED::HALOEXCHANGE_FUSED(const RunParams& params) : KernelBase(rajaperf::Apps_HALOEXCHANGE_FUSED, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/apps/LTIMES.cpp b/src/apps/LTIMES.cpp index 886c7f24e..3ef94b4fd 100644 --- a/src/apps/LTIMES.cpp +++ b/src/apps/LTIMES.cpp @@ -23,7 +23,7 @@ namespace apps LTIMES::LTIMES(const RunParams& params) : KernelBase(rajaperf::Apps_LTIMES, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/apps/LTIMES_NOVIEW.cpp b/src/apps/LTIMES_NOVIEW.cpp index da8d7e9e6..9d3d000e3 100644 --- a/src/apps/LTIMES_NOVIEW.cpp +++ b/src/apps/LTIMES_NOVIEW.cpp @@ -23,7 +23,7 @@ namespace apps LTIMES_NOVIEW::LTIMES_NOVIEW(const RunParams& params) : KernelBase(rajaperf::Apps_LTIMES_NOVIEW, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/apps/PRESSURE.cpp b/src/apps/PRESSURE.cpp index 6793fb7fb..3d6e34372 100644 --- a/src/apps/PRESSURE.cpp +++ b/src/apps/PRESSURE.cpp @@ -21,7 +21,7 @@ namespace apps PRESSURE::PRESSURE(const RunParams& params) : KernelBase(rajaperf::Apps_PRESSURE, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/apps/VOL3D.cpp b/src/apps/VOL3D.cpp index df532ef70..db2c59b32 100644 --- a/src/apps/VOL3D.cpp +++ b/src/apps/VOL3D.cpp @@ -25,7 +25,7 @@ namespace apps VOL3D::VOL3D(const RunParams& params) : KernelBase(rajaperf::Apps_VOL3D, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/basic/DAXPY.cpp b/src/basic/DAXPY.cpp index 06ad393eb..b04c70d23 100644 --- a/src/basic/DAXPY.cpp +++ b/src/basic/DAXPY.cpp @@ -21,7 +21,7 @@ namespace basic DAXPY::DAXPY(const RunParams& params) : KernelBase(rajaperf::Basic_DAXPY, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/basic/IF_QUAD.cpp b/src/basic/IF_QUAD.cpp index d8a016375..ec1c81fc3 100644 --- a/src/basic/IF_QUAD.cpp +++ b/src/basic/IF_QUAD.cpp @@ -21,7 +21,7 @@ namespace basic IF_QUAD::IF_QUAD(const RunParams& params) : KernelBase(rajaperf::Basic_IF_QUAD, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/basic/INIT3.cpp b/src/basic/INIT3.cpp index 94be0f796..9cc6bf063 100644 --- a/src/basic/INIT3.cpp +++ b/src/basic/INIT3.cpp @@ -21,7 +21,7 @@ namespace basic INIT3::INIT3(const RunParams& params) : KernelBase(rajaperf::Basic_INIT3, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/basic/INIT_VIEW1D.cpp b/src/basic/INIT_VIEW1D.cpp index ab59d0c8b..f6e257058 100644 --- a/src/basic/INIT_VIEW1D.cpp +++ b/src/basic/INIT_VIEW1D.cpp @@ -21,7 +21,7 @@ namespace basic INIT_VIEW1D::INIT_VIEW1D(const RunParams& params) : KernelBase(rajaperf::Basic_INIT_VIEW1D, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/basic/INIT_VIEW1D_OFFSET.cpp b/src/basic/INIT_VIEW1D_OFFSET.cpp index fb53cbcdb..74f4e68be 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET.cpp @@ -21,7 +21,7 @@ namespace basic INIT_VIEW1D_OFFSET::INIT_VIEW1D_OFFSET(const RunParams& params) : KernelBase(rajaperf::Basic_INIT_VIEW1D_OFFSET, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/basic/MULADDSUB.cpp b/src/basic/MULADDSUB.cpp index 41f6c42f9..7431a634c 100644 --- a/src/basic/MULADDSUB.cpp +++ b/src/basic/MULADDSUB.cpp @@ -21,7 +21,7 @@ namespace basic MULADDSUB::MULADDSUB(const RunParams& params) : KernelBase(rajaperf::Basic_MULADDSUB, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/basic/NESTED_INIT.cpp b/src/basic/NESTED_INIT.cpp index 5dff01b8a..472242480 100644 --- a/src/basic/NESTED_INIT.cpp +++ b/src/basic/NESTED_INIT.cpp @@ -24,7 +24,7 @@ namespace basic NESTED_INIT::NESTED_INIT(const RunParams& params) : KernelBase(rajaperf::Basic_NESTED_INIT, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/basic/PI_ATOMIC.cpp b/src/basic/PI_ATOMIC.cpp index 019664280..f4e52ec8f 100644 --- a/src/basic/PI_ATOMIC.cpp +++ b/src/basic/PI_ATOMIC.cpp @@ -21,7 +21,7 @@ namespace basic PI_ATOMIC::PI_ATOMIC(const RunParams& params) : KernelBase(rajaperf::Basic_PI_ATOMIC, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/basic/PI_REDUCE.cpp b/src/basic/PI_REDUCE.cpp index 7e31227a7..af0ded096 100644 --- a/src/basic/PI_REDUCE.cpp +++ b/src/basic/PI_REDUCE.cpp @@ -21,7 +21,7 @@ namespace basic PI_REDUCE::PI_REDUCE(const RunParams& params) : KernelBase(rajaperf::Basic_PI_REDUCE, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/basic/REDUCE3_INT.cpp b/src/basic/REDUCE3_INT.cpp index 53eb4670c..7b6be51ce 100644 --- a/src/basic/REDUCE3_INT.cpp +++ b/src/basic/REDUCE3_INT.cpp @@ -23,7 +23,7 @@ namespace basic REDUCE3_INT::REDUCE3_INT(const RunParams& params) : KernelBase(rajaperf::Basic_REDUCE3_INT, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/basic/TRAP_INT.cpp b/src/basic/TRAP_INT.cpp index 5db4f8cd1..b6c31aada 100644 --- a/src/basic/TRAP_INT.cpp +++ b/src/basic/TRAP_INT.cpp @@ -21,7 +21,7 @@ namespace basic TRAP_INT::TRAP_INT(const RunParams& params) : KernelBase(rajaperf::Basic_TRAP_INT, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp index 69d933e5f..07eff4e8f 100644 --- a/src/common/GPUUtils.hpp +++ b/src/common/GPUUtils.hpp @@ -123,10 +123,13 @@ struct RunHipBlockSize VariantID m_vid; }; -// return the first integer in the int_seq +// return default_I if it is in sizes or the first integer in sizes otherwise template < size_t I, size_t... Is > -size_t get_first(camp::int_seq) +size_t get_default_or_first(size_t default_I, camp::int_seq sizes) { + if (invoke_or(Equals(default_I), sizes)) { + return default_I; + } return I; } diff --git a/src/lcals/DIFF_PREDICT.cpp b/src/lcals/DIFF_PREDICT.cpp index 42313cda0..2f13f36cb 100644 --- a/src/lcals/DIFF_PREDICT.cpp +++ b/src/lcals/DIFF_PREDICT.cpp @@ -20,7 +20,7 @@ namespace lcals DIFF_PREDICT::DIFF_PREDICT(const RunParams& params) : KernelBase(rajaperf::Lcals_DIFF_PREDICT, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/lcals/EOS.cpp b/src/lcals/EOS.cpp index abd555e89..033a1a4fb 100644 --- a/src/lcals/EOS.cpp +++ b/src/lcals/EOS.cpp @@ -21,7 +21,7 @@ namespace lcals EOS::EOS(const RunParams& params) : KernelBase(rajaperf::Lcals_EOS, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/lcals/FIRST_DIFF.cpp b/src/lcals/FIRST_DIFF.cpp index aad34c42b..f9a62b683 100644 --- a/src/lcals/FIRST_DIFF.cpp +++ b/src/lcals/FIRST_DIFF.cpp @@ -21,7 +21,7 @@ namespace lcals FIRST_DIFF::FIRST_DIFF(const RunParams& params) : KernelBase(rajaperf::Lcals_FIRST_DIFF, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/lcals/FIRST_MIN.cpp b/src/lcals/FIRST_MIN.cpp index 3546853a7..42bdae851 100644 --- a/src/lcals/FIRST_MIN.cpp +++ b/src/lcals/FIRST_MIN.cpp @@ -21,7 +21,7 @@ namespace lcals FIRST_MIN::FIRST_MIN(const RunParams& params) : KernelBase(rajaperf::Lcals_FIRST_MIN, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/lcals/FIRST_SUM.cpp b/src/lcals/FIRST_SUM.cpp index 38eff5416..46b2817f6 100644 --- a/src/lcals/FIRST_SUM.cpp +++ b/src/lcals/FIRST_SUM.cpp @@ -21,7 +21,7 @@ namespace lcals FIRST_SUM::FIRST_SUM(const RunParams& params) : KernelBase(rajaperf::Lcals_FIRST_SUM, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/lcals/GEN_LIN_RECUR.cpp b/src/lcals/GEN_LIN_RECUR.cpp index 215bd3389..9ef868efb 100644 --- a/src/lcals/GEN_LIN_RECUR.cpp +++ b/src/lcals/GEN_LIN_RECUR.cpp @@ -21,7 +21,7 @@ namespace lcals GEN_LIN_RECUR::GEN_LIN_RECUR(const RunParams& params) : KernelBase(rajaperf::Lcals_GEN_LIN_RECUR, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/lcals/HYDRO_1D.cpp b/src/lcals/HYDRO_1D.cpp index faf27fc7e..6b757ab57 100644 --- a/src/lcals/HYDRO_1D.cpp +++ b/src/lcals/HYDRO_1D.cpp @@ -21,7 +21,7 @@ namespace lcals HYDRO_1D::HYDRO_1D(const RunParams& params) : KernelBase(rajaperf::Lcals_HYDRO_1D, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/lcals/HYDRO_2D.cpp b/src/lcals/HYDRO_2D.cpp index f1b2347f8..8f3ccc37a 100644 --- a/src/lcals/HYDRO_2D.cpp +++ b/src/lcals/HYDRO_2D.cpp @@ -24,7 +24,7 @@ namespace lcals HYDRO_2D::HYDRO_2D(const RunParams& params) : KernelBase(rajaperf::Lcals_HYDRO_2D, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/lcals/INT_PREDICT.cpp b/src/lcals/INT_PREDICT.cpp index 898866e90..d828a6cf3 100644 --- a/src/lcals/INT_PREDICT.cpp +++ b/src/lcals/INT_PREDICT.cpp @@ -21,7 +21,7 @@ namespace lcals INT_PREDICT::INT_PREDICT(const RunParams& params) : KernelBase(rajaperf::Lcals_INT_PREDICT, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/lcals/PLANCKIAN.cpp b/src/lcals/PLANCKIAN.cpp index 683eec841..84171c2b4 100644 --- a/src/lcals/PLANCKIAN.cpp +++ b/src/lcals/PLANCKIAN.cpp @@ -21,7 +21,7 @@ namespace lcals PLANCKIAN::PLANCKIAN(const RunParams& params) : KernelBase(rajaperf::Lcals_PLANCKIAN, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/lcals/TRIDIAG_ELIM.cpp b/src/lcals/TRIDIAG_ELIM.cpp index 872e15703..9b146904e 100644 --- a/src/lcals/TRIDIAG_ELIM.cpp +++ b/src/lcals/TRIDIAG_ELIM.cpp @@ -21,7 +21,7 @@ namespace lcals TRIDIAG_ELIM::TRIDIAG_ELIM(const RunParams& params) : KernelBase(rajaperf::Lcals_TRIDIAG_ELIM, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/polybench/POLYBENCH_2MM.cpp b/src/polybench/POLYBENCH_2MM.cpp index 367de1871..12659b42a 100644 --- a/src/polybench/POLYBENCH_2MM.cpp +++ b/src/polybench/POLYBENCH_2MM.cpp @@ -22,7 +22,7 @@ namespace polybench POLYBENCH_2MM::POLYBENCH_2MM(const RunParams& params) : KernelBase(rajaperf::Polybench_2MM, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/polybench/POLYBENCH_3MM.cpp b/src/polybench/POLYBENCH_3MM.cpp index 10bf9e6fb..b3e12c4a3 100644 --- a/src/polybench/POLYBENCH_3MM.cpp +++ b/src/polybench/POLYBENCH_3MM.cpp @@ -23,7 +23,7 @@ namespace polybench POLYBENCH_3MM::POLYBENCH_3MM(const RunParams& params) : KernelBase(rajaperf::Polybench_3MM, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/polybench/POLYBENCH_ADI.cpp b/src/polybench/POLYBENCH_ADI.cpp index 628c609ab..27a82550a 100644 --- a/src/polybench/POLYBENCH_ADI.cpp +++ b/src/polybench/POLYBENCH_ADI.cpp @@ -20,7 +20,7 @@ namespace polybench POLYBENCH_ADI::POLYBENCH_ADI(const RunParams& params) : KernelBase(rajaperf::Polybench_ADI, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/polybench/POLYBENCH_ATAX.cpp b/src/polybench/POLYBENCH_ATAX.cpp index de9e46fb8..525457d5e 100644 --- a/src/polybench/POLYBENCH_ATAX.cpp +++ b/src/polybench/POLYBENCH_ATAX.cpp @@ -21,7 +21,7 @@ namespace polybench POLYBENCH_ATAX::POLYBENCH_ATAX(const RunParams& params) : KernelBase(rajaperf::Polybench_ATAX, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/polybench/POLYBENCH_FDTD_2D.cpp b/src/polybench/POLYBENCH_FDTD_2D.cpp index 4eed13c96..0b0c86747 100644 --- a/src/polybench/POLYBENCH_FDTD_2D.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D.cpp @@ -24,7 +24,7 @@ namespace polybench POLYBENCH_FDTD_2D::POLYBENCH_FDTD_2D(const RunParams& params) : KernelBase(rajaperf::Polybench_FDTD_2D, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp index 27991d45b..e0cdd45bd 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp @@ -21,7 +21,7 @@ namespace polybench POLYBENCH_FLOYD_WARSHALL::POLYBENCH_FLOYD_WARSHALL(const RunParams& params) : KernelBase(rajaperf::Polybench_FLOYD_WARSHALL, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/polybench/POLYBENCH_GEMM.cpp b/src/polybench/POLYBENCH_GEMM.cpp index 4376ae8eb..8fc1ebc4c 100644 --- a/src/polybench/POLYBENCH_GEMM.cpp +++ b/src/polybench/POLYBENCH_GEMM.cpp @@ -21,7 +21,7 @@ namespace polybench POLYBENCH_GEMM::POLYBENCH_GEMM(const RunParams& params) : KernelBase(rajaperf::Polybench_GEMM, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/polybench/POLYBENCH_GEMVER.cpp b/src/polybench/POLYBENCH_GEMVER.cpp index 817946cb6..cdcc003c8 100644 --- a/src/polybench/POLYBENCH_GEMVER.cpp +++ b/src/polybench/POLYBENCH_GEMVER.cpp @@ -21,7 +21,7 @@ namespace polybench POLYBENCH_GEMVER::POLYBENCH_GEMVER(const RunParams& params) : KernelBase(rajaperf::Polybench_GEMVER, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/polybench/POLYBENCH_GESUMMV.cpp b/src/polybench/POLYBENCH_GESUMMV.cpp index deef5412e..5aace89ce 100644 --- a/src/polybench/POLYBENCH_GESUMMV.cpp +++ b/src/polybench/POLYBENCH_GESUMMV.cpp @@ -21,7 +21,7 @@ namespace polybench POLYBENCH_GESUMMV::POLYBENCH_GESUMMV(const RunParams& params) : KernelBase(rajaperf::Polybench_GESUMMV, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/polybench/POLYBENCH_HEAT_3D.cpp b/src/polybench/POLYBENCH_HEAT_3D.cpp index 126ec4ef7..4d74c5b5c 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D.cpp @@ -22,7 +22,7 @@ namespace polybench POLYBENCH_HEAT_3D::POLYBENCH_HEAT_3D(const RunParams& params) : KernelBase(rajaperf::Polybench_HEAT_3D, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/polybench/POLYBENCH_JACOBI_1D.cpp b/src/polybench/POLYBENCH_JACOBI_1D.cpp index 88204d1e1..1a4b5fbe4 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.cpp @@ -21,7 +21,7 @@ namespace polybench POLYBENCH_JACOBI_1D::POLYBENCH_JACOBI_1D(const RunParams& params) : KernelBase(rajaperf::Polybench_JACOBI_1D, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/polybench/POLYBENCH_JACOBI_2D.cpp b/src/polybench/POLYBENCH_JACOBI_2D.cpp index 1e9b5e396..68c6a348c 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.cpp @@ -21,7 +21,7 @@ namespace polybench POLYBENCH_JACOBI_2D::POLYBENCH_JACOBI_2D(const RunParams& params) : KernelBase(rajaperf::Polybench_JACOBI_2D, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/polybench/POLYBENCH_MVT.cpp b/src/polybench/POLYBENCH_MVT.cpp index 51ca483d5..de0b28a2b 100644 --- a/src/polybench/POLYBENCH_MVT.cpp +++ b/src/polybench/POLYBENCH_MVT.cpp @@ -21,7 +21,7 @@ namespace polybench POLYBENCH_MVT::POLYBENCH_MVT(const RunParams& params) : KernelBase(rajaperf::Polybench_MVT, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/stream/ADD.cpp b/src/stream/ADD.cpp index 5a58f976f..da7e1b443 100644 --- a/src/stream/ADD.cpp +++ b/src/stream/ADD.cpp @@ -21,7 +21,7 @@ namespace stream ADD::ADD(const RunParams& params) : KernelBase(rajaperf::Stream_ADD, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/stream/COPY.cpp b/src/stream/COPY.cpp index 6e05b68db..737642ca4 100644 --- a/src/stream/COPY.cpp +++ b/src/stream/COPY.cpp @@ -21,7 +21,7 @@ namespace stream COPY::COPY(const RunParams& params) : KernelBase(rajaperf::Stream_COPY, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/stream/DOT.cpp b/src/stream/DOT.cpp index 874e16477..2e0a29b1a 100644 --- a/src/stream/DOT.cpp +++ b/src/stream/DOT.cpp @@ -21,7 +21,7 @@ namespace stream DOT::DOT(const RunParams& params) : KernelBase(rajaperf::Stream_DOT, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/stream/MUL.cpp b/src/stream/MUL.cpp index d6e4b5404..e0c94dd8f 100644 --- a/src/stream/MUL.cpp +++ b/src/stream/MUL.cpp @@ -21,7 +21,7 @@ namespace stream MUL::MUL(const RunParams& params) : KernelBase(rajaperf::Stream_MUL, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() : getDefaultGPUBlockSize() ); diff --git a/src/stream/TRIAD.cpp b/src/stream/TRIAD.cpp index 4d9a4aece..dd0ae17fa 100644 --- a/src/stream/TRIAD.cpp +++ b/src/stream/TRIAD.cpp @@ -21,7 +21,7 @@ namespace stream TRIAD::TRIAD(const RunParams& params) : KernelBase(rajaperf::Stream_TRIAD, params) { - setDefaultGPUBlockSize( gpu_block_size::get_first(gpu_block_sizes_type()) ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); From 37e7a0e87d5bb70329e28c8150f1ff701be91c09 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 25 Oct 2021 13:16:57 -0700 Subject: [PATCH 151/392] Add minimal block size support to algorithm kernels They do not support multiple block sizes, but they will no longer run if the user specifies a non-default block size --- src/algorithm/SORT.cpp | 4 ++++ src/algorithm/SORT.hpp | 2 ++ src/algorithm/SORTPAIRS.cpp | 4 ++++ src/algorithm/SORTPAIRS.hpp | 2 ++ 4 files changed, 12 insertions(+) diff --git a/src/algorithm/SORT.cpp b/src/algorithm/SORT.cpp index d9d659482..db28e8cc4 100644 --- a/src/algorithm/SORT.cpp +++ b/src/algorithm/SORT.cpp @@ -21,6 +21,10 @@ namespace algorithm SORT::SORT(const RunParams& params) : KernelBase(rajaperf::Algorithm_SORT, params) { + setDefaultGPUBlockSize( default_gpu_block_size ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + setDefaultProblemSize(1000000); setDefaultReps(20); diff --git a/src/algorithm/SORT.hpp b/src/algorithm/SORT.hpp index f576bee97..690f7b778 100644 --- a/src/algorithm/SORT.hpp +++ b/src/algorithm/SORT.hpp @@ -56,6 +56,8 @@ class SORT : public KernelBase } private: + static const size_t default_gpu_block_size = 0; + Real_ptr m_x; }; diff --git a/src/algorithm/SORTPAIRS.cpp b/src/algorithm/SORTPAIRS.cpp index 7f2e59cbb..a4e7eabba 100644 --- a/src/algorithm/SORTPAIRS.cpp +++ b/src/algorithm/SORTPAIRS.cpp @@ -21,6 +21,10 @@ namespace algorithm SORTPAIRS::SORTPAIRS(const RunParams& params) : KernelBase(rajaperf::Algorithm_SORTPAIRS, params) { + setDefaultGPUBlockSize( default_gpu_block_size ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + setDefaultProblemSize(1000000); setDefaultReps(20); diff --git a/src/algorithm/SORTPAIRS.hpp b/src/algorithm/SORTPAIRS.hpp index b6f03005f..5e1c44d7c 100644 --- a/src/algorithm/SORTPAIRS.hpp +++ b/src/algorithm/SORTPAIRS.hpp @@ -55,6 +55,8 @@ class SORTPAIRS : public KernelBase } private: + static const size_t default_gpu_block_size = 0; + Real_ptr m_x; Real_ptr m_i; }; From 8ccccdfecb0a98afcf819d9b19dfb11e30c2dc11 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 25 Oct 2021 13:18:01 -0700 Subject: [PATCH 152/392] Add minimal block size support in teams kernels Just enough to have a non-zero block size and only run if the right block size is requested by the user.
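The recipe these two patches apply to each kernel is the same: record a compile-time default_gpu_block_size, record the "actual" block size as the user's requested size when one is given (otherwise the default), and skip the GPU variant when the requested size is not one the kernel can launch with. The following stand-alone C++ sketch illustrates that control flow; RunParams, KernelBase, TeamsKernel, and the Q1D constant here are simplified stand-ins chosen for illustration, not the suite's real types.

#include <cstddef>
#include <iostream>

struct RunParams {                       // stand-in: only the one accessor used here
  std::size_t gpu_block_size;            // 0 means "not requested on the command line"
  std::size_t getGPUBlockSize() const { return gpu_block_size; }
};

class KernelBase {                       // stand-in for the suite's block-size bookkeeping
public:
  void setDefaultGPUBlockSize(std::size_t s) { m_default = s; }
  void setActualGPUBlockSize(std::size_t s)  { m_actual = s; }
  std::size_t getDefaultGPUBlockSize() const { return m_default; }
  std::size_t getActualGPUBlockSize() const  { return m_actual; }
private:
  std::size_t m_default = 0;
  std::size_t m_actual = 0;
};

// A teams-style kernel whose launch shape is fixed at Q1D x Q1D threads per block.
class TeamsKernel : public KernelBase {
public:
  static constexpr std::size_t Q1D = 5;  // assumed quadrature size, illustration only
  static constexpr std::size_t default_gpu_block_size = Q1D * Q1D;

  explicit TeamsKernel(const RunParams& params) {
    setDefaultGPUBlockSize(default_gpu_block_size);
    setActualGPUBlockSize(params.getGPUBlockSize() > 0 ? params.getGPUBlockSize()
                                                       : getDefaultGPUBlockSize());
  }

  void runGpuVariant() {
    // "minimal" block-size support: only the default launch shape is implemented
    if (getActualGPUBlockSize() != default_gpu_block_size) {
      std::cout << "unsupported block size " << getActualGPUBlockSize() << ", not run\n";
      return;
    }
    std::cout << "launching with " << getActualGPUBlockSize() << " threads per block\n";
  }
};

int main() {
  RunParams defaulted{0};                 // no block size requested: runs with Q1D*Q1D = 25
  TeamsKernel(defaulted).runGpuVariant();

  RunParams forced{256};                  // user asks for 256: this kernel declines to run
  TeamsKernel(forced).runGpuVariant();
}

Keeping the guard inside each kernel rather than in the driver lets one suite run mix kernels that honor a requested block size with kernels that skip it, which is the behavior the two commit messages describe.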
--- src/apps/DIFFUSION3DPA.cpp | 4 ++++ src/apps/DIFFUSION3DPA.hpp | 1 + src/apps/MASS3DPA.cpp | 4 ++++ src/apps/MASS3DPA.hpp | 1 + src/basic/MAT_MAT_SHARED.cpp | 5 ++++- src/basic/MAT_MAT_SHARED.hpp | 2 ++ 6 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/apps/DIFFUSION3DPA.cpp b/src/apps/DIFFUSION3DPA.cpp index baf892576..a73e4bed0 100644 --- a/src/apps/DIFFUSION3DPA.cpp +++ b/src/apps/DIFFUSION3DPA.cpp @@ -23,6 +23,10 @@ namespace apps DIFFUSION3DPA::DIFFUSION3DPA(const RunParams& params) : KernelBase(rajaperf::Apps_DIFFUSION3DPA, params) { + setDefaultGPUBlockSize( default_gpu_block_size ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + m_NE_default = 15625; setDefaultProblemSize(m_NE_default*DPA_Q1D*DPA_Q1D*DPA_Q1D); diff --git a/src/apps/DIFFUSION3DPA.hpp b/src/apps/DIFFUSION3DPA.hpp index b7a2b1e1d..68ce2168d 100644 --- a/src/apps/DIFFUSION3DPA.hpp +++ b/src/apps/DIFFUSION3DPA.hpp @@ -632,6 +632,7 @@ class DIFFUSION3DPA : public KernelBase void runOpenMPTargetVariant(VariantID vid); private: + static const size_t default_gpu_block_size = DPA_Q1D * DPA_Q1D; Real_ptr m_B; Real_ptr m_Bt; diff --git a/src/apps/MASS3DPA.cpp b/src/apps/MASS3DPA.cpp index 9ede00404..70f21f9e1 100644 --- a/src/apps/MASS3DPA.cpp +++ b/src/apps/MASS3DPA.cpp @@ -23,6 +23,10 @@ namespace apps MASS3DPA::MASS3DPA(const RunParams& params) : KernelBase(rajaperf::Apps_MASS3DPA, params) { + setDefaultGPUBlockSize( default_gpu_block_size ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); + m_NE_default = 8000; setDefaultProblemSize(m_NE_default*MPA_Q1D*MPA_Q1D*MPA_Q1D); diff --git a/src/apps/MASS3DPA.hpp b/src/apps/MASS3DPA.hpp index 76959f33b..c94b19fbe 100644 --- a/src/apps/MASS3DPA.hpp +++ b/src/apps/MASS3DPA.hpp @@ -379,6 +379,7 @@ class MASS3DPA : public KernelBase void runOpenMPTargetVariant(VariantID vid); private: + static const size_t default_gpu_block_size = MPA_Q1D * MPA_Q1D; Real_ptr m_B; Real_ptr m_Bt; diff --git a/src/basic/MAT_MAT_SHARED.cpp b/src/basic/MAT_MAT_SHARED.cpp index dd21012e6..79cd2b49c 100644 --- a/src/basic/MAT_MAT_SHARED.cpp +++ b/src/basic/MAT_MAT_SHARED.cpp @@ -18,8 +18,11 @@ namespace rajaperf { namespace basic { MAT_MAT_SHARED::MAT_MAT_SHARED(const RunParams ¶ms) - : KernelBase(rajaperf::Basic_MAT_MAT_SHARED, params) + : KernelBase(rajaperf::Basic_MAT_MAT_SHARED, params) { + setDefaultGPUBlockSize( default_gpu_block_size ); + setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() + : getDefaultGPUBlockSize() ); m_N_default = 1000; setDefaultProblemSize(m_N_default*m_N_default); diff --git a/src/basic/MAT_MAT_SHARED.hpp b/src/basic/MAT_MAT_SHARED.hpp index 95b799eb8..50012da23 100644 --- a/src/basic/MAT_MAT_SHARED.hpp +++ b/src/basic/MAT_MAT_SHARED.hpp @@ -156,6 +156,8 @@ class MAT_MAT_SHARED : public KernelBase { void runOpenMPTargetVariant(VariantID vid); private: + static const size_t default_gpu_block_size = TL_SZ * TL_SZ; + Real_ptr m_A; Real_ptr m_B; Real_ptr m_C; From 3a94d28e08f66ee6ed0181c0872d4c58992397b2 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 25 Oct 2021 13:29:10 -0700 Subject: [PATCH 153/392] Use launch bounds in teams kernels --- src/apps/DIFFUSION3DPA-Cuda.cpp | 6 ++++-- src/apps/DIFFUSION3DPA-Hip.cpp | 6 ++++-- src/apps/MASS3DPA-Cuda.cpp | 10 ++++++---- src/apps/MASS3DPA-Hip.cpp | 8 +++++--- src/basic/MAT_MAT_SHARED-Cuda.cpp | 28 +++++++++++++++------------- src/basic/MAT_MAT_SHARED-Hip.cpp | 12 +++++++----- 6 files changed, 41 insertions(+), 29 deletions(-) diff --git a/src/apps/DIFFUSION3DPA-Cuda.cpp b/src/apps/DIFFUSION3DPA-Cuda.cpp index 61b0d0798..4c3daabcc 100644 --- a/src/apps/DIFFUSION3DPA-Cuda.cpp +++ b/src/apps/DIFFUSION3DPA-Cuda.cpp @@ -37,6 +37,8 @@ namespace apps { deallocCudaDeviceData(X); \ deallocCudaDeviceData(Y); +template < size_t block_size > + __launch_bounds__(block_size) __global__ void Diffusion3DPA(Index_type NE, const Real_ptr Basis, const Real_ptr dBasis, const Real_ptr D, const Real_ptr X, Real_ptr Y, bool symmetric) { @@ -114,7 +116,7 @@ void DIFFUSION3DPA::runCudaVariant(VariantID vid) { dim3 nthreads_per_block(DPA_Q1D, DPA_Q1D, 1); - Diffusion3DPA<<>>(NE, Basis, dBasis, D, X, Y, symmetric); + Diffusion3DPA<<>>(NE, Basis, dBasis, D, X, Y, symmetric); cudaErrchk( cudaGetLastError() ); } @@ -130,7 +132,7 @@ void DIFFUSION3DPA::runCudaVariant(VariantID vid) { DIFFUSION3DPA_DATA_SETUP_CUDA; using launch_policy = RAJA::expt::LaunchPolicy + ,RAJA::expt::cuda_launch_t >; using outer_x = RAJA::expt::LoopPolicy + __launch_bounds__(block_size) __global__ void Diffusion3DPA(Index_type NE, const Real_ptr Basis, const Real_ptr dBasis, const Real_ptr D, const Real_ptr X, Real_ptr Y, bool symmetric) { @@ -115,7 +117,7 @@ void DIFFUSION3DPA::runHipVariant(VariantID vid) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipLaunchKernelGGL((Diffusion3DPA), dim3(grid_size), dim3(block_size), 0, 0, + hipLaunchKernelGGL((Diffusion3DPA), dim3(grid_size), dim3(block_size), 0, 0, NE, Basis, dBasis, D, X, Y, symmetric); hipErrchk( hipGetLastError() ); @@ -133,7 +135,7 @@ void DIFFUSION3DPA::runHipVariant(VariantID vid) { DIFFUSION3DPA_DATA_SETUP_HIP; using launch_policy = RAJA::expt::LaunchPolicy + ,RAJA::expt::hip_launch_t >; using outer_x = RAJA::expt::LoopPolicy + __launch_bounds__(block_size) __global__ void Mass3DPA(Index_type NE, const Real_ptr B, const Real_ptr Bt, const Real_ptr D, const Real_ptr X, Real_ptr Y) { @@ -116,7 +118,7 @@ void MASS3DPA::runCudaVariant(VariantID vid) { dim3 nthreads_per_block(MPA_Q1D, MPA_Q1D, 1); - Mass3DPA<<>>(NE, B, Bt, D, X, Y); + Mass3DPA<<>>(NE, B, Bt, D, X, Y); cudaErrchk( cudaGetLastError() ); } @@ -132,7 +134,7 @@ void MASS3DPA::runCudaVariant(VariantID vid) { MASS3DPA_DATA_SETUP_CUDA; using launch_policy = RAJA::expt::LaunchPolicy + ,RAJA::expt::cuda_launch_t >; using outer_x = RAJA::expt::LoopPolicy + ); // RAJA::expt::loop } ); // RAJA::expt::loop @@ -233,7 +235,7 @@ void MASS3DPA::runCudaVariant(VariantID vid) { [&](int dx) { 
MASS3DPA_7 } - ); // RAJA::expt::loop + ); // RAJA::expt::loop } ); // RAJA::expt::loop diff --git a/src/apps/MASS3DPA-Hip.cpp b/src/apps/MASS3DPA-Hip.cpp index 12caf28c1..84c384c1c 100644 --- a/src/apps/MASS3DPA-Hip.cpp +++ b/src/apps/MASS3DPA-Hip.cpp @@ -37,6 +37,8 @@ namespace apps { deallocHipDeviceData(X); \ deallocHipDeviceData(Y); +template < size_t block_size > + __launch_bounds__(block_size) __global__ void Mass3DPA(Index_type NE, const Real_ptr B, const Real_ptr Bt, const Real_ptr D, const Real_ptr X, Real_ptr Y) { @@ -117,7 +119,7 @@ void MASS3DPA::runHipVariant(VariantID vid) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipLaunchKernelGGL((Mass3DPA), dim3(grid_size), dim3(block_size), 0, 0, + hipLaunchKernelGGL((Mass3DPA), dim3(grid_size), dim3(block_size), 0, 0, NE, B, Bt, D, X, Y); hipErrchk( hipGetLastError() ); @@ -135,7 +137,7 @@ void MASS3DPA::runHipVariant(VariantID vid) { MASS3DPA_DATA_SETUP_HIP; using launch_policy = RAJA::expt::LaunchPolicy + ,RAJA::expt::hip_launch_t >; using outer_x = RAJA::expt::LoopPolicy + ); // RAJA::expt::loop RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int dx) { diff --git a/src/basic/MAT_MAT_SHARED-Cuda.cpp b/src/basic/MAT_MAT_SHARED-Cuda.cpp index a208c3692..f8c13481e 100644 --- a/src/basic/MAT_MAT_SHARED-Cuda.cpp +++ b/src/basic/MAT_MAT_SHARED-Cuda.cpp @@ -33,6 +33,8 @@ namespace basic { deallocCudaDeviceData(B); \ deallocCudaDeviceData(C); +template < size_t block_size > + __launch_bounds__(block_size) __global__ void mat_mat_shared(Index_type N, Real_ptr C, Real_ptr A, Real_ptr B) { @@ -80,7 +82,7 @@ void MAT_MAT_SHARED::runCudaVariant(VariantID vid) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - mat_mat_shared<<>>(N, C, A, B); + mat_mat_shared<<>>(N, C, A, B); cudaErrchk( cudaGetLastError() ); } @@ -95,7 +97,7 @@ void MAT_MAT_SHARED::runCudaVariant(VariantID vid) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - lambda_cuda<<>>([=] __device__() { + lambda_cuda<<>>([=] __device__() { auto outer_y = [&](Index_type by) { auto outer_x = [&](Index_type bx) { MAT_MAT_SHARED_BODY_0 @@ -195,7 +197,7 @@ void MAT_MAT_SHARED::runCudaVariant(VariantID vid) { MAT_MAT_SHARED_DATA_SETUP_CUDA; using launch_policy = RAJA::expt::LaunchPolicy + ,RAJA::expt::cuda_launch_t >; using teams_x = RAJA::expt::LoopPolicy(ctx, RAJA::RangeSegment(0, Ny), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Ny), [&](Index_type by) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Nx), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Nx), [&](Index_type bx) { MAT_MAT_SHARED_BODY_0 - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type ty) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type tx) { MAT_MAT_SHARED_BODY_1 } ); // RAJA::expt::loop - } + } ); // RAJA::expt::loop for (Index_type k = 0; k < (TL_SZ + N - 1) / TL_SZ; k++) { - + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type ty) { - RAJA::expt::loop(ctx, - RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, + RAJA::RangeSegment(0, TL_SZ), [&](Index_type tx) { MAT_MAT_SHARED_BODY_2 } @@ -258,7 +260,7 @@ void MAT_MAT_SHARED::runCudaVariant(VariantID vid) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type ty) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type tx) { MAT_MAT_SHARED_BODY_3 
} @@ -270,7 +272,7 @@ void MAT_MAT_SHARED::runCudaVariant(VariantID vid) { } // for (k) - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type ty) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type tx) { diff --git a/src/basic/MAT_MAT_SHARED-Hip.cpp b/src/basic/MAT_MAT_SHARED-Hip.cpp index d4ea505e5..a13e53a51 100644 --- a/src/basic/MAT_MAT_SHARED-Hip.cpp +++ b/src/basic/MAT_MAT_SHARED-Hip.cpp @@ -33,6 +33,8 @@ namespace basic { deallocHipDeviceData(B); \ deallocHipDeviceData(C); +template < size_t block_size > + __launch_bounds__(block_size) __global__ void mat_mat_shared(Index_type N, Real_ptr C, Real_ptr A, Real_ptr B) { @@ -80,7 +82,7 @@ void MAT_MAT_SHARED::runHipVariant(VariantID vid) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipLaunchKernelGGL((mat_mat_shared), dim3(grid_size), dim3(block_size), 0, 0, + hipLaunchKernelGGL((mat_mat_shared), dim3(grid_size), dim3(block_size), 0, 0, N, C, A, B); hipErrchk( hipGetLastError() ); @@ -186,7 +188,7 @@ void MAT_MAT_SHARED::runHipVariant(VariantID vid) { } }; - hipLaunchKernelGGL(lambda_hip, + hipLaunchKernelGGL(lambda_hip, grid_size, block_size, 0, 0, mat_mat_shared_lam); hipErrchk( hipGetLastError() ); @@ -200,7 +202,7 @@ void MAT_MAT_SHARED::runHipVariant(VariantID vid) { MAT_MAT_SHARED_DATA_SETUP_HIP; using launch_policy = RAJA::expt::LaunchPolicy + ,RAJA::expt::hip_launch_t >; using teams_x = RAJA::expt::LoopPolicy(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type tx) { MAT_MAT_SHARED_BODY_3 - } + } ); // RAJA::expt::loop } ); // RAJA::expt::loop ctx.teamSync(); - + } // for (k) RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), From ffdce3c8cff28252b55dc0a5c3b828b491b2f83c Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 25 Oct 2021 15:42:35 -0700 Subject: [PATCH 154/392] Enable multiple block sizes in MAT_MAT_SHARED Add ability to remove invalid block_sizes from gpu_block_sizes_type. This allows the perf suite to compile with block_sizes that are invalid for some kernels but valid for others. 
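The key mechanism in this patch is a compile-time filter: the per-kernel block-size list is rebuilt with any size the kernel cannot use stripped out (for MAT_MAT_SHARED, sizes that are not perfect squares, since it launches tile_size x tile_size threads). The sketch below, compiled as C++17, reproduces that idea with std::index_sequence in place of camp::int_seq; the names are illustrative rather than the suite's API, but the constexpr square root and the recursive remove_invalid follow the same shape as the code added to GPUUtils.hpp.

#include <cstddef>
#include <iostream>
#include <utility>

// binary search for the largest s with s*s <= n (same approach as the patch)
constexpr std::size_t isqrt_helper(std::size_t n, std::size_t lo, std::size_t hi)
{
  return (lo == hi) ? lo
                    : ((n / ((lo + hi + 1) / 2) < ((lo + hi + 1) / 2))
                         ? isqrt_helper(n, lo, (lo + hi + 1) / 2 - 1)
                         : isqrt_helper(n, (lo + hi + 1) / 2, hi));
}
constexpr std::size_t isqrt(std::size_t n) { return isqrt_helper(n, 0, n / 2 + 1); }

// validity checker: keep only sizes usable as tile_size*tile_size
struct ExactSqrt {
  template <std::size_t I>
  static constexpr bool valid() { return isqrt(I) * isqrt(I) == I; }
};

// prepend I to a sequence only when the flag is true
template <bool B, std::size_t I, typename Seq>
struct conditional_prepend { using type = Seq; };

template <std::size_t I, std::size_t... Is>
struct conditional_prepend<true, I, std::index_sequence<Is...>> {
  using type = std::index_sequence<I, Is...>;
};

// recursively rebuild the sequence, dropping entries the checker rejects
template <typename Checker, typename Seq>
struct remove_invalid;

template <typename Checker>
struct remove_invalid<Checker, std::index_sequence<>> {
  using type = std::index_sequence<>;
};

template <typename Checker, std::size_t I, std::size_t... Is>
struct remove_invalid<Checker, std::index_sequence<I, Is...>> {
  using type = typename conditional_prepend<
      Checker::template valid<I>(), I,
      typename remove_invalid<Checker, std::index_sequence<Is...>>::type>::type;
};

template <std::size_t... Is>
void print(std::index_sequence<Is...>)
{
  ((std::cout << Is << ' '), ...);
  std::cout << '\n';
}

int main()
{
  using all_sizes   = std::index_sequence<64, 128, 256, 512, 1024>;  // hypothetical build-wide list
  using valid_sizes = remove_invalid<ExactSqrt, all_sizes>::type;    // 128 and 512 are dropped
  print(valid_sizes{});  // prints: 64 256 1024
}

Because the filtering happens at the type level, a single build-wide block-size list may contain entries like 128 or 512 without breaking kernels that only accept square blocks; those kernels simply never instantiate the invalid sizes, which is what lets the suite compile with block sizes that are invalid for some kernels but valid for others.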
--- src/basic/MAT_MAT_SHARED-Cuda.cpp | 105 ++++++++++++++++------------- src/basic/MAT_MAT_SHARED-Hip.cpp | 107 +++++++++++++++++------------- src/basic/MAT_MAT_SHARED-OMP.cpp | 40 +++++------ src/basic/MAT_MAT_SHARED-Seq.cpp | 38 +++++------ src/basic/MAT_MAT_SHARED.cpp | 8 ++- src/basic/MAT_MAT_SHARED.hpp | 56 +++++++++------- src/common/GPUUtils.hpp | 97 +++++++++++++++++++++++++-- 7 files changed, 288 insertions(+), 163 deletions(-) diff --git a/src/basic/MAT_MAT_SHARED-Cuda.cpp b/src/basic/MAT_MAT_SHARED-Cuda.cpp index f8c13481e..9545683a2 100644 --- a/src/basic/MAT_MAT_SHARED-Cuda.cpp +++ b/src/basic/MAT_MAT_SHARED-Cuda.cpp @@ -33,8 +33,8 @@ namespace basic { deallocCudaDeviceData(B); \ deallocCudaDeviceData(C); -template < size_t block_size > - __launch_bounds__(block_size) +template < size_t tile_size > + __launch_bounds__(tile_size*tile_size) __global__ void mat_mat_shared(Index_type N, Real_ptr C, Real_ptr A, Real_ptr B) { @@ -43,35 +43,39 @@ __global__ void mat_mat_shared(Index_type N, Real_ptr C, Real_ptr A, Index_type bx = blockIdx.x; Index_type by = blockIdx.y; - MAT_MAT_SHARED_BODY_0 + MAT_MAT_SHARED_BODY_0(tile_size) - MAT_MAT_SHARED_BODY_1 + MAT_MAT_SHARED_BODY_1(tile_size) - for (Index_type k = 0; k < (TL_SZ + N - 1) / TL_SZ; k++) { + for (Index_type k = 0; k < (tile_size + N - 1) / tile_size; k++) { - MAT_MAT_SHARED_BODY_2 + MAT_MAT_SHARED_BODY_2(tile_size) __syncthreads(); - MAT_MAT_SHARED_BODY_3 + MAT_MAT_SHARED_BODY_3(tile_size) __syncthreads(); } - MAT_MAT_SHARED_BODY_4 + MAT_MAT_SHARED_BODY_4(tile_size) } -void MAT_MAT_SHARED::runCudaVariant(VariantID vid) { +template < size_t block_size > +void MAT_MAT_SHARED::runCudaVariantImpl(VariantID vid) +{ + constexpr size_t tile_size = gpu_block_size::sqrt(block_size); + static_assert(tile_size*tile_size == block_size, "Invalid block_size"); const Index_type run_reps = getRunReps(); const Index_type N = m_N; - dim3 block_size(TL_SZ, TL_SZ); - dim3 grid_size(RAJA_DIVIDE_CEILING_INT(N, block_size.x), - RAJA_DIVIDE_CEILING_INT(N, block_size.y)); + dim3 blockDim(tile_size, tile_size); + dim3 gridDim(RAJA_DIVIDE_CEILING_INT(N, blockDim.x), + RAJA_DIVIDE_CEILING_INT(N, blockDim.y)); - const Index_type Nx = grid_size.x; - const Index_type Ny = grid_size.y; + const Index_type Nx = gridDim.x; + const Index_type Ny = gridDim.y; MAT_MAT_SHARED_DATA_SETUP; @@ -82,7 +86,7 @@ void MAT_MAT_SHARED::runCudaVariant(VariantID vid) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - mat_mat_shared<<>>(N, C, A, B); + mat_mat_shared<<>>(N, C, A, B); cudaErrchk( cudaGetLastError() ); } @@ -97,60 +101,60 @@ void MAT_MAT_SHARED::runCudaVariant(VariantID vid) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - lambda_cuda<<>>([=] __device__() { + lambda_cuda<<>>([=] __device__() { auto outer_y = [&](Index_type by) { auto outer_x = [&](Index_type bx) { - MAT_MAT_SHARED_BODY_0 + MAT_MAT_SHARED_BODY_0(tile_size) auto inner_y_1 = [&](Index_type ty) { - auto inner_x_1 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_1 }; + auto inner_x_1 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_1(tile_size) }; { Index_type tx = threadIdx.x; - if (tx < TL_SZ) + if (tx < tile_size) inner_x_1(tx); } }; { Index_type ty = threadIdx.y; - if (ty < TL_SZ) + if (ty < tile_size) inner_y_1(ty); } - for (Index_type k = 0; k < (TL_SZ + N - 1) / TL_SZ; ++k) { + for (Index_type k = 0; k < (tile_size + N - 1) / tile_size; ++k) { auto inner_y_2 = [&](Index_type ty) { - auto inner_x_2 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_2 }; + auto inner_x_2 
= [&](Index_type tx) { MAT_MAT_SHARED_BODY_2(tile_size) }; { Index_type tx = threadIdx.x; - if (tx < TL_SZ) + if (tx < tile_size) inner_x_2(tx); } }; { Index_type ty = threadIdx.y; - if (ty < TL_SZ) + if (ty < tile_size) inner_y_2(ty); } __syncthreads(); auto inner_y_3 = [&](Index_type ty) { - auto inner_x_3 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_3 }; + auto inner_x_3 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_3(tile_size) }; { Index_type tx = threadIdx.x; - if (tx < TL_SZ) + if (tx < tile_size) inner_x_3(tx); } }; { Index_type ty = threadIdx.y; - if (ty < TL_SZ) + if (ty < tile_size) inner_y_3(ty); } @@ -158,18 +162,18 @@ void MAT_MAT_SHARED::runCudaVariant(VariantID vid) { } auto inner_y_4 = [&](Index_type ty) { - auto inner_x_4 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_4 }; + auto inner_x_4 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_4(tile_size) }; { Index_type tx = threadIdx.x; - if (tx < TL_SZ) + if (tx < tile_size) inner_x_4(tx); } }; { Index_type ty = threadIdx.y; - if (ty < TL_SZ) + if (ty < tile_size) inner_y_4(ty); } }; // outer_x @@ -197,7 +201,7 @@ void MAT_MAT_SHARED::runCudaVariant(VariantID vid) { MAT_MAT_SHARED_DATA_SETUP_CUDA; using launch_policy = RAJA::expt::LaunchPolicy + ,RAJA::expt::cuda_launch_t >; using teams_x = RAJA::expt::LoopPolicy( RAJA::expt::DEVICE, RAJA::expt::Grid(RAJA::expt::Teams(Nx, Ny), - RAJA::expt::Threads(TL_SZ, TL_SZ)), + RAJA::expt::Threads(tile_size, tile_size)), [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Ny), @@ -231,26 +235,26 @@ void MAT_MAT_SHARED::runCudaVariant(VariantID vid) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Nx), [&](Index_type bx) { - MAT_MAT_SHARED_BODY_0 + MAT_MAT_SHARED_BODY_0(tile_size) - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type ty) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_1 + MAT_MAT_SHARED_BODY_1(tile_size) } ); // RAJA::expt::loop } ); // RAJA::expt::loop - for (Index_type k = 0; k < (TL_SZ + N - 1) / TL_SZ; k++) { + for (Index_type k = 0; k < (tile_size + N - 1) / tile_size; k++) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type ty) { RAJA::expt::loop(ctx, - RAJA::RangeSegment(0, TL_SZ), + RAJA::RangeSegment(0, tile_size), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_2 + MAT_MAT_SHARED_BODY_2(tile_size) } ); // RAJA::expt::loop } @@ -258,11 +262,11 @@ void MAT_MAT_SHARED::runCudaVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type ty) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_3 + MAT_MAT_SHARED_BODY_3(tile_size) } ); // RAJA::expt::loop } @@ -272,11 +276,11 @@ void MAT_MAT_SHARED::runCudaVariant(VariantID vid) { } // for (k) - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type ty) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_4 + MAT_MAT_SHARED_BODY_4(tile_size) } ); // RAJA::expt::loop } @@ -301,6 +305,15 @@ void MAT_MAT_SHARED::runCudaVariant(VariantID vid) { } } +void 
MAT_MAT_SHARED::runCudaVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n MAT_MAT_SHARED : Unsupported Cuda block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/MAT_MAT_SHARED-Hip.cpp b/src/basic/MAT_MAT_SHARED-Hip.cpp index a13e53a51..a3f87e878 100644 --- a/src/basic/MAT_MAT_SHARED-Hip.cpp +++ b/src/basic/MAT_MAT_SHARED-Hip.cpp @@ -33,8 +33,8 @@ namespace basic { deallocHipDeviceData(B); \ deallocHipDeviceData(C); -template < size_t block_size > - __launch_bounds__(block_size) +template < size_t tile_size > + __launch_bounds__(tile_size*tile_size) __global__ void mat_mat_shared(Index_type N, Real_ptr C, Real_ptr A, Real_ptr B) { @@ -43,35 +43,39 @@ __global__ void mat_mat_shared(Index_type N, Real_ptr C, Real_ptr A, Index_type bx = blockIdx.x; Index_type by = blockIdx.y; - MAT_MAT_SHARED_BODY_0 + MAT_MAT_SHARED_BODY_0(tile_size) - MAT_MAT_SHARED_BODY_1 + MAT_MAT_SHARED_BODY_1(tile_size) - for (Index_type k = 0; k < (TL_SZ + N - 1) / TL_SZ; k++) { + for (Index_type k = 0; k < (tile_size + N - 1) / tile_size; k++) { - MAT_MAT_SHARED_BODY_2 + MAT_MAT_SHARED_BODY_2(tile_size) __syncthreads(); - MAT_MAT_SHARED_BODY_3 + MAT_MAT_SHARED_BODY_3(tile_size) __syncthreads(); } - MAT_MAT_SHARED_BODY_4 + MAT_MAT_SHARED_BODY_4(tile_size) } -void MAT_MAT_SHARED::runHipVariant(VariantID vid) { +template < size_t block_size > +void MAT_MAT_SHARED::runHipVariantImpl(VariantID vid) +{ + constexpr size_t tile_size = gpu_block_size::sqrt(block_size); + static_assert(tile_size*tile_size == block_size, "Invalid block_size"); const Index_type run_reps = getRunReps(); const Index_type N = m_N; - dim3 block_size(TL_SZ, TL_SZ); - dim3 grid_size(RAJA_DIVIDE_CEILING_INT(N, block_size.x), - RAJA_DIVIDE_CEILING_INT(N, block_size.y)); + dim3 blockDim(tile_size, tile_size); + dim3 gridDim(RAJA_DIVIDE_CEILING_INT(N, blockDim.x), + RAJA_DIVIDE_CEILING_INT(N, blockDim.y)); - const Index_type Nx = grid_size.x; - const Index_type Ny = grid_size.y; + const Index_type Nx = gridDim.x; + const Index_type Ny = gridDim.y; MAT_MAT_SHARED_DATA_SETUP; @@ -82,7 +86,7 @@ void MAT_MAT_SHARED::runHipVariant(VariantID vid) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipLaunchKernelGGL((mat_mat_shared), dim3(grid_size), dim3(block_size), 0, 0, + hipLaunchKernelGGL((mat_mat_shared), dim3(gridDim), dim3(blockDim), 0, 0, N, C, A, B); hipErrchk( hipGetLastError() ); @@ -102,57 +106,57 @@ void MAT_MAT_SHARED::runHipVariant(VariantID vid) { auto outer_y = [&](Index_type by) { auto outer_x = [&](Index_type bx) { - MAT_MAT_SHARED_BODY_0 + MAT_MAT_SHARED_BODY_0(tile_size) auto inner_y_1 = [&](Index_type ty) { - auto inner_x_1 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_1 }; + auto inner_x_1 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_1(tile_size) }; { Index_type tx = threadIdx.x; - if (tx < TL_SZ) + if (tx < tile_size) inner_x_1(tx); } }; { Index_type ty = threadIdx.y; - if (ty < TL_SZ) + if (ty < tile_size) inner_y_1(ty); } - for (Index_type k = 0; k < (TL_SZ + N - 1) / TL_SZ; ++k) { + for (Index_type k = 0; k < (tile_size + N - 1) / tile_size; ++k) { auto inner_y_2 = [&](Index_type ty) { - auto inner_x_2 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_2 }; + auto inner_x_2 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_2(tile_size) }; { Index_type tx = threadIdx.x; - if (tx < TL_SZ) + if (tx < tile_size) 
inner_x_2(tx); } }; { Index_type ty = threadIdx.y; - if (ty < TL_SZ) + if (ty < tile_size) inner_y_2(ty); } __syncthreads(); auto inner_y_3 = [&](Index_type ty) { - auto inner_x_3 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_3 }; + auto inner_x_3 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_3(tile_size) }; { Index_type tx = threadIdx.x; - if (tx < TL_SZ) + if (tx < tile_size) inner_x_3(tx); } }; { Index_type ty = threadIdx.y; - if (ty < TL_SZ) + if (ty < tile_size) inner_y_3(ty); } @@ -160,18 +164,18 @@ void MAT_MAT_SHARED::runHipVariant(VariantID vid) { } auto inner_y_4 = [&](Index_type ty) { - auto inner_x_4 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_4 }; + auto inner_x_4 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_4(tile_size) }; { Index_type tx = threadIdx.x; - if (tx < TL_SZ) + if (tx < tile_size) inner_x_4(tx); } }; { Index_type ty = threadIdx.y; - if (ty < TL_SZ) + if (ty < tile_size) inner_y_4(ty); } }; // outer_x @@ -188,8 +192,8 @@ void MAT_MAT_SHARED::runHipVariant(VariantID vid) { } }; - hipLaunchKernelGGL(lambda_hip, - grid_size, block_size, 0, 0, mat_mat_shared_lam); + hipLaunchKernelGGL((lambda_hip), + gridDim, blockDim, 0, 0, mat_mat_shared_lam); hipErrchk( hipGetLastError() ); } @@ -202,7 +206,7 @@ void MAT_MAT_SHARED::runHipVariant(VariantID vid) { MAT_MAT_SHARED_DATA_SETUP_HIP; using launch_policy = RAJA::expt::LaunchPolicy + ,RAJA::expt::hip_launch_t >; using teams_x = RAJA::expt::LoopPolicy( RAJA::expt::DEVICE, RAJA::expt::Grid(RAJA::expt::Teams(Nx, Ny), - RAJA::expt::Threads(TL_SZ, TL_SZ)), + RAJA::expt::Threads(tile_size, tile_size)), [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Ny), @@ -235,25 +239,25 @@ void MAT_MAT_SHARED::runHipVariant(VariantID vid) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Nx), [&](Index_type bx) { - MAT_MAT_SHARED_BODY_0 + MAT_MAT_SHARED_BODY_0(tile_size) - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type ty) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_1 + MAT_MAT_SHARED_BODY_1(tile_size) } ); // RAJA::expt::loop } ); // RAJA::expt::loop - for (Index_type k = 0; k < (TL_SZ + N - 1) / TL_SZ; k++) { + for (Index_type k = 0; k < (tile_size + N - 1) / tile_size; k++) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type ty) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_2 + MAT_MAT_SHARED_BODY_2(tile_size) } ); // RAJA::expt::loop } @@ -261,11 +265,11 @@ void MAT_MAT_SHARED::runHipVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type ty) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_3 + MAT_MAT_SHARED_BODY_3(tile_size) } ); // RAJA::expt::loop } @@ -275,11 +279,11 @@ void MAT_MAT_SHARED::runHipVariant(VariantID vid) { } // for (k) - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type ty) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type tx) { - 
MAT_MAT_SHARED_BODY_4 + MAT_MAT_SHARED_BODY_4(tile_size) } ); // RAJA::expt::loop } @@ -304,6 +308,15 @@ void MAT_MAT_SHARED::runHipVariant(VariantID vid) { } } +void MAT_MAT_SHARED::runHipVariant(VariantID vid) +{ + if ( !gpu_block_size::invoke_or( + gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { + std::cout << "\n MAT_MAT_SHARED : Unsupported Hip block_size " << getActualGPUBlockSize() + <<" for variant id = " << vid << std::endl; + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/MAT_MAT_SHARED-OMP.cpp b/src/basic/MAT_MAT_SHARED-OMP.cpp index c120646f6..0c46e7036 100644 --- a/src/basic/MAT_MAT_SHARED-OMP.cpp +++ b/src/basic/MAT_MAT_SHARED-OMP.cpp @@ -39,11 +39,11 @@ void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid) { for (Index_type by = 0; by < Ny; ++by) { for (Index_type bx = 0; bx < Nx; ++bx) { - MAT_MAT_SHARED_BODY_0 + MAT_MAT_SHARED_BODY_0(TL_SZ) for (Index_type ty = 0; ty < TL_SZ; ++ty) { for (Index_type tx = 0; tx < TL_SZ; ++tx) { - MAT_MAT_SHARED_BODY_1 + MAT_MAT_SHARED_BODY_1(TL_SZ) } } @@ -52,21 +52,21 @@ void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid) { for (Index_type ty = 0; ty < TL_SZ; ++ty) { for (Index_type tx = 0; tx < TL_SZ; ++tx) { - MAT_MAT_SHARED_BODY_2 + MAT_MAT_SHARED_BODY_2(TL_SZ) } } for (Index_type ty = 0; ty < TL_SZ; ++ty) { for (Index_type tx = 0; tx < TL_SZ; ++tx) { - MAT_MAT_SHARED_BODY_3 + MAT_MAT_SHARED_BODY_3(TL_SZ) } } } for (Index_type ty = 0; ty < TL_SZ; ++ty) { for (Index_type tx = 0; tx < TL_SZ; ++tx) { - MAT_MAT_SHARED_BODY_4 + MAT_MAT_SHARED_BODY_4(TL_SZ) } } } @@ -85,10 +85,10 @@ void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid) { auto outer_y = [&](Index_type by) { auto outer_x = [&](Index_type bx) { - MAT_MAT_SHARED_BODY_0 + MAT_MAT_SHARED_BODY_0(TL_SZ) auto inner_y_1 = [&](Index_type ty) { - auto inner_x_1 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_1 }; + auto inner_x_1 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_1(TL_SZ) }; for (Index_type tx = 0; tx < TL_SZ; ++tx) { if (tx < TL_SZ) @@ -104,7 +104,7 @@ void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid) { for (Index_type k = 0; k < (TL_SZ + N - 1) / TL_SZ; ++k) { auto inner_y_2 = [&](Index_type ty) { - auto inner_x_2 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_2 }; + auto inner_x_2 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_2(TL_SZ) }; for (Index_type tx = 0; tx < TL_SZ; ++tx) { inner_x_2(tx); @@ -116,7 +116,7 @@ void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid) { } auto inner_y_3 = [&](Index_type ty) { - auto inner_x_3 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_3 }; + auto inner_x_3 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_3(TL_SZ) }; for (Index_type tx = 0; tx < TL_SZ; ++tx) { inner_x_3(tx); @@ -129,7 +129,7 @@ void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid) { } auto inner_y_4 = [&](Index_type ty) { - auto inner_x_4 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_4 }; + auto inner_x_4 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_4(TL_SZ) }; for (Index_type tx = 0; tx < TL_SZ; ++tx) { inner_x_4(tx); @@ -196,18 +196,18 @@ void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid) { RAJA::expt::launch(RAJA::expt::HOST, RAJA::expt::Grid(), [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Ny), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Ny), [&](Index_type by) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Nx), [&](Index_type bx) { - MAT_MAT_SHARED_BODY_0 + MAT_MAT_SHARED_BODY_0(TL_SZ) RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type ty) { RAJA::expt::loop(ctx, 
RAJA::RangeSegment(0, TL_SZ), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_1 + MAT_MAT_SHARED_BODY_1(TL_SZ) } ); // RAJA::expt::loop } @@ -219,7 +219,7 @@ void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid) { [&](Index_type ty) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_2 + MAT_MAT_SHARED_BODY_2(TL_SZ) } ); // RAJA::expt::loop } @@ -231,7 +231,7 @@ void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid) { [&](Index_type ty) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_3 + MAT_MAT_SHARED_BODY_3(TL_SZ) } ); // RAJA::expt::loop } @@ -245,7 +245,7 @@ void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid) { [&](Index_type ty) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_4 + MAT_MAT_SHARED_BODY_4(TL_SZ) } ); // RAJA::expt::loop } @@ -253,13 +253,13 @@ void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid) { } // lambda (bx) ); // RAJA::expt::loop - } // lambda (by) + } // lambda (by) ); // RAJA::expt::loop } // outer lambda (ctx) - ); // RAJA::expt::launch + ); // RAJA::expt::launch - } // loop over kernel reps + } // loop over kernel reps stopTimer(); break; @@ -271,7 +271,7 @@ void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid) { } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/basic/MAT_MAT_SHARED-Seq.cpp b/src/basic/MAT_MAT_SHARED-Seq.cpp index 00119d1b3..10e7299de 100644 --- a/src/basic/MAT_MAT_SHARED-Seq.cpp +++ b/src/basic/MAT_MAT_SHARED-Seq.cpp @@ -34,11 +34,11 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid) { //Work around for when compiling with CLANG and HIP //See notes in MAT_MAT_SHARED.hpp - MAT_MAT_SHARED_BODY_0_CLANG_HIP_CPU + MAT_MAT_SHARED_BODY_0_CLANG_HIP_CPU(TL_SZ) for (Index_type ty = 0; ty < TL_SZ; ++ty) { for (Index_type tx = 0; tx < TL_SZ; ++tx) { - MAT_MAT_SHARED_BODY_1 + MAT_MAT_SHARED_BODY_1(TL_SZ) } } @@ -46,13 +46,13 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid) { for (Index_type ty = 0; ty < TL_SZ; ++ty) { for (Index_type tx = 0; tx < TL_SZ; ++tx) { - MAT_MAT_SHARED_BODY_2 + MAT_MAT_SHARED_BODY_2(TL_SZ) } } for (Index_type ty = 0; ty < TL_SZ; ++ty) { for (Index_type tx = 0; tx < TL_SZ; ++tx) { - MAT_MAT_SHARED_BODY_3 + MAT_MAT_SHARED_BODY_3(TL_SZ) } } @@ -60,7 +60,7 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid) { for (Index_type ty = 0; ty < TL_SZ; ++ty) { for (Index_type tx = 0; tx < TL_SZ; ++tx) { - MAT_MAT_SHARED_BODY_4 + MAT_MAT_SHARED_BODY_4(TL_SZ) } } } @@ -82,10 +82,10 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid) { auto outer_y = [&](Index_type by) { auto outer_x = [&](Index_type bx) { - MAT_MAT_SHARED_BODY_0_CLANG_HIP_CPU + MAT_MAT_SHARED_BODY_0_CLANG_HIP_CPU(TL_SZ) auto inner_y_1 = [&](Index_type ty) { - auto inner_x_1 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_1 }; + auto inner_x_1 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_1(TL_SZ) }; for (Index_type tx = 0; tx < TL_SZ; ++tx) { if (tx < TL_SZ) @@ -101,7 +101,7 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid) { for (Index_type k = 0; k < (TL_SZ + N - 1) / TL_SZ; ++k) { auto inner_y_2 = [&](Index_type ty) { - auto inner_x_2 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_2 }; + auto inner_x_2 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_2(TL_SZ) }; for (Index_type tx = 0; tx < TL_SZ; ++tx) { inner_x_2(tx); @@ -113,7 +113,7 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid) { } auto inner_y_3 = [&](Index_type ty) { - auto inner_x_3 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_3 }; + auto inner_x_3 = [&](Index_type tx) { 
MAT_MAT_SHARED_BODY_3(TL_SZ) }; for (Index_type tx = 0; tx < TL_SZ; ++tx) { inner_x_3(tx); @@ -126,7 +126,7 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid) { } auto inner_y_4 = [&](Index_type ty) { - auto inner_x_4 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_4 }; + auto inner_x_4 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_4(TL_SZ) }; for (Index_type tx = 0; tx < TL_SZ; ++tx) { inner_x_4(tx); @@ -193,19 +193,19 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid) { RAJA::expt::launch(RAJA::expt::HOST, RAJA::expt::Grid(), [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Ny), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Ny), [&](Index_type by) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Nx), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Nx), [&](Index_type bx) { - MAT_MAT_SHARED_BODY_0 + MAT_MAT_SHARED_BODY_0(TL_SZ) RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type ty) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_1 + MAT_MAT_SHARED_BODY_1(TL_SZ) } ); // RAJA::expt::loop } @@ -217,7 +217,7 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid) { [&](Index_type ty) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_2 + MAT_MAT_SHARED_BODY_2(TL_SZ) } ); // RAJA::expt::loop } @@ -229,9 +229,9 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid) { [&](Index_type ty) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_3 + MAT_MAT_SHARED_BODY_3(TL_SZ) } - ); // RAJA::expt::loop + ); // RAJA::expt::loop } ); // RAJA::expt::loop @@ -243,7 +243,7 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid) { [&](Index_type ty) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_4 + MAT_MAT_SHARED_BODY_4(TL_SZ) } ); // RAJA::expt::loop } diff --git a/src/basic/MAT_MAT_SHARED.cpp b/src/basic/MAT_MAT_SHARED.cpp index 79cd2b49c..91f19a42b 100644 --- a/src/basic/MAT_MAT_SHARED.cpp +++ b/src/basic/MAT_MAT_SHARED.cpp @@ -20,7 +20,7 @@ namespace basic { MAT_MAT_SHARED::MAT_MAT_SHARED(const RunParams ¶ms) : KernelBase(rajaperf::Basic_MAT_MAT_SHARED, params) { - setDefaultGPUBlockSize( default_gpu_block_size ); + setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() : getDefaultGPUBlockSize() ); @@ -87,5 +87,11 @@ void MAT_MAT_SHARED::tearDown(VariantID vid) { deallocData(m_C); } +bool MAT_MAT_SHARED::isGPUBlockSizeSupported() const +{ + return gpu_block_size::invoke_or( + gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/MAT_MAT_SHARED.hpp b/src/basic/MAT_MAT_SHARED.hpp index 50012da23..08dfd4ece 100644 --- a/src/basic/MAT_MAT_SHARED.hpp +++ b/src/basic/MAT_MAT_SHARED.hpp @@ -84,37 +84,38 @@ constexpr rajaperf::Index_type TL_SZ = 16; functions. Nvcc doesn't look at host only code when it does the device pass so it doesn't see these kind of problems. 
*/ -#define MAT_MAT_SHARED_BODY_0_CLANG_HIP_CPU \ - double As[TL_SZ][TL_SZ]; \ - double Bs[TL_SZ][TL_SZ]; \ - double Cs[TL_SZ][TL_SZ]; - -#define MAT_MAT_SHARED_BODY_0 \ - RAJA_TEAM_SHARED double As[TL_SZ][TL_SZ]; \ - RAJA_TEAM_SHARED double Bs[TL_SZ][TL_SZ]; \ - RAJA_TEAM_SHARED double Cs[TL_SZ][TL_SZ]; - -#define MAT_MAT_SHARED_BODY_1 Cs[ty][tx] = 0; - -#define MAT_MAT_SHARED_BODY_2 \ - const Index_type Row = by * TL_SZ + ty; \ - const Index_type Col = bx * TL_SZ + tx; \ - if (k * TL_SZ + tx < N && Row < N) \ - As[ty][tx] = A[Row * N + k * TL_SZ + tx]; \ +#define MAT_MAT_SHARED_BODY_0_CLANG_HIP_CPU(tile_size) \ + double As[tile_size][tile_size]; \ + double Bs[tile_size][tile_size]; \ + double Cs[tile_size][tile_size]; + +#define MAT_MAT_SHARED_BODY_0(tile_size) \ + RAJA_TEAM_SHARED double As[tile_size][tile_size]; \ + RAJA_TEAM_SHARED double Bs[tile_size][tile_size]; \ + RAJA_TEAM_SHARED double Cs[tile_size][tile_size]; + +#define MAT_MAT_SHARED_BODY_1(tile_size) \ + Cs[ty][tx] = 0; + +#define MAT_MAT_SHARED_BODY_2(tile_size) \ + const Index_type Row = by * tile_size + ty; \ + const Index_type Col = bx * tile_size + tx; \ + if (k * tile_size + tx < N && Row < N) \ + As[ty][tx] = A[Row * N + k * tile_size + tx]; \ else \ As[ty][tx] = 0.0; \ - if (k * TL_SZ + ty < N && Col < N) \ - Bs[ty][tx] = B[(k * TL_SZ + ty) * N + Col]; \ + if (k * tile_size + ty < N && Col < N) \ + Bs[ty][tx] = B[(k * tile_size + ty) * N + Col]; \ else \ Bs[ty][tx] = 0.0; -#define MAT_MAT_SHARED_BODY_3 \ - for (Index_type n = 0; n < TL_SZ; ++n) \ +#define MAT_MAT_SHARED_BODY_3(tile_size) \ + for (Index_type n = 0; n < tile_size; ++n) \ Cs[ty][tx] += As[ty][n] * Bs[n][tx]; -#define MAT_MAT_SHARED_BODY_4 \ - const Index_type Row = by * TL_SZ + ty; \ - const Index_type Col = bx * TL_SZ + tx; \ +#define MAT_MAT_SHARED_BODY_4(tile_size) \ + const Index_type Row = by * tile_size + ty; \ + const Index_type Col = bx * tile_size + tx; \ if (Row < N && Col < N) \ C[Col + N * Row] = Cs[ty][tx]; @@ -155,8 +156,15 @@ class MAT_MAT_SHARED : public KernelBase { void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + bool isGPUBlockSizeSupported() const; + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: static const size_t default_gpu_block_size = TL_SZ * TL_SZ; + using gpu_block_sizes_type = gpu_block_size::list_type; Real_ptr m_A; Real_ptr m_B; diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp index 07eff4e8f..b8dd68cea 100644 --- a/src/common/GPUUtils.hpp +++ b/src/common/GPUUtils.hpp @@ -26,6 +26,17 @@ namespace gpu_block_size namespace detail { +// implementation of sqrt via binary search +// copied from https://stackoverflow.com/questions/8622256/in-c11-is-sqrt-defined-as-constexpr +constexpr size_t sqrt_helper(size_t n, size_t lo, size_t hi) +{ + return (lo == hi) + ? lo + : ((n / ((lo + hi + 1) / 2) < ((lo + hi + 1) / 2)) + ? 
sqrt_helper(n, lo, ((lo + hi + 1) / 2)-1) + : sqrt_helper(n, ((lo + hi + 1) / 2), hi)); +} + // helpers to invoke f with each integer in the param pack template < typename F > bool invoke_or_helper(F) @@ -49,8 +60,52 @@ struct SizeOfIntSeq> static const size_t size = sizeof...(Is); }; +// class to help prepend integers to a list +// this is used for the false case where I is not prepended to IntSeq +template < bool B, size_t I, typename IntSeq > +struct conditional_prepend +{ + using type = IntSeq; +}; +/// this is used for the true case where I is prepended to IntSeq +template < size_t I, size_t... Is > +struct conditional_prepend> +{ + using type = camp::int_seq; +}; + +// class to help create a sequence that is only the valid values in IntSeq +template < typename validity_checker, typename IntSeq > +struct remove_invalid; + +// base case where the list is empty, use the empty list +template < typename validity_checker > +struct remove_invalid> +{ + using type = camp::int_seq; +}; + +// check validity of I and conditionally prepend I to a recursively generated +// list of valid values +template < typename validity_checker, size_t I, size_t... Is > +struct remove_invalid> +{ + using type = typename conditional_prepend< + validity_checker::template valid(), + I, + typename remove_invalid>::type + >::type; +}; + + } // namespace detail +// constexpr implementation of integer sqrt +constexpr size_t sqrt(size_t n) +{ + return detail::sqrt_helper(n, 0, n/2 + 1); +} + // call f's call operator with each integer as the template param in turn // stopping at the first integer that returns true. // return true if any f() returns true, otherwise return false @@ -125,23 +180,53 @@ struct RunHipBlockSize // return default_I if it is in sizes or the first integer in sizes otherwise template < size_t I, size_t... 
Is > -size_t get_default_or_first(size_t default_I, camp::int_seq sizes) +inline size_t get_default_or_first(size_t default_I, camp::int_seq sizes) { if (invoke_or(Equals(default_I), sizes)) { return default_I; } return I; } +/// base case when sizes is empty +inline size_t get_default_or_first(size_t, camp::int_seq) +{ + return 0; +} + +// always true +struct AllowAny +{ + template < size_t I > + static constexpr bool valid() { return true; } +}; + +// true if of I is a multiple of N, false otherwise +template < size_t N > +struct MultipleOf +{ + template < size_t I > + static constexpr bool valid() { return (I/N)*N == I; } +}; + +// true if the sqrt of I is representable as a size_t, false otherwise +struct ExactSqrt +{ + template < size_t I > + static constexpr bool valid() { return sqrt(I)*sqrt(I) == I; } +}; // A camp::int_seq of size_t's that is rajaperf::configuration::gpu_block_sizes // if rajaperf::configuration::gpu_block_sizes is not empty // and a camp::int_seq of default_block_size otherwise -template < size_t default_block_size > +// with invalid entries removed according to validity_checker +template < size_t default_block_size, typename validity_checker = AllowAny > using list_type = - typename std::conditional< (detail::SizeOfIntSeq::size > 0), - rajaperf::configuration::gpu_block_sizes, - camp::int_seq - >::type; + typename detail::remove_invalid::size > 0), + rajaperf::configuration::gpu_block_sizes, + camp::int_seq + >::type + >::type; } // closing brace for gpu_block_size namespace From 783e5badca95352c16c42f9936d4751abc616ca5 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 25 Oct 2021 15:43:10 -0700 Subject: [PATCH 155/392] Add validity clause to multi dimensional gpu kernels --- src/apps/LTIMES.hpp | 3 ++- src/apps/LTIMES_NOVIEW.hpp | 3 ++- src/basic/NESTED_INIT.hpp | 3 ++- src/lcals/HYDRO_2D.hpp | 3 ++- src/polybench/POLYBENCH_2MM.hpp | 3 ++- src/polybench/POLYBENCH_3MM.hpp | 3 ++- src/polybench/POLYBENCH_FDTD_2D.hpp | 3 ++- src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp | 3 ++- src/polybench/POLYBENCH_GEMM.hpp | 3 ++- src/polybench/POLYBENCH_GEMVER.hpp | 3 ++- src/polybench/POLYBENCH_HEAT_3D.hpp | 3 ++- src/polybench/POLYBENCH_JACOBI_2D.hpp | 3 ++- 12 files changed, 24 insertions(+), 12 deletions(-) diff --git a/src/apps/LTIMES.hpp b/src/apps/LTIMES.hpp index c640dfe2d..7b21a111b 100644 --- a/src/apps/LTIMES.hpp +++ b/src/apps/LTIMES.hpp @@ -125,7 +125,8 @@ class LTIMES : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::list_type>; Real_ptr m_phidat; Real_ptr m_elldat; diff --git a/src/apps/LTIMES_NOVIEW.hpp b/src/apps/LTIMES_NOVIEW.hpp index 7565a2640..e341936f2 100644 --- a/src/apps/LTIMES_NOVIEW.hpp +++ b/src/apps/LTIMES_NOVIEW.hpp @@ -75,7 +75,8 @@ class LTIMES_NOVIEW : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::list_type>; Real_ptr m_phidat; Real_ptr m_elldat; diff --git a/src/basic/NESTED_INIT.hpp b/src/basic/NESTED_INIT.hpp index 880ce9549..2e711cfdd 100644 --- a/src/basic/NESTED_INIT.hpp +++ b/src/basic/NESTED_INIT.hpp @@ -67,7 +67,8 @@ class NESTED_INIT : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::list_type>; Index_type m_array_length; diff --git 
a/src/lcals/HYDRO_2D.hpp b/src/lcals/HYDRO_2D.hpp index cd62b66f9..a5f43f7f2 100644 --- a/src/lcals/HYDRO_2D.hpp +++ b/src/lcals/HYDRO_2D.hpp @@ -162,7 +162,8 @@ class HYDRO_2D : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::list_type>; Real_ptr m_za; Real_ptr m_zb; diff --git a/src/polybench/POLYBENCH_2MM.hpp b/src/polybench/POLYBENCH_2MM.hpp index 654a3e306..6b2ae7c94 100644 --- a/src/polybench/POLYBENCH_2MM.hpp +++ b/src/polybench/POLYBENCH_2MM.hpp @@ -136,7 +136,8 @@ class POLYBENCH_2MM : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::list_type>; Index_type m_ni; Index_type m_nj; diff --git a/src/polybench/POLYBENCH_3MM.hpp b/src/polybench/POLYBENCH_3MM.hpp index 195384b1c..393c24225 100644 --- a/src/polybench/POLYBENCH_3MM.hpp +++ b/src/polybench/POLYBENCH_3MM.hpp @@ -162,7 +162,8 @@ class POLYBENCH_3MM : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::list_type>; Index_type m_ni; Index_type m_nj; diff --git a/src/polybench/POLYBENCH_FDTD_2D.hpp b/src/polybench/POLYBENCH_FDTD_2D.hpp index 50e072fea..10d637690 100644 --- a/src/polybench/POLYBENCH_FDTD_2D.hpp +++ b/src/polybench/POLYBENCH_FDTD_2D.hpp @@ -122,7 +122,8 @@ class POLYBENCH_FDTD_2D : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::list_type>; Index_type m_nx; Index_type m_ny; diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp index d7c81df0a..7c4c06d81 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp @@ -85,7 +85,8 @@ class POLYBENCH_FLOYD_WARSHALL : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::list_type>; Index_type m_N; diff --git a/src/polybench/POLYBENCH_GEMM.hpp b/src/polybench/POLYBENCH_GEMM.hpp index 65bddc907..0348a5db0 100644 --- a/src/polybench/POLYBENCH_GEMM.hpp +++ b/src/polybench/POLYBENCH_GEMM.hpp @@ -108,7 +108,8 @@ class POLYBENCH_GEMM : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::list_type>; Index_type m_ni; Index_type m_nj; diff --git a/src/polybench/POLYBENCH_GEMVER.hpp b/src/polybench/POLYBENCH_GEMVER.hpp index f56ba1b02..6388d7273 100644 --- a/src/polybench/POLYBENCH_GEMVER.hpp +++ b/src/polybench/POLYBENCH_GEMVER.hpp @@ -161,7 +161,8 @@ class POLYBENCH_GEMVER : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::list_type>; Index_type m_n; Real_type m_alpha; diff --git a/src/polybench/POLYBENCH_HEAT_3D.hpp b/src/polybench/POLYBENCH_HEAT_3D.hpp index bb3189f7d..06592b5a4 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.hpp +++ b/src/polybench/POLYBENCH_HEAT_3D.hpp @@ -133,7 +133,8 @@ class POLYBENCH_HEAT_3D : public KernelBase private: static const size_t default_gpu_block_size 
= 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::list_type>; Index_type m_N; Index_type m_tsteps; diff --git a/src/polybench/POLYBENCH_JACOBI_2D.hpp b/src/polybench/POLYBENCH_JACOBI_2D.hpp index ce063836b..7bbc2dbc6 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.hpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.hpp @@ -99,7 +99,8 @@ class POLYBENCH_JACOBI_2D : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::list_type>; Index_type m_N; Index_type m_tsteps; From 043554a1550a2dd6db564121eeb3c24445dce9a4 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 25 Oct 2021 16:21:26 -0700 Subject: [PATCH 156/392] Ensure gpu warmup kernels happen Its possible to pick a gpu_block_size that causes the warmup gpu kernels not to run while allowing other gpu kernels to run. To avoid running gpu kernels without any gpu warmup, check if the gpu warmup kernels can run and use the default gpu_block_size if they can not run. --- src/common/Executor.cpp | 25 +++++++++++++++++++++---- src/common/Executor.hpp | 3 +++ src/common/RunParams.hpp | 1 + 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index f5a2a7653..12e49e9c7 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -724,10 +724,10 @@ void Executor::runSuite() vector warmup_kernels; - warmup_kernels.push_back(new basic::DAXPY(run_params)); - warmup_kernels.push_back(new basic::REDUCE3_INT(run_params)); - warmup_kernels.push_back(new algorithm::SORT(run_params)); - warmup_kernels.push_back(new apps::HALOEXCHANGE_FUSED(run_params)); + warmup_kernels.push_back(makeKernel()); + warmup_kernels.push_back(makeKernel()); + warmup_kernels.push_back(makeKernel()); + warmup_kernels.push_back(makeKernel()); for (size_t ik = 0; ik < warmup_kernels.size(); ++ik) { KernelBase* warmup_kernel = warmup_kernels[ik]; @@ -758,6 +758,23 @@ void Executor::runSuite() } +template < typename Kernel > +KernelBase* Executor::makeKernel() +{ + Kernel* kernel = new Kernel(run_params); + // check gpu block size in run_params is supported by kernel + if (!kernel->isGPUBlockSizeSupported() && + run_params.getGPUBlockSize() != 0) { + // make Kernel with default gpu block size + delete kernel; kernel = nullptr; + size_t block_size = run_params.getGPUBlockSize(); + run_params.setGPUBlockSize(0); + kernel = new Kernel(run_params); + run_params.setGPUBlockSize(block_size); + } + return kernel; +} + void Executor::runKernel(KernelBase* kern) { for (size_t iv = 0; iv < variant_ids.size(); ++iv) { diff --git a/src/common/Executor.hpp b/src/common/Executor.hpp index 9f588c7a7..ebf1cf2d1 100644 --- a/src/common/Executor.hpp +++ b/src/common/Executor.hpp @@ -58,6 +58,9 @@ class Executor std::vector variants; }; + template < typename Kernel > + KernelBase* makeKernel(); + void runKernel(KernelBase* kern); bool haveReferenceVariant() { return reference_vid < NumVariants; } diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp index aa4633b94..84a677ef2 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -98,6 +98,7 @@ class RunParams { double getSizeFactor() const { return size_factor; } size_t getGPUBlockSize() const { return gpu_block_size; } + void setGPUBlockSize(size_t block_size) { gpu_block_size = block_size; } double getPFTolerance() const { return pf_tol; } From 
d8d41895f15cb066e9f5866daf2bde8acd3bfe3b Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 28 Oct 2021 10:38:32 -0700 Subject: [PATCH 157/392] Simplify GPUUtils minorly --- src/common/GPUUtils.hpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp index b8dd68cea..1cafa20c0 100644 --- a/src/common/GPUUtils.hpp +++ b/src/common/GPUUtils.hpp @@ -47,7 +47,7 @@ bool invoke_or_helper(F) template < typename F, size_t I, size_t... Is> bool invoke_or_helper(F f) { - return f.template operator()() || invoke_or_helper(f); + return f(camp::int_seq()) || invoke_or_helper(f); } // class to get the size of a camp::int_seq @@ -124,7 +124,8 @@ struct Equals {} template < size_t block_size > - bool operator()() { return m_actual_gpu_block_size == block_size; } + bool operator()(camp::int_seq) const + { return m_actual_gpu_block_size == block_size; } private: size_t m_actual_gpu_block_size; @@ -141,7 +142,8 @@ struct RunCudaBlockSize {} template < size_t block_size > - bool operator()() { + bool operator()(camp::int_seq) const + { if (block_size == m_kernel.getActualGPUBlockSize()) { m_kernel.template runCudaVariantImpl(m_vid); return true; @@ -165,7 +167,8 @@ struct RunHipBlockSize {} template < size_t block_size > - bool operator()() { + bool operator()(camp::int_seq) const + { if (block_size == m_kernel.getActualGPUBlockSize()) { m_kernel.template runHipVariantImpl(m_vid); return true; From a1bfe902a42a83815565f051e4a150f2074ed88e Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 28 Oct 2021 13:56:01 -0700 Subject: [PATCH 158/392] Improve LTIMES gpu block shapes Use squarest y, z combination after using 32 for x. --- src/apps/LTIMES-Cuda.cpp | 10 ++-------- src/apps/LTIMES-Hip.cpp | 10 ++-------- src/apps/LTIMES.hpp | 2 +- src/apps/LTIMES_NOVIEW-Cuda.cpp | 10 ++-------- src/apps/LTIMES_NOVIEW-Hip.cpp | 10 ++-------- src/apps/LTIMES_NOVIEW.hpp | 2 +- src/common/GPUUtils.hpp | 33 +++++++++++++++++++++++++++++---- 7 files changed, 39 insertions(+), 38 deletions(-) diff --git a/src/apps/LTIMES-Cuda.cpp b/src/apps/LTIMES-Cuda.cpp index 1feedb880..1bedf7932 100644 --- a/src/apps/LTIMES-Cuda.cpp +++ b/src/apps/LTIMES-Cuda.cpp @@ -25,14 +25,8 @@ namespace apps // Define thread block shape for CUDA execution // #define m_block_sz (32) -// Note that z_block_sz = 2 is done for expedience, but -// ideally we would find g_block_sz, z_block_sz -// whole number factors of block_size / m_block_sz where -// g_block_sz * z_block_sz == block_size / m_block_sz, -// g_block_sz >= z_block_sz, and -// g_block_sz - z_block_sz is minimized -#define z_block_sz (2) -#define g_block_sz (block_size / m_block_sz / z_block_sz) +#define g_block_sz (gpu_block_size::greater_of_squarest_factor_pair(block_size/m_block_sz)) +#define z_block_sz (gpu_block_size::lesser_of_squarest_factor_pair(block_size/m_block_sz)) #define LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \ m_block_sz, g_block_sz, z_block_sz diff --git a/src/apps/LTIMES-Hip.cpp b/src/apps/LTIMES-Hip.cpp index fe2e184fd..85a46c3f1 100644 --- a/src/apps/LTIMES-Hip.cpp +++ b/src/apps/LTIMES-Hip.cpp @@ -25,14 +25,8 @@ namespace apps // Define thread block shape for Hip execution // #define m_block_sz (32) -// Note that z_block_sz = 2 is done for expedience, but -// ideally we would find g_block_sz, z_block_sz -// whole number factors of block_size / m_block_sz where -// g_block_sz * z_block_sz == block_size / m_block_sz, -// g_block_sz >= z_block_sz, and -// g_block_sz - z_block_sz is minimized 
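// A sketch of what the new shape computation below produces, assuming the
// gpu_block_size helpers added to common/GPUUtils.hpp in this patch:
// for block_size = 256, block_size / m_block_sz = 8 has squarest factor
// pair (2, 4), so g_block_sz = 4 and z_block_sz = 2 (a 32 x 4 x 2 block,
// matching the old hard-coded shape); for block_size = 512 the quotient 16
// factors as (4, 4), giving a 32 x 4 x 4 block where the old code produced
// 32 x 8 x 2.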
-#define z_block_sz (2) -#define g_block_sz (block_size / m_block_sz / z_block_sz) +#define g_block_sz (gpu_block_size::greater_of_squarest_factor_pair(block_size/m_block_sz)) +#define z_block_sz (gpu_block_size::lesser_of_squarest_factor_pair(block_size/m_block_sz)) #define LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \ m_block_sz, g_block_sz, z_block_sz diff --git a/src/apps/LTIMES.hpp b/src/apps/LTIMES.hpp index 7b21a111b..b0b308397 100644 --- a/src/apps/LTIMES.hpp +++ b/src/apps/LTIMES.hpp @@ -126,7 +126,7 @@ class LTIMES : public KernelBase private: static const size_t default_gpu_block_size = 256; using gpu_block_sizes_type = gpu_block_size::list_type>; + gpu_block_size::MultipleOf<32>>; Real_ptr m_phidat; Real_ptr m_elldat; diff --git a/src/apps/LTIMES_NOVIEW-Cuda.cpp b/src/apps/LTIMES_NOVIEW-Cuda.cpp index 0253c58a8..7849f6782 100644 --- a/src/apps/LTIMES_NOVIEW-Cuda.cpp +++ b/src/apps/LTIMES_NOVIEW-Cuda.cpp @@ -25,14 +25,8 @@ namespace apps // Define thread block shape for CUDA execution // #define m_block_sz (32) -// Note that z_block_sz = 2 is done for expedience, but -// ideally we would find g_block_sz, z_block_sz -// whole number factors of block_size / m_block_sz where -// g_block_sz * z_block_sz == block_size / m_block_sz, -// g_block_sz >= z_block_sz, and -// g_block_sz - z_block_sz is minimized -#define z_block_sz (2) -#define g_block_sz (block_size / m_block_sz / z_block_sz) +#define g_block_sz (gpu_block_size::greater_of_squarest_factor_pair(block_size/m_block_sz)) +#define z_block_sz (gpu_block_size::lesser_of_squarest_factor_pair(block_size/m_block_sz)) #define LTIMES_NOVIEW_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \ m_block_sz, g_block_sz, z_block_sz diff --git a/src/apps/LTIMES_NOVIEW-Hip.cpp b/src/apps/LTIMES_NOVIEW-Hip.cpp index 7061ff224..f420af6bb 100644 --- a/src/apps/LTIMES_NOVIEW-Hip.cpp +++ b/src/apps/LTIMES_NOVIEW-Hip.cpp @@ -25,14 +25,8 @@ namespace apps // Define thread block shape for Hip execution // #define m_block_sz (32) -// Note that z_block_sz = 2 is done for expedience, but -// ideally we would find g_block_sz, z_block_sz -// whole number factors of block_size / m_block_sz where -// g_block_sz * z_block_sz == block_size / m_block_sz, -// g_block_sz >= z_block_sz, and -// g_block_sz - z_block_sz is minimized -#define z_block_sz (2) -#define g_block_sz (block_size / m_block_sz / z_block_sz) +#define g_block_sz (gpu_block_size::greater_of_squarest_factor_pair(block_size/m_block_sz)) +#define z_block_sz (gpu_block_size::lesser_of_squarest_factor_pair(block_size/m_block_sz)) #define LTIMES_NOVIEW_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \ m_block_sz, g_block_sz, z_block_sz diff --git a/src/apps/LTIMES_NOVIEW.hpp b/src/apps/LTIMES_NOVIEW.hpp index e341936f2..a5d1fc908 100644 --- a/src/apps/LTIMES_NOVIEW.hpp +++ b/src/apps/LTIMES_NOVIEW.hpp @@ -76,7 +76,7 @@ class LTIMES_NOVIEW : public KernelBase private: static const size_t default_gpu_block_size = 256; using gpu_block_sizes_type = gpu_block_size::list_type>; + gpu_block_size::MultipleOf<32>>; Real_ptr m_phidat; Real_ptr m_elldat; diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp index 1cafa20c0..560801ace 100644 --- a/src/common/GPUUtils.hpp +++ b/src/common/GPUUtils.hpp @@ -31,10 +31,18 @@ namespace detail constexpr size_t sqrt_helper(size_t n, size_t lo, size_t hi) { return (lo == hi) - ? lo + ? lo // search complete : ((n / ((lo + hi + 1) / 2) < ((lo + hi + 1) / 2)) - ? sqrt_helper(n, lo, ((lo + hi + 1) / 2)-1) - : sqrt_helper(n, ((lo + hi + 1) / 2), hi)); + ? 
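// For context on the MultipleOf<32> constraint applied to LTIMES.hpp and
// LTIMES_NOVIEW.hpp above (a sketch, assuming camp::int_seq and the
// remove_invalid machinery added earlier in this series): with
// gpu_block_sizes configured as 64, 100, 256,
//   list_type<256, MultipleOf<32>> keeps 64 and 256 and drops 100,
// and when no block sizes are configured it falls back to a single-entry
// sequence holding the kernel's default_gpu_block_size (256 here, which
// itself passes the check).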
sqrt_helper(n, lo, ((lo + hi + 1) / 2)-1) // search lower half + : sqrt_helper(n, ((lo + hi + 1) / 2), hi)); // search upper half +} + +// implementation of lesser_of_squarest_factor_pair via linear search +constexpr size_t lesser_of_squarest_factor_pair_helper(size_t n, size_t guess) +{ + return ((n / guess) * guess == n) + ? guess // search complete, guess is a factor + : lesser_of_squarest_factor_pair_helper(n, guess - 1); // continue searching } // helpers to invoke f with each integer in the param pack @@ -100,12 +108,29 @@ struct remove_invalid> } // namespace detail -// constexpr implementation of integer sqrt +// constexpr integer sqrt constexpr size_t sqrt(size_t n) { return detail::sqrt_helper(n, 0, n/2 + 1); } +// constexpr return the lesser of the most square pair of factors of n +// ex. 12 has pairs of factors (1, 12) (2, 6) *(3, 4)* and returns 3 +constexpr size_t lesser_of_squarest_factor_pair(size_t n) +{ + return (n == 0) + ? 0 // return 0 in the 0 case + : detail::lesser_of_squarest_factor_pair_helper(n, sqrt(n)); +} +// constexpr return the greater of the most square pair of factors of n +// ex. 12 has pairs of factors (1, 12) (2, 6) *(3, 4)* and returns 4 +constexpr size_t greater_of_squarest_factor_pair(size_t n) +{ + return (n == 0) + ? 0 // return 0 in the 0 case + : n / detail::lesser_of_squarest_factor_pair_helper(n, sqrt(n)); +} + // call f's call operator with each integer as the template param in turn // stopping at the first integer that returns true. // return true if any f() returns true, otherwise return false From 925e187e363cb83afe1b97ea8e4c91b000250a26 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 2 Nov 2021 09:28:38 -0700 Subject: [PATCH 159/392] Optionally output npasses min,max,ave timing Have the option to output the min, max, and/or average of timing data using --npasses-combiner argument. Timing and speedup csv files append the combiner method to their filename and print the combiner method in the file. --- src/common/Executor.cpp | 89 +++++++++++++++++++++++++++++++++++----- src/common/Executor.hpp | 4 +- src/common/RunParams.cpp | 43 ++++++++++++++++++- src/common/RunParams.hpp | 41 ++++++++++++++++++ 4 files changed, 163 insertions(+), 14 deletions(-) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index cc32c6cf6..71907794b 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -60,9 +60,38 @@ void Executor::setupSuite() using Slist = list; using Svector = vector; + using COvector = vector; using KIDset = set; using VIDset = set; + // + // Determine which kernels to exclude from input. + // exclude_kern will be non-duplicated ordered set of IDs of kernel to exclude. 
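+  // Note: the block below parses the --npasses-combiners input rather than
+  // kernel exclusions: each requested name is matched against
+  // CombinerOptToStr() for Average/Minimum/Maximum, unrecognized names are
+  // collected as invalid, and both lists are stored back on run_params.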
+ // + const Svector& npasses_combiner_input = run_params.getNpassesCombinerOptInput(); + if ( !npasses_combiner_input.empty() ) { + + COvector combiners; + Svector invalid; + for (const std::string& combiner_name : npasses_combiner_input) { + + if (combiner_name == RunParams::CombinerOptToStr(RunParams::CombinerOpt::Average)) { + combiners.emplace_back(RunParams::CombinerOpt::Average); + } else if (combiner_name == RunParams::CombinerOptToStr(RunParams::CombinerOpt::Minimum)) { + combiners.emplace_back(RunParams::CombinerOpt::Minimum); + } else if (combiner_name == RunParams::CombinerOptToStr(RunParams::CombinerOpt::Maximum)) { + combiners.emplace_back(RunParams::CombinerOpt::Maximum); + } else { + invalid.emplace_back(combiner_name); + } + + } + + run_params.setNpassesCombinerOpts(combiners); + run_params.setInvalidNpassesCombinerOptInput(invalid); + + } + // // Determine which kernels to exclude from input. // exclude_kern will be non-duplicated ordered set of IDs of kernel to exclude. @@ -476,8 +505,12 @@ void Executor::setupSuite() // A message will be emitted later so user can sort it out... // - if ( !(run_params.getInvalidKernelInput().empty()) || - !(run_params.getInvalidExcludeKernelInput().empty()) ) { + if ( !(run_params.getInvalidNpassesCombinerOptInput().empty()) ) { + + run_params.setInputState(RunParams::BadInput); + + } else if ( !(run_params.getInvalidKernelInput().empty()) || + !(run_params.getInvalidExcludeKernelInput().empty()) ) { run_params.setInputState(RunParams::BadInput); @@ -789,12 +822,16 @@ void Executor::outputRunData() } out_fprefix = "./" + run_params.getOutputFilePrefix(); - string filename = out_fprefix + "-timing.csv"; - writeCSVReport(filename, CSVRepMode::Timing, 6 /* prec */); + string filename; + + for (RunParams::CombinerOpt combiner : run_params.getNpassesCombinerOpts()) { + filename = out_fprefix + "-timing-" + RunParams::CombinerOptToStr(combiner) + ".csv"; + writeCSVReport(filename, CSVRepMode::Timing, combiner, 6 /* prec */); - if ( haveReferenceVariant() ) { - filename = out_fprefix + "-speedup.csv"; - writeCSVReport(filename, CSVRepMode::Speedup, 3 /* prec */); + if ( haveReferenceVariant() ) { + filename = out_fprefix + "-speedup-" + RunParams::CombinerOptToStr(combiner) + ".csv"; + writeCSVReport(filename, CSVRepMode::Speedup, combiner, 3 /* prec */); + } } filename = out_fprefix + "-checksum.txt"; @@ -817,7 +854,7 @@ void Executor::outputRunData() void Executor::writeCSVReport(const string& filename, CSVRepMode mode, - size_t prec) + RunParams::CombinerOpt combiner, size_t prec) { ofstream file(filename.c_str(), ios::out | ios::trunc); if ( !file ) { @@ -847,6 +884,7 @@ void Executor::writeCSVReport(const string& filename, CSVRepMode mode, // Print title line. // file << getReportTitle(mode); + file << RunParams::CombinerOptToStr(combiner); // // Wrtie CSV file contents for report. 
@@ -885,7 +923,7 @@ void Executor::writeCSVReport(const string& filename, CSVRepMode mode, file << "Not run"; } else { file << setprecision(prec) << std::fixed - << getReportDataEntry(mode, kern, vid); + << getReportDataEntry(mode, combiner, kern, vid); } } file << endl; @@ -1251,20 +1289,49 @@ string Executor::getReportTitle(CSVRepMode mode) } long double Executor::getReportDataEntry(CSVRepMode mode, + RunParams::CombinerOpt combiner, KernelBase* kern, VariantID vid) { long double retval = 0.0; switch ( mode ) { case CSVRepMode::Timing : { - retval = kern->getTotTime(vid) / run_params.getNumPasses(); + switch ( combiner ) { + case RunParams::CombinerOpt::Average : { + retval = kern->getTotTime(vid) / run_params.getNumPasses(); + } + break; + case RunParams::CombinerOpt::Minimum : { + retval = kern->getMinTime(vid); + } + break; + case RunParams::CombinerOpt::Maximum : { + retval = kern->getMaxTime(vid); + } + break; + default : { cout << "\n Unknown CSV combiner mode = " << combiner << endl; } + } break; } case CSVRepMode::Speedup : { if ( haveReferenceVariant() ) { if ( kern->hasVariantDefined(reference_vid) && kern->hasVariantDefined(vid) ) { - retval = kern->getTotTime(reference_vid) / kern->getTotTime(vid); + switch ( combiner ) { + case RunParams::CombinerOpt::Average : { + retval = kern->getTotTime(reference_vid) / kern->getTotTime(vid); + } + break; + case RunParams::CombinerOpt::Minimum : { + retval = kern->getMinTime(reference_vid) / kern->getMinTime(vid); + } + break; + case RunParams::CombinerOpt::Maximum : { + retval = kern->getMaxTime(reference_vid) / kern->getMaxTime(vid); + } + break; + default : { cout << "\n Unknown CSV combiner mode = " << combiner << endl; } + } } else { retval = 0.0; } diff --git a/src/common/Executor.hpp b/src/common/Executor.hpp index 32e978f9a..c6fbb4830 100644 --- a/src/common/Executor.hpp +++ b/src/common/Executor.hpp @@ -63,9 +63,9 @@ class Executor void writeKernelInfoSummary(std::ostream& str, bool to_file) const; void writeCSVReport(const std::string& filename, CSVRepMode mode, - size_t prec); + RunParams::CombinerOpt combiner, size_t prec); std::string getReportTitle(CSVRepMode mode); - long double getReportDataEntry(CSVRepMode mode, + long double getReportDataEntry(CSVRepMode mode, RunParams::CombinerOpt combiner, KernelBase* kern, VariantID vid); void writeChecksumReport(const std::string& filename); diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index e038863c1..09a8326ff 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -29,6 +29,7 @@ RunParams::RunParams(int argc, char** argv) : input_state(Undefined), show_progress(false), npasses(1), + npasses_combiners(), rep_fact(1.0), size_meaning(SizeMeaning::Unset), size(0.0), @@ -48,6 +49,8 @@ RunParams::RunParams(int argc, char** argv) invalid_feature_input(), exclude_feature_input(), invalid_exclude_feature_input(), + npasses_combiner_input(), + invalid_npasses_combiner_input(), outdir(), outfile_prefix("RAJAPerf") { @@ -78,6 +81,18 @@ void RunParams::print(std::ostream& str) const { str << "\n show_progress = " << show_progress; str << "\n npasses = " << npasses; + str << "\n npasses combiners = "; + for (size_t j = 0; j < npasses_combiners.size(); ++j) { + str << "\n\t" << CombinerOptToStr(npasses_combiners[j]); + } + str << "\n npasses_combiners_input = "; + for (size_t j = 0; j < npasses_combiner_input.size(); ++j) { + str << "\n\t" << npasses_combiner_input[j]; + } + str << "\n invalid_npasses_combiners_input = "; + for (size_t j = 0; j < 
invalid_npasses_combiner_input.size(); ++j) { + str << "\n\t" << invalid_npasses_combiner_input[j]; + } str << "\n rep_fact = " << rep_fact; str << "\n size_meaning = " << SizeMeaningToStr(getSizeMeaning()); str << "\n size = " << size; @@ -215,6 +230,21 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) input_state = BadInput; } + } else if ( opt == std::string("--npasses-combiners") ) { + + bool done = false; + i++; + while ( i < argc && !done ) { + opt = std::string(argv[i]); + if ( opt.at(0) == '-' ) { + i--; + done = true; + } else { + npasses_combiner_input.push_back(opt); + ++i; + } + } + } else if ( opt == std::string("--repfact") ) { i++; @@ -467,6 +497,11 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) size_meaning = SizeMeaning::Factor; size_factor = 1.0; } + + // Default npasses_combiners if no input + if (npasses_combiner_input.empty()) { + npasses_combiners.emplace_back(CombinerOpt::Average); + } } @@ -494,7 +529,13 @@ void RunParams::printHelpMessage(std::ostream& str) const str << "\t --npasses [default is 1]\n" << "\t (num passes through Suite)\n"; str << "\t\t Example...\n" - << "\t\t --npasses 2 (runs complete Suite twice\n\n"; + << "\t\t --npasses 2 (runs complete Suite twice)\n\n"; + + str << "\t --npasses-combiners [Default is average]\n" + << "\t (Ways of combining npasses timing data into timing files)\n"; + str << "\t\t Example...\n" + << "\t\t --npasses-combiners Average Minimum Maximum (produce average, min, and\n" + << "\t\t max timing .csv files)\n\n"; str << "\t --repfact [default is 1.0]\n" << "\t (multiplier on default # reps to run each kernel)\n"; diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp index c25e58342..0ea2296ad 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -45,6 +45,29 @@ class RunParams { Undefined /*!< input not defined (yet) */ }; + /*! + * \brief Enumeration indicating state of combiner options requested + */ + enum CombinerOpt { + Average, /*!< option requesting average */ + Minimum, /*!< option requesting minimum */ + Maximum /*!< option requesting maximum */ + }; + + static std::string CombinerOptToStr(CombinerOpt co) + { + switch (co) { + case CombinerOpt::Average: + return "Average"; + case CombinerOpt::Minimum: + return "Minimum"; + case CombinerOpt::Maximum: + return "Maximum"; + default: + return "Unknown"; + } + } + /*! 
* \brief Enumeration indicating how to interpret size input */ @@ -90,6 +113,11 @@ class RunParams { double getRepFactor() const { return rep_fact; } + const std::vector& getNpassesCombinerOpts() const + { return npasses_combiners; } + void setNpassesCombinerOpts( std::vector& cvec ) + { npasses_combiners = cvec; } + SizeMeaning getSizeMeaning() const { return size_meaning; } @@ -145,6 +173,13 @@ class RunParams { const std::vector& getInvalidExcludeFeatureInput() const { return invalid_exclude_feature_input; } + const std::vector& getNpassesCombinerOptInput() const + { return npasses_combiner_input; } + const std::vector& getInvalidNpassesCombinerOptInput() const + { return invalid_npasses_combiner_input; } + void setInvalidNpassesCombinerOptInput( std::vector& svec ) + { invalid_npasses_combiner_input = svec; } + const std::string& getOutputDirName() const { return outdir; } const std::string& getOutputFilePrefix() const { return outfile_prefix; } @@ -178,6 +213,9 @@ class RunParams { int npasses; /*!< Number of passes through suite */ + std::vector npasses_combiners; /*!< Combiners to use when + outputting timer data */ + double rep_fact; /*!< pct of default kernel reps to run */ SizeMeaning size_meaning; /*!< meaning of size value */ @@ -209,6 +247,9 @@ class RunParams { std::vector exclude_feature_input; std::vector invalid_exclude_feature_input; + std::vector npasses_combiner_input; + std::vector invalid_npasses_combiner_input; + std::string outdir; /*!< Output directory name. */ std::string outfile_prefix; /*!< Prefix for output data file names. */ From 93020891b53c528977dee2b1fa1e37d155fcce03 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 2 Nov 2021 12:28:42 -0700 Subject: [PATCH 160/392] Fix cvs report titles --- src/common/Executor.cpp | 28 +++++++++++++++++++++------- src/common/Executor.hpp | 2 +- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index 71907794b..ca2ea7c0c 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -883,8 +883,7 @@ void Executor::writeCSVReport(const string& filename, CSVRepMode mode, // // Print title line. // - file << getReportTitle(mode); - file << RunParams::CombinerOptToStr(combiner); + file << getReportTitle(mode, combiner); // // Wrtie CSV file contents for report. @@ -1267,19 +1266,34 @@ void Executor::writeChecksumReport(const string& filename) } -string Executor::getReportTitle(CSVRepMode mode) +string Executor::getReportTitle(CSVRepMode mode, RunParams::CombinerOpt combiner) { string title; + switch ( combiner ) { + case RunParams::CombinerOpt::Average : { + title = string("Mean "); + } + break; + case RunParams::CombinerOpt::Minimum : { + title = string("Min "); + } + break; + case RunParams::CombinerOpt::Maximum : { + title = string("Max "); + } + break; + default : { cout << "\n Unknown CSV combiner mode = " << combiner << endl; } + } switch ( mode ) { case CSVRepMode::Timing : { - title = string("Mean Runtime Report (sec.) "); + title += string("Runtime Report (sec.) 
"); break; } case CSVRepMode::Speedup : { if ( haveReferenceVariant() ) { - title = string("Speedup Report (T_ref/T_var)") + - string(": ref var = ") + getVariantName(reference_vid) + - string(" "); + title += string("Speedup Report (T_ref/T_var)") + + string(": ref var = ") + getVariantName(reference_vid) + + string(" "); } break; } diff --git a/src/common/Executor.hpp b/src/common/Executor.hpp index c6fbb4830..6a59b5ca2 100644 --- a/src/common/Executor.hpp +++ b/src/common/Executor.hpp @@ -64,7 +64,7 @@ class Executor void writeCSVReport(const std::string& filename, CSVRepMode mode, RunParams::CombinerOpt combiner, size_t prec); - std::string getReportTitle(CSVRepMode mode); + std::string getReportTitle(CSVRepMode mode, RunParams::CombinerOpt combiner); long double getReportDataEntry(CSVRepMode mode, RunParams::CombinerOpt combiner, KernelBase* kern, VariantID vid); From d836b88ca66db04cc4fe97b20f1e34f78e2ae4d4 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 2 Nov 2021 17:32:08 -0700 Subject: [PATCH 161/392] Add Basic MPI support --- CMakeLists.txt | 17 ++++++++++++++--- src/RAJAPerfSuiteDriver.cpp | 12 ++++++++++++ src/common/RAJAPerfSuite.hpp | 1 + src/rajaperf_config.hpp.in | 13 ++++++++----- 4 files changed, 35 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4fc8c256a..c50ef5c38 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ ############################################################################### # Copyright (c) 2017-21, Lawrence Livermore National Security, LLC -# and RAJA Performance Suite project contributors. +# and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) @@ -30,6 +30,12 @@ set(BLT_CXX_STANDARD 11) include(blt/SetupBLT.cmake) +# +# Define RAJA PERFSUITE settings... +# + +cmake_dependent_option(RAJA_PERFSUITE_ENABLE_MPI "Build with MPI" On "ENABLE_MPI" Off) + # # Define RAJA settings... # @@ -75,12 +81,15 @@ set(RAJA_PERFSUITE_VERSION_PATCHLEVEL 0) set(RAJA_PERFSUITE_DEPENDS RAJA) +if (ENABLE_MPI) + list(APPEND RAJA_PERFSUITE_DEPENDS mpi) +endif() if (ENABLE_OPENMP) list(APPEND RAJA_PERFSUITE_DEPENDS openmp) endif() if (ENABLE_CUDA) list(APPEND RAJA_PERFSUITE_DEPENDS cuda) -endif() +endif() if (ENABLE_HIP) list(APPEND RAJA_PERFSUITE_DEPENDS hip) endif() @@ -109,7 +118,9 @@ endif() configure_file(${CMAKE_SOURCE_DIR}/src/rajaperf_config.hpp.in ${CMAKE_CURRENT_BINARY_DIR}/bin/rajaperf_config.hpp) -# Make sure RAJA flag propagate (we need to do some house cleaning to +include_directories($) + +# Make sure RAJA flag propagate (we need to do some house cleaning to # remove project-specific CMake variables that are no longer needed) set (CUDA_NVCC_FLAGS ${RAJA_NVCC_FLAGS}) diff --git a/src/RAJAPerfSuiteDriver.cpp b/src/RAJAPerfSuiteDriver.cpp index c47ecd9f1..da154dbd1 100644 --- a/src/RAJAPerfSuiteDriver.cpp +++ b/src/RAJAPerfSuiteDriver.cpp @@ -10,9 +10,17 @@ #include +#ifdef RAJA_PERFSUITE_ENABLE_MPI +#include +#endif + //------------------------------------------------------------------------------ int main( int argc, char** argv ) { +#ifdef RAJA_PERFSUITE_ENABLE_MPI + MPI_Init(&argc, &argv); +#endif + // STEP 1: Create suite executor object rajaperf::Executor executor(argc, argv); @@ -31,5 +39,9 @@ int main( int argc, char** argv ) std::cout << "\n\nDONE!!!...." 
<< std::endl; +#ifdef RAJA_PERFSUITE_ENABLE_MPI + MPI_Finalize(); +#endif + return 0; } diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index ca4f10f1d..4e4d38a66 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -14,6 +14,7 @@ #define RAJAPerfSuite_HPP #include "RAJA/config.hpp" +#include "rajaperf_config.hpp" #include diff --git a/src/rajaperf_config.hpp.in b/src/rajaperf_config.hpp.in index c34f9120c..808993af4 100644 --- a/src/rajaperf_config.hpp.in +++ b/src/rajaperf_config.hpp.in @@ -24,10 +24,12 @@ #include +#cmakedefine RAJA_PERFSUITE_ENABLE_MPI + namespace rajaperf { struct configuration { - +#if 0 // Version of RAJA Perf Suite (ex: 0.1.0) static const std::string perfsuite_version = "@RAJA_PERFSUITE_VERSION_MAJOR@" + std::string(".") + @@ -35,7 +37,7 @@ static const std::string perfsuite_version = "@RAJA_PERFSUITE_VERSION_PATCHLEVEL@"; // Version of RAJA used to build (ex: 0.2.4) -static const std::string raja_version = +static const std::string raja_version = std::to_string(RAJA::RAJA_VERSION_MAJOR) + std::string(".") + std::to_string(RAJA::RAJA_VERSION_MINOR) + std::string(".") + std::to_string(RAJA::RAJA_VERSION_PATCH_LEVEL); @@ -43,13 +45,14 @@ std::to_string(RAJA::RAJA_VERSION_PATCH_LEVEL); // Systype and machine code was built on (ex: chaos_5_x64_64, rzhasgpu18) static const std::string systype_build = "@RAJAPERF_BUILD_SYSTYPE@"; static const std::string machine_build = "@RAJAPERF_BUILD_HOST@"; - + // Compiler used to build (ex: gcc-4.9.3) static const std::string compiler = "@RAJAPERF_COMPILER@"; // Command options used to build (ex: -Ofast -mavx) static const std::string compiler_options = "@RAJAPERF_COMPILER_OPTIONS@"; - +#endif + // Name of user who ran code std::string user_run; @@ -59,7 +62,7 @@ std::string date_run; // Systype and machine code ran on (ex: chaos_5_x64_64) std::string systype_run; std::string machine_run; - + }; } // closing brace for rajaperf namespace From 530163abfa86210ae3bc9840a49299c6143ee1d7 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 2 Nov 2021 17:32:36 -0700 Subject: [PATCH 162/392] Add mpi example build script --- .../lc-builds/blueos_spectrum_nvcc_clang.sh | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100755 scripts/lc-builds/blueos_spectrum_nvcc_clang.sh diff --git a/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh b/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh new file mode 100755 index 000000000..33b173b31 --- /dev/null +++ b/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh @@ -0,0 +1,69 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJAPerf/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +if [[ $# -lt 4 ]]; then + echo + echo "You must pass 4 arguments to the script (in this order): " + echo " 1) compiler version number for spectrum mpi" + echo " 2) compiler version number for nvcc" + echo " 3) CUDA compute architecture" + echo " 4) compiler version number for clang. 
" + echo + echo "For example: " + echo " blueos_nvcc_clang.sh rolling-release 10.2.89 sm_70 10.0.1" + exit +fi + +COMP_MPI_VER=$1 +COMP_NVCC_VER=$2 +COMP_ARCH=$3 +COMP_CLANG_VER=$4 +shift 4 + +BUILD_SUFFIX=lc_blueos-spectrum${COMP_MPI_VER}-nvcc${COMP_NVCC_VER}-${COMP_ARCH}-clang${COMP_CLANG_VER} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/blueos/nvcc_clang_X.cmake + +echo +echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" +echo + +rm -rf build_${BUILD_SUFFIX} >/dev/null +mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} + +module load cmake/3.14.5 + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DMPI_CXX_COMPILER=/usr/tce/packages/spectrum-mpi/spectrum-mpi-${COMP_MPI_VER}-clang-${COMP_CLANG_VER}/bin/mpiclang++ \ + -DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-${COMP_CLANG_VER}/bin/clang++ \ + -DBLT_CXX_STD=c++11 \ + -C ${RAJA_HOSTCONFIG} \ + -DENABLE_MPI=On \ + -DENABLE_OPENMP=On \ + -DENABLE_CUDA=On \ + -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ + -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ + -DCUDA_ARCH=${COMP_ARCH} \ + -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ + "$@" \ + .. + +echo +echo "***********************************************************************" +echo +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" +echo +echo " Please note that you have to disable CUDA GPU hooks when you run" +echo " the RAJA Perf Suite; for example," +echo +echo " lrun -n4 ./bin/raja-perf.exe" +echo +echo "***********************************************************************" From faf693cc69292d171fea3a0570de50048df63262 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 2 Nov 2021 17:37:26 -0700 Subject: [PATCH 163/392] Handle screen output Now only rank 0 prints --- src/RAJAPerfSuiteDriver.cpp | 12 +++- src/algorithm/SORT-Cuda.cpp | 2 +- src/algorithm/SORT-Hip.cpp | 2 +- src/algorithm/SORT-OMP.cpp | 4 +- src/algorithm/SORT-Seq.cpp | 2 +- src/algorithm/SORT.hpp | 2 +- src/algorithm/SORTPAIRS-Cuda.cpp | 2 +- src/algorithm/SORTPAIRS-Hip.cpp | 2 +- src/algorithm/SORTPAIRS-OMP.cpp | 4 +- src/algorithm/SORTPAIRS-Seq.cpp | 2 +- src/algorithm/SORTPAIRS.hpp | 2 +- src/apps/AppsData.cpp | 13 +++-- src/apps/DEL_DOT_VEC_2D-Cuda.cpp | 2 +- src/apps/DEL_DOT_VEC_2D-Hip.cpp | 2 +- src/apps/DEL_DOT_VEC_2D-OMP.cpp | 8 +-- src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp | 10 ++-- src/apps/DEL_DOT_VEC_2D-Seq.cpp | 12 ++-- src/apps/DIFFUSION3DPA-Cuda.cpp | 2 +- src/apps/DIFFUSION3DPA-Hip.cpp | 2 +- src/apps/DIFFUSION3DPA-OMP.cpp | 2 +- src/apps/DIFFUSION3DPA-OMPTarget.cpp | 2 +- src/apps/DIFFUSION3DPA-Seq.cpp | 2 +- src/apps/ENERGY-Cuda.cpp | 2 +- src/apps/ENERGY-Hip.cpp | 2 +- src/apps/ENERGY-OMP.cpp | 18 +++--- src/apps/ENERGY-OMPTarget.cpp | 8 +-- src/apps/ENERGY-Seq.cpp | 12 ++-- src/apps/FIR-Cuda.cpp | 2 +- src/apps/FIR-Hip.cpp | 2 +- src/apps/FIR-OMP.cpp | 8 +-- src/apps/FIR-OMPTarget.cpp | 4 +- src/apps/FIR-Seq.cpp | 10 ++-- src/apps/HALOEXCHANGE-Cuda.cpp | 2 +- src/apps/HALOEXCHANGE-Hip.cpp | 2 +- src/apps/HALOEXCHANGE-OMP.cpp | 4 +- src/apps/HALOEXCHANGE-OMPTarget.cpp | 2 +- src/apps/HALOEXCHANGE-Seq.cpp | 2 +- src/apps/HALOEXCHANGE_FUSED-Cuda.cpp | 2 +- src/apps/HALOEXCHANGE_FUSED-Hip.cpp | 2 +- src/apps/HALOEXCHANGE_FUSED-OMP.cpp | 2 +- src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp | 2 +- src/apps/HALOEXCHANGE_FUSED-Seq.cpp | 2 +- src/apps/LTIMES-Cuda.cpp | 12 ++-- src/apps/LTIMES-Hip.cpp | 8 +-- 
src/apps/LTIMES-OMP.cpp | 16 ++--- src/apps/LTIMES-OMPTarget.cpp | 8 +-- src/apps/LTIMES-Seq.cpp | 18 +++--- src/apps/LTIMES_NOVIEW-Cuda.cpp | 2 +- src/apps/LTIMES_NOVIEW-Hip.cpp | 14 ++--- src/apps/LTIMES_NOVIEW-OMP.cpp | 12 ++-- src/apps/LTIMES_NOVIEW-OMPTarget.cpp | 4 +- src/apps/LTIMES_NOVIEW-Seq.cpp | 12 ++-- src/apps/MASS3DPA-Cuda.cpp | 6 +- src/apps/MASS3DPA-Hip.cpp | 4 +- src/apps/MASS3DPA-OMP.cpp | 10 ++-- src/apps/MASS3DPA-OMPTarget.cpp | 2 +- src/apps/MASS3DPA-Seq.cpp | 4 +- src/apps/PRESSURE-Cuda.cpp | 2 +- src/apps/PRESSURE-Hip.cpp | 2 +- src/apps/PRESSURE-OMP.cpp | 8 +-- src/apps/PRESSURE-OMPTarget.cpp | 12 ++-- src/apps/PRESSURE-Seq.cpp | 10 ++-- src/apps/VOL3D-Cuda.cpp | 2 +- src/apps/VOL3D-Hip.cpp | 2 +- src/apps/VOL3D-OMP.cpp | 8 +-- src/apps/VOL3D-OMPTarget.cpp | 10 ++-- src/apps/VOL3D-Seq.cpp | 8 +-- src/apps/WIP-COUPLE.cpp | 2 +- src/basic/DAXPY-Cuda.cpp | 2 +- src/basic/DAXPY-Hip.cpp | 2 +- src/basic/DAXPY-OMP.cpp | 6 +- src/basic/DAXPY-OMPTarget.cpp | 6 +- src/basic/DAXPY-Seq.cpp | 4 +- src/basic/IF_QUAD-Cuda.cpp | 2 +- src/basic/IF_QUAD-Hip.cpp | 2 +- src/basic/IF_QUAD-OMP.cpp | 6 +- src/basic/IF_QUAD-OMPTarget.cpp | 6 +- src/basic/IF_QUAD-Seq.cpp | 6 +- src/basic/INIT3-Cuda.cpp | 2 +- src/basic/INIT3-Hip.cpp | 2 +- src/basic/INIT3-OMP.cpp | 6 +- src/basic/INIT3-OMPTarget.cpp | 6 +- src/basic/INIT3-Seq.cpp | 6 +- src/basic/INIT_VIEW1D-Cuda.cpp | 2 +- src/basic/INIT_VIEW1D-Hip.cpp | 2 +- src/basic/INIT_VIEW1D-OMP.cpp | 4 +- src/basic/INIT_VIEW1D-OMPTarget.cpp | 8 +-- src/basic/INIT_VIEW1D-Seq.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET-Hip.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET-OMP.cpp | 6 +- src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp | 4 +- src/basic/INIT_VIEW1D_OFFSET-Seq.cpp | 4 +- src/basic/MAT_MAT_SHARED-Cuda.cpp | 22 +++---- src/basic/MAT_MAT_SHARED-Hip.cpp | 6 +- src/basic/MAT_MAT_SHARED-OMP.cpp | 12 ++-- src/basic/MAT_MAT_SHARED-OMPTarget.cpp | 2 +- src/basic/MAT_MAT_SHARED-Seq.cpp | 10 ++-- src/basic/MULADDSUB-Cuda.cpp | 2 +- src/basic/MULADDSUB-Hip.cpp | 2 +- src/basic/MULADDSUB-OMP.cpp | 6 +- src/basic/MULADDSUB-OMPTarget.cpp | 4 +- src/basic/MULADDSUB-Seq.cpp | 4 +- src/basic/NESTED_INIT-Cuda.cpp | 14 ++--- src/basic/NESTED_INIT-Hip.cpp | 12 ++-- src/basic/NESTED_INIT-OMP.cpp | 10 ++-- src/basic/NESTED_INIT-OMPTarget.cpp | 12 ++-- src/basic/NESTED_INIT-Seq.cpp | 6 +- src/basic/PI_ATOMIC-Cuda.cpp | 2 +- src/basic/PI_ATOMIC-Hip.cpp | 2 +- src/basic/PI_ATOMIC-OMP.cpp | 10 ++-- src/basic/PI_ATOMIC-OMPTarget.cpp | 10 ++-- src/basic/PI_ATOMIC-Seq.cpp | 8 +-- src/basic/PI_REDUCE-Cuda.cpp | 10 ++-- src/basic/PI_REDUCE-Hip.cpp | 8 +-- src/basic/PI_REDUCE-OMP.cpp | 16 ++--- src/basic/PI_REDUCE-OMPTarget.cpp | 14 ++--- src/basic/PI_REDUCE-Seq.cpp | 10 ++-- src/basic/REDUCE3_INT-Cuda.cpp | 2 +- src/basic/REDUCE3_INT-Hip.cpp | 2 +- src/basic/REDUCE3_INT-OMP.cpp | 10 ++-- src/basic/REDUCE3_INT-OMPTarget.cpp | 4 +- src/basic/REDUCE3_INT-Seq.cpp | 4 +- src/basic/TRAP_INT-Cuda.cpp | 2 +- src/basic/TRAP_INT-Hip.cpp | 2 +- src/basic/TRAP_INT-OMP.cpp | 6 +- src/basic/TRAP_INT-OMPTarget.cpp | 10 ++-- src/basic/TRAP_INT-Seq.cpp | 4 +- src/common/DataUtils.cpp | 46 +++++++-------- src/common/Executor.cpp | 40 ++++++------- src/common/KernelBase.cpp | 58 +++++++++---------- src/common/OutputUtils.cpp | 7 ++- src/common/RAJAPerfSuite.cpp | 51 ++++++++++++++-- src/common/RAJAPerfSuite.hpp | 46 ++++++++++++--- src/common/RunParams.cpp | 36 ++++++------ src/lcals/DIFF_PREDICT-Cuda.cpp | 2 +- src/lcals/DIFF_PREDICT-Hip.cpp | 2 +- 
src/lcals/DIFF_PREDICT-OMP.cpp | 6 +- src/lcals/DIFF_PREDICT-OMPTarget.cpp | 6 +- src/lcals/DIFF_PREDICT-Seq.cpp | 6 +- src/lcals/EOS-Cuda.cpp | 2 +- src/lcals/EOS-Hip.cpp | 2 +- src/lcals/EOS-OMP.cpp | 6 +- src/lcals/EOS-OMPTarget.cpp | 10 ++-- src/lcals/EOS-Seq.cpp | 4 +- src/lcals/FIRST_DIFF-Cuda.cpp | 2 +- src/lcals/FIRST_DIFF-Hip.cpp | 2 +- src/lcals/FIRST_DIFF-OMP.cpp | 6 +- src/lcals/FIRST_DIFF-OMPTarget.cpp | 20 +++---- src/lcals/FIRST_DIFF-Seq.cpp | 4 +- src/lcals/FIRST_MIN-Cuda.cpp | 2 +- src/lcals/FIRST_MIN-Hip.cpp | 2 +- src/lcals/FIRST_MIN-OMP.cpp | 10 ++-- src/lcals/FIRST_MIN-OMPTarget.cpp | 16 ++--- src/lcals/FIRST_MIN-Seq.cpp | 4 +- src/lcals/FIRST_SUM-Cuda.cpp | 2 +- src/lcals/FIRST_SUM-Hip.cpp | 2 +- src/lcals/FIRST_SUM-OMP.cpp | 6 +- src/lcals/FIRST_SUM-OMPTarget.cpp | 20 +++---- src/lcals/FIRST_SUM-Seq.cpp | 4 +- src/lcals/GEN_LIN_RECUR-Cuda.cpp | 2 +- src/lcals/GEN_LIN_RECUR-Hip.cpp | 2 +- src/lcals/GEN_LIN_RECUR-OMP.cpp | 6 +- src/lcals/GEN_LIN_RECUR-OMPTarget.cpp | 8 +-- src/lcals/GEN_LIN_RECUR-Seq.cpp | 4 +- src/lcals/HYDRO_1D-Cuda.cpp | 2 +- src/lcals/HYDRO_1D-Hip.cpp | 2 +- src/lcals/HYDRO_1D-OMP.cpp | 6 +- src/lcals/HYDRO_1D-OMPTarget.cpp | 6 +- src/lcals/HYDRO_1D-Seq.cpp | 4 +- src/lcals/HYDRO_2D-Cuda.cpp | 10 ++-- src/lcals/HYDRO_2D-Hip.cpp | 8 +-- src/lcals/HYDRO_2D-OMP.cpp | 14 ++--- src/lcals/HYDRO_2D-OMPTarget.cpp | 10 ++-- src/lcals/HYDRO_2D-Seq.cpp | 10 ++-- src/lcals/INT_PREDICT-Cuda.cpp | 2 +- src/lcals/INT_PREDICT-Hip.cpp | 2 +- src/lcals/INT_PREDICT-OMP.cpp | 6 +- src/lcals/INT_PREDICT-OMPTarget.cpp | 12 ++-- src/lcals/INT_PREDICT-Seq.cpp | 4 +- src/lcals/PLANCKIAN-Cuda.cpp | 2 +- src/lcals/PLANCKIAN-Hip.cpp | 2 +- src/lcals/PLANCKIAN-OMP.cpp | 6 +- src/lcals/PLANCKIAN-OMPTarget.cpp | 16 ++--- src/lcals/PLANCKIAN-Seq.cpp | 4 +- src/lcals/TRIDIAG_ELIM-Cuda.cpp | 2 +- src/lcals/TRIDIAG_ELIM-Hip.cpp | 2 +- src/lcals/TRIDIAG_ELIM-OMP.cpp | 6 +- src/lcals/TRIDIAG_ELIM-OMPTarget.cpp | 10 ++-- src/lcals/TRIDIAG_ELIM-Seq.cpp | 4 +- src/polybench/POLYBENCH_2MM-Cuda.cpp | 4 +- src/polybench/POLYBENCH_2MM-Hip.cpp | 10 ++-- src/polybench/POLYBENCH_2MM-OMP.cpp | 18 +++--- src/polybench/POLYBENCH_2MM-OMPTarget.cpp | 14 ++--- src/polybench/POLYBENCH_2MM-Seq.cpp | 22 +++---- src/polybench/POLYBENCH_3MM-Cuda.cpp | 2 +- src/polybench/POLYBENCH_3MM-Hip.cpp | 12 ++-- src/polybench/POLYBENCH_3MM-OMP.cpp | 20 +++---- src/polybench/POLYBENCH_3MM-OMPTarget.cpp | 32 +++++----- src/polybench/POLYBENCH_3MM-Seq.cpp | 14 ++--- src/polybench/POLYBENCH_ADI-Cuda.cpp | 2 +- src/polybench/POLYBENCH_ADI-Hip.cpp | 6 +- src/polybench/POLYBENCH_ADI-OMP.cpp | 14 ++--- src/polybench/POLYBENCH_ADI-OMPTarget.cpp | 20 +++---- src/polybench/POLYBENCH_ADI-Seq.cpp | 16 ++--- src/polybench/POLYBENCH_ATAX-Cuda.cpp | 2 +- src/polybench/POLYBENCH_ATAX-Hip.cpp | 6 +- src/polybench/POLYBENCH_ATAX-OMP.cpp | 20 +++---- src/polybench/POLYBENCH_ATAX-OMPTarget.cpp | 8 +-- src/polybench/POLYBENCH_ATAX-Seq.cpp | 18 +++--- src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp | 6 +- src/polybench/POLYBENCH_FDTD_2D-Hip.cpp | 20 +++---- src/polybench/POLYBENCH_FDTD_2D-OMP.cpp | 8 +-- src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp | 8 +-- src/polybench/POLYBENCH_FDTD_2D-Seq.cpp | 12 ++-- .../POLYBENCH_FLOYD_WARSHALL-Cuda.cpp | 8 +-- .../POLYBENCH_FLOYD_WARSHALL-Hip.cpp | 6 +- .../POLYBENCH_FLOYD_WARSHALL-OMP.cpp | 18 +++--- .../POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp | 8 +-- .../POLYBENCH_FLOYD_WARSHALL-Seq.cpp | 20 +++---- src/polybench/POLYBENCH_GEMM-Cuda.cpp | 4 +- src/polybench/POLYBENCH_GEMM-Hip.cpp | 10 ++-- 
src/polybench/POLYBENCH_GEMM-OMP.cpp | 10 ++-- src/polybench/POLYBENCH_GEMM-OMPTarget.cpp | 12 ++-- src/polybench/POLYBENCH_GEMM-Seq.cpp | 10 ++-- src/polybench/POLYBENCH_GEMVER-Cuda.cpp | 4 +- src/polybench/POLYBENCH_GEMVER-Hip.cpp | 14 ++--- src/polybench/POLYBENCH_GEMVER-OMP.cpp | 14 ++--- src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp | 14 ++--- src/polybench/POLYBENCH_GEMVER-Seq.cpp | 28 ++++----- src/polybench/POLYBENCH_GESUMMV-Cuda.cpp | 2 +- src/polybench/POLYBENCH_GESUMMV-Hip.cpp | 4 +- src/polybench/POLYBENCH_GESUMMV-OMP.cpp | 10 ++-- src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp | 8 +-- src/polybench/POLYBENCH_GESUMMV-Seq.cpp | 12 ++-- src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp | 2 +- src/polybench/POLYBENCH_HEAT_3D-Hip.cpp | 6 +- src/polybench/POLYBENCH_HEAT_3D-OMP.cpp | 10 ++-- src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp | 6 +- src/polybench/POLYBENCH_HEAT_3D-Seq.cpp | 22 +++---- src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp | 8 +-- .../POLYBENCH_JACOBI_1D-OMPTarget.cpp | 10 ++-- src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp | 14 ++--- src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp | 4 +- src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp | 6 +- src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp | 18 +++--- .../POLYBENCH_JACOBI_2D-OMPTarget.cpp | 16 ++--- src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp | 14 ++--- src/polybench/POLYBENCH_MVT-Cuda.cpp | 2 +- src/polybench/POLYBENCH_MVT-Hip.cpp | 2 +- src/polybench/POLYBENCH_MVT-OMP.cpp | 24 ++++---- src/polybench/POLYBENCH_MVT-OMPTarget.cpp | 8 +-- src/polybench/POLYBENCH_MVT-Seq.cpp | 34 +++++------ src/stream/ADD-Cuda.cpp | 2 +- src/stream/ADD-Hip.cpp | 2 +- src/stream/ADD-OMP.cpp | 10 ++-- src/stream/ADD-OMPTarget.cpp | 2 +- src/stream/ADD-Seq.cpp | 8 +-- src/stream/COPY-Cuda.cpp | 2 +- src/stream/COPY-Hip.cpp | 2 +- src/stream/COPY-OMP.cpp | 6 +- src/stream/COPY-OMPTarget.cpp | 4 +- src/stream/COPY-Seq.cpp | 4 +- src/stream/DOT-Cuda.cpp | 2 +- src/stream/DOT-Hip.cpp | 2 +- src/stream/DOT-OMP.cpp | 6 +- src/stream/DOT-OMPTarget.cpp | 4 +- src/stream/DOT-Seq.cpp | 4 +- src/stream/MUL-Cuda.cpp | 2 +- src/stream/MUL-Hip.cpp | 2 +- src/stream/MUL-OMP.cpp | 6 +- src/stream/MUL-OMPTarget.cpp | 4 +- src/stream/MUL-Seq.cpp | 4 +- src/stream/TRIAD-Cuda.cpp | 2 +- src/stream/TRIAD-Hip.cpp | 2 +- src/stream/TRIAD-OMP.cpp | 6 +- src/stream/TRIAD-OMPTarget.cpp | 2 +- src/stream/TRIAD-Seq.cpp | 4 +- 280 files changed, 1129 insertions(+), 1052 deletions(-) diff --git a/src/RAJAPerfSuiteDriver.cpp b/src/RAJAPerfSuiteDriver.cpp index da154dbd1..70cbd604c 100644 --- a/src/RAJAPerfSuiteDriver.cpp +++ b/src/RAJAPerfSuiteDriver.cpp @@ -21,15 +21,21 @@ int main( int argc, char** argv ) MPI_Init(&argc, &argv); #endif +#ifdef RAJA_PERFSUITE_ENABLE_MPI + int num_ranks; + MPI_Comm_size(MPI_COMM_WORLD, &num_ranks); + rajaperf::getCout() << "\n\nRunning with " << num_ranks << " MPI ranks..." << std::endl; +#endif + // STEP 1: Create suite executor object rajaperf::Executor executor(argc, argv); // STEP 2: Assemble kernels and variants to run executor.setupSuite(); - // STEP 3: Report suite run summary + // STEP 3: Report suite run summary // (enable users to catch errors before entire suite is run) - executor.reportRunSummary(std::cout); + executor.reportRunSummary(rajaperf::getCout()); // STEP 4: Execute suite executor.runSuite(); @@ -37,7 +43,7 @@ int main( int argc, char** argv ) // STEP 5: Generate suite execution reports executor.outputRunData(); - std::cout << "\n\nDONE!!!...." 
<< std::endl; + rajaperf::getCout() << "\n\nDONE!!!...." << std::endl; #ifdef RAJA_PERFSUITE_ENABLE_MPI MPI_Finalize(); diff --git a/src/algorithm/SORT-Cuda.cpp b/src/algorithm/SORT-Cuda.cpp index 8bdd212de..5870e08b5 100644 --- a/src/algorithm/SORT-Cuda.cpp +++ b/src/algorithm/SORT-Cuda.cpp @@ -58,7 +58,7 @@ void SORT::runCudaVariant(VariantID vid) SORT_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n SORT : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n SORT : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/algorithm/SORT-Hip.cpp b/src/algorithm/SORT-Hip.cpp index c551aeac6..edf143cbc 100644 --- a/src/algorithm/SORT-Hip.cpp +++ b/src/algorithm/SORT-Hip.cpp @@ -58,7 +58,7 @@ void SORT::runHipVariant(VariantID vid) SORT_DATA_TEARDOWN_HIP; } else { - std::cout << "\n SORT : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n SORT : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/algorithm/SORT-OMP.cpp b/src/algorithm/SORT-OMP.cpp index 534c1edd3..a83d956ed 100644 --- a/src/algorithm/SORT-OMP.cpp +++ b/src/algorithm/SORT-OMP.cpp @@ -44,12 +44,12 @@ void SORT::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n SORT : Unknown variant id = " << vid << std::endl; + getCout() << "\n SORT : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/algorithm/SORT-Seq.cpp b/src/algorithm/SORT-Seq.cpp index e6dcc48a4..4f7094ba6 100644 --- a/src/algorithm/SORT-Seq.cpp +++ b/src/algorithm/SORT-Seq.cpp @@ -57,7 +57,7 @@ void SORT::runSeqVariant(VariantID vid) #endif default : { - std::cout << "\n SORT : Unknown variant id = " << vid << std::endl; + getCout() << "\n SORT : Unknown variant id = " << vid << std::endl; } } diff --git a/src/algorithm/SORT.hpp b/src/algorithm/SORT.hpp index f576bee97..b266f0d9e 100644 --- a/src/algorithm/SORT.hpp +++ b/src/algorithm/SORT.hpp @@ -52,7 +52,7 @@ class SORT : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid) { - std::cout << "\n SORT : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n SORT : Unknown OMP Target variant id = " << vid << std::endl; } private: diff --git a/src/algorithm/SORTPAIRS-Cuda.cpp b/src/algorithm/SORTPAIRS-Cuda.cpp index 7402e880a..aba1111dc 100644 --- a/src/algorithm/SORTPAIRS-Cuda.cpp +++ b/src/algorithm/SORTPAIRS-Cuda.cpp @@ -61,7 +61,7 @@ void SORTPAIRS::runCudaVariant(VariantID vid) SORTPAIRS_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n SORTPAIRS : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n SORTPAIRS : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/algorithm/SORTPAIRS-Hip.cpp b/src/algorithm/SORTPAIRS-Hip.cpp index c1a2a54c4..0850ce650 100644 --- a/src/algorithm/SORTPAIRS-Hip.cpp +++ b/src/algorithm/SORTPAIRS-Hip.cpp @@ -61,7 +61,7 @@ void SORTPAIRS::runHipVariant(VariantID vid) SORTPAIRS_DATA_TEARDOWN_HIP; } else { - std::cout << "\n SORTPAIRS : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n SORTPAIRS : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/algorithm/SORTPAIRS-OMP.cpp b/src/algorithm/SORTPAIRS-OMP.cpp index 8cb5a90dc..99a432931 100644 --- a/src/algorithm/SORTPAIRS-OMP.cpp +++ b/src/algorithm/SORTPAIRS-OMP.cpp @@ -44,12 +44,12 @@ void SORTPAIRS::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n SORTPAIRS : Unknown variant id = " << vid << std::endl; + getCout() << "\n SORTPAIRS : Unknown variant id = " << vid << 
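// The getCout() helper used below comes from the common/RAJAPerfSuite
// changes listed in this patch's file summary. A minimal sketch of a
// rank-aware stream with the "only rank 0 prints" behavior described in the
// commit message, assuming MPI and a std::ostream& return type (the actual
// definition may differ):
//
//   #include <fstream>
//   #include <iostream>
//
//   std::ostream& getCout()
//   {
//     static std::ofstream discard;  // never opened, so writes are dropped
//     int rank = 0;
//   #ifdef RAJA_PERFSUITE_ENABLE_MPI
//     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
//   #endif
//     return (rank == 0) ? std::cout
//                        : static_cast<std::ostream&>(discard);
//   }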
std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/algorithm/SORTPAIRS-Seq.cpp b/src/algorithm/SORTPAIRS-Seq.cpp index 6131c9649..f1f9928b6 100644 --- a/src/algorithm/SORTPAIRS-Seq.cpp +++ b/src/algorithm/SORTPAIRS-Seq.cpp @@ -77,7 +77,7 @@ void SORTPAIRS::runSeqVariant(VariantID vid) #endif default : { - std::cout << "\n SORTPAIRS : Unknown variant id = " << vid << std::endl; + getCout() << "\n SORTPAIRS : Unknown variant id = " << vid << std::endl; } } diff --git a/src/algorithm/SORTPAIRS.hpp b/src/algorithm/SORTPAIRS.hpp index b6f03005f..348b548bd 100644 --- a/src/algorithm/SORTPAIRS.hpp +++ b/src/algorithm/SORTPAIRS.hpp @@ -51,7 +51,7 @@ class SORTPAIRS : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid) { - std::cout << "\n SORTPAIRS : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n SORTPAIRS : Unknown OMP Target variant id = " << vid << std::endl; } private: diff --git a/src/apps/AppsData.cpp b/src/apps/AppsData.cpp index 52231d8b9..26ccdb7d1 100644 --- a/src/apps/AppsData.cpp +++ b/src/apps/AppsData.cpp @@ -6,6 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +#include "common/RAJAPerfSuite.hpp" #include "AppsData.hpp" #include @@ -23,7 +24,7 @@ void setMeshPositions_2d(Real_ptr x, Real_type dx, const ADomain& domain) { if (domain.ndims != 2) { - std::cout << "\n******* ERROR!!! domain is not 2d *******" << std::endl; + getCout() << "\n******* ERROR!!! domain is not 2d *******" << std::endl; return; } @@ -34,8 +35,8 @@ void setMeshPositions_2d(Real_ptr x, Real_type dx, Index_type jp = domain.jp; - Index_type npnl = domain.NPNL; - Index_type npnr = domain.NPNR; + Index_type npnl = domain.NPNL; + Index_type npnr = domain.NPNR; Real_ptr x1, x2, x3, x4; Real_ptr y1, y2, y3, y4; @@ -66,7 +67,7 @@ void setMeshPositions_3d(Real_ptr x, Real_type dx, const ADomain& domain) { if (domain.ndims != 3) { - std::cout << "\n******* ERROR!!! domain is not 3d *******" << std::endl; + getCout() << "\n******* ERROR!!! 
domain is not 3d *******" << std::endl; return; } @@ -80,8 +81,8 @@ void setMeshPositions_3d(Real_ptr x, Real_type dx, Index_type jp = domain.jp; Index_type kp = domain.kp; - Index_type npnl = domain.NPNL; - Index_type npnr = domain.NPNR; + Index_type npnl = domain.NPNL; + Index_type npnr = domain.NPNR; Real_ptr x0, x1, x2, x3, x4, x5, x6, x7; Real_ptr y0, y1, y2, y3, y4, y5, y6, y7; diff --git a/src/apps/DEL_DOT_VEC_2D-Cuda.cpp b/src/apps/DEL_DOT_VEC_2D-Cuda.cpp index 82973db44..894b03bdb 100644 --- a/src/apps/DEL_DOT_VEC_2D-Cuda.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Cuda.cpp @@ -161,7 +161,7 @@ void DEL_DOT_VEC_2D::runCudaVariant(VariantID vid) DEL_DOT_VEC_2D_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n DEL_DOT_VEC_2D : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n DEL_DOT_VEC_2D : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/apps/DEL_DOT_VEC_2D-Hip.cpp b/src/apps/DEL_DOT_VEC_2D-Hip.cpp index ba97858c8..25a290abf 100644 --- a/src/apps/DEL_DOT_VEC_2D-Hip.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Hip.cpp @@ -163,7 +163,7 @@ void DEL_DOT_VEC_2D::runHipVariant(VariantID vid) DEL_DOT_VEC_2D_DATA_TEARDOWN_HIP; } else { - std::cout << "\n DEL_DOT_VEC_2D : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n DEL_DOT_VEC_2D : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/apps/DEL_DOT_VEC_2D-OMP.cpp b/src/apps/DEL_DOT_VEC_2D-OMP.cpp index 75e640459..4232646ad 100644 --- a/src/apps/DEL_DOT_VEC_2D-OMP.cpp +++ b/src/apps/DEL_DOT_VEC_2D-OMP.cpp @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace apps { @@ -44,7 +44,7 @@ void DEL_DOT_VEC_2D::runOpenMPVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - #pragma omp parallel for + #pragma omp parallel for for (Index_type ii = ibegin ; ii < iend ; ++ii ) { DEL_DOT_VEC_2D_BODY_INDEX; DEL_DOT_VEC_2D_BODY; @@ -100,12 +100,12 @@ void DEL_DOT_VEC_2D::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n DEL_DOT_VEC_2D : Unknown variant id = " << vid << std::endl; + getCout() << "\n DEL_DOT_VEC_2D : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp b/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp index b9196680d..9be35bbc6 100644 --- a/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp +++ b/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp @@ -20,7 +20,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace apps { @@ -62,7 +62,7 @@ void DEL_DOT_VEC_2D::runOpenMPTargetVariant(VariantID vid) if ( vid == Base_OpenMPTarget ) { DEL_DOT_VEC_2D_DATA_SETUP_OMP_TARGET; - + NDSET2D(m_domain->jp, x,x1,x2,x3,x4) ; NDSET2D(m_domain->jp, y,y1,y2,y3,y4) ; NDSET2D(m_domain->jp, xdot,fx1,fx2,fx3,fx4) ; @@ -74,7 +74,7 @@ void DEL_DOT_VEC_2D::runOpenMPTargetVariant(VariantID vid) #pragma omp target is_device_ptr(x1,x2,x3,x4, y1,y2,y3,y4, \ fx1,fx2,fx3,fx4, fy1,fy2,fy3,fy4, \ div, real_zones) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type ii = ibegin ; ii < iend ; ++ii ) { DEL_DOT_VEC_2D_BODY_INDEX; DEL_DOT_VEC_2D_BODY; @@ -88,7 +88,7 @@ void DEL_DOT_VEC_2D::runOpenMPTargetVariant(VariantID vid) } else if ( vid == RAJA_OpenMPTarget ) { DEL_DOT_VEC_2D_DATA_SETUP_OMP_TARGET; - + NDSET2D(m_domain->jp, x,x1,x2,x3,x4) ; NDSET2D(m_domain->jp, y,y1,y2,y3,y4) ; NDSET2D(m_domain->jp, 
xdot,fx1,fx2,fx3,fx4) ; @@ -114,7 +114,7 @@ void DEL_DOT_VEC_2D::runOpenMPTargetVariant(VariantID vid) DEL_DOT_VEC_2D_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n DEL_DOT_VEC_2D : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n DEL_DOT_VEC_2D : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/apps/DEL_DOT_VEC_2D-Seq.cpp b/src/apps/DEL_DOT_VEC_2D-Seq.cpp index bd6b0884b..208add00b 100644 --- a/src/apps/DEL_DOT_VEC_2D-Seq.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Seq.cpp @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace apps { @@ -51,7 +51,7 @@ void DEL_DOT_VEC_2D::runSeqVariant(VariantID vid) stopTimer(); break; - } + } #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { @@ -77,8 +77,8 @@ void DEL_DOT_VEC_2D::runSeqVariant(VariantID vid) case RAJA_Seq : { camp::resources::Resource working_res{camp::resources::Host()}; - RAJA::TypedListSegment zones(m_domain->real_zones, - m_domain->n_real_zones, + RAJA::TypedListSegment zones(m_domain->real_zones, + m_domain->n_real_zones, working_res); auto deldotvec2d_lam = [=](Index_type i) { @@ -91,14 +91,14 @@ void DEL_DOT_VEC_2D::runSeqVariant(VariantID vid) RAJA::forall(zones, deldotvec2d_lam); } - stopTimer(); + stopTimer(); break; } #endif // RUN_RAJA_SEQ default : { - std::cout << "\n DEL_DOT_VEC_2D : Unknown variant id = " << vid << std::endl; + getCout() << "\n DEL_DOT_VEC_2D : Unknown variant id = " << vid << std::endl; } } diff --git a/src/apps/DIFFUSION3DPA-Cuda.cpp b/src/apps/DIFFUSION3DPA-Cuda.cpp index 61b0d0798..e51e77636 100644 --- a/src/apps/DIFFUSION3DPA-Cuda.cpp +++ b/src/apps/DIFFUSION3DPA-Cuda.cpp @@ -273,7 +273,7 @@ void DIFFUSION3DPA::runCudaVariant(VariantID vid) { default: { - std::cout << "\n DIFFUSION3DPA : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n DIFFUSION3DPA : Unknown Cuda variant id = " << vid << std::endl; break; } } diff --git a/src/apps/DIFFUSION3DPA-Hip.cpp b/src/apps/DIFFUSION3DPA-Hip.cpp index 81bc7e323..fc4e2183b 100644 --- a/src/apps/DIFFUSION3DPA-Hip.cpp +++ b/src/apps/DIFFUSION3DPA-Hip.cpp @@ -276,7 +276,7 @@ void DIFFUSION3DPA::runHipVariant(VariantID vid) { default: { - std::cout << "\n DIFFUSION3DPA : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n DIFFUSION3DPA : Unknown Hip variant id = " << vid << std::endl; break; } } diff --git a/src/apps/DIFFUSION3DPA-OMP.cpp b/src/apps/DIFFUSION3DPA-OMP.cpp index 1d1b42cc2..05eec0eb6 100644 --- a/src/apps/DIFFUSION3DPA-OMP.cpp +++ b/src/apps/DIFFUSION3DPA-OMP.cpp @@ -247,7 +247,7 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { } default: - std::cout << "\n DIFFUSION3DPA : Unknown OpenMP variant id = " << vid + getCout() << "\n DIFFUSION3DPA : Unknown OpenMP variant id = " << vid << std::endl; } diff --git a/src/apps/DIFFUSION3DPA-OMPTarget.cpp b/src/apps/DIFFUSION3DPA-OMPTarget.cpp index 26ae3bc84..862699345 100644 --- a/src/apps/DIFFUSION3DPA-OMPTarget.cpp +++ b/src/apps/DIFFUSION3DPA-OMPTarget.cpp @@ -27,7 +27,7 @@ void DIFFUSION3DPA::runOpenMPTargetVariant(VariantID vid) { default: { - std::cout << "\n DIFFUSION3DPA : Unknown OpenMPTarget variant id = " << vid << std::endl; + getCout() << "\n DIFFUSION3DPA : Unknown OpenMPTarget variant id = " << vid << std::endl; break; } } diff --git a/src/apps/DIFFUSION3DPA-Seq.cpp b/src/apps/DIFFUSION3DPA-Seq.cpp index 396c6bbc2..c040bfc04 100644 --- a/src/apps/DIFFUSION3DPA-Seq.cpp +++ b/src/apps/DIFFUSION3DPA-Seq.cpp @@ -250,7 +250,7 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { #endif // 
RUN_RAJA_SEQ default: - std::cout << "\n DIFFUSION3DPA : Unknown Seq variant id = " << vid + getCout() << "\n DIFFUSION3DPA : Unknown Seq variant id = " << vid << std::endl; } } diff --git a/src/apps/ENERGY-Cuda.cpp b/src/apps/ENERGY-Cuda.cpp index e8a97b99c..e3ccd6135 100644 --- a/src/apps/ENERGY-Cuda.cpp +++ b/src/apps/ENERGY-Cuda.cpp @@ -257,7 +257,7 @@ void ENERGY::runCudaVariant(VariantID vid) ENERGY_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n ENERGY : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n ENERGY : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/apps/ENERGY-Hip.cpp b/src/apps/ENERGY-Hip.cpp index 96a1c759a..560ae6418 100644 --- a/src/apps/ENERGY-Hip.cpp +++ b/src/apps/ENERGY-Hip.cpp @@ -251,7 +251,7 @@ void ENERGY::runHipVariant(VariantID vid) ENERGY_DATA_TEARDOWN_HIP; } else { - std::cout << "\n ENERGY : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n ENERGY : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/apps/ENERGY-OMP.cpp b/src/apps/ENERGY-OMP.cpp index 60c907822..531da1e18 100644 --- a/src/apps/ENERGY-OMP.cpp +++ b/src/apps/ENERGY-OMP.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace apps { @@ -27,7 +27,7 @@ void ENERGY::runOpenMPVariant(VariantID vid) const Index_type iend = getActualProblemSize(); ENERGY_DATA_SETUP; - + auto energy_lam1 = [=](Index_type i) { ENERGY_BODY1; }; @@ -93,7 +93,7 @@ void ENERGY::runOpenMPVariant(VariantID vid) break; } - + case Lambda_OpenMP : { startTimer(); @@ -154,16 +154,16 @@ void ENERGY::runOpenMPVariant(VariantID vid) RAJA::forall< RAJA::omp_for_nowait_static_exec< > >( RAJA::RangeSegment(ibegin, iend), energy_lam3); - + RAJA::forall< RAJA::omp_for_nowait_static_exec< > >( RAJA::RangeSegment(ibegin, iend), energy_lam4); - + RAJA::forall< RAJA::omp_for_nowait_static_exec< > >( RAJA::RangeSegment(ibegin, iend), energy_lam5); - + RAJA::forall< RAJA::omp_for_nowait_static_exec< > >( RAJA::RangeSegment(ibegin, iend), energy_lam6); - + }); // end omp parallel region } @@ -172,12 +172,12 @@ void ENERGY::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n ENERGY : Unknown variant id = " << vid << std::endl; + getCout() << "\n ENERGY : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/apps/ENERGY-OMPTarget.cpp b/src/apps/ENERGY-OMPTarget.cpp index 7478e7dd8..a9b709ddd 100644 --- a/src/apps/ENERGY-OMPTarget.cpp +++ b/src/apps/ENERGY-OMPTarget.cpp @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace apps { @@ -123,7 +123,7 @@ void ENERGY::runOpenMPTargetVariant(VariantID vid) for (Index_type i = ibegin; i < iend; ++i ) { ENERGY_BODY6; } - + } stopTimer(); @@ -157,7 +157,7 @@ void ENERGY::runOpenMPTargetVariant(VariantID vid) RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { ENERGY_BODY4; }); - + RAJA::forall>( RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { ENERGY_BODY5; @@ -176,7 +176,7 @@ void ENERGY::runOpenMPTargetVariant(VariantID vid) ENERGY_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n ENERGY : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n ENERGY : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/apps/ENERGY-Seq.cpp b/src/apps/ENERGY-Seq.cpp index c6b69d0af..7f13c9805 100644 --- a/src/apps/ENERGY-Seq.cpp +++ b/src/apps/ENERGY-Seq.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace apps { @@ -25,7 +25,7 @@ void 
ENERGY::runSeqVariant(VariantID vid) const Index_type iend = getActualProblemSize(); ENERGY_DATA_SETUP; - + auto energy_lam1 = [=](Index_type i) { ENERGY_BODY1; }; @@ -67,7 +67,7 @@ void ENERGY::runSeqVariant(VariantID vid) for (Index_type i = ibegin; i < iend; ++i ) { ENERGY_BODY4; } - + for (Index_type i = ibegin; i < iend; ++i ) { ENERGY_BODY5; } @@ -80,7 +80,7 @@ void ENERGY::runSeqVariant(VariantID vid) stopTimer(); break; - } + } #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { @@ -146,14 +146,14 @@ void ENERGY::runSeqVariant(VariantID vid) }); // end sequential region (for single-source code) } - stopTimer(); + stopTimer(); break; } #endif // RUN_RAJA_SEQ default : { - std::cout << "\n ENERGY : Unknown variant id = " << vid << std::endl; + getCout() << "\n ENERGY : Unknown variant id = " << vid << std::endl; } } diff --git a/src/apps/FIR-Cuda.cpp b/src/apps/FIR-Cuda.cpp index baff493ac..c6a8da156 100644 --- a/src/apps/FIR-Cuda.cpp +++ b/src/apps/FIR-Cuda.cpp @@ -144,7 +144,7 @@ void FIR::runCudaVariant(VariantID vid) FIR_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n FIR : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n FIR : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/apps/FIR-Hip.cpp b/src/apps/FIR-Hip.cpp index 4408a714c..fc9c7bb94 100644 --- a/src/apps/FIR-Hip.cpp +++ b/src/apps/FIR-Hip.cpp @@ -144,7 +144,7 @@ void FIR::runHipVariant(VariantID vid) FIR_DATA_TEARDOWN_HIP; } else { - std::cout << "\n FIR : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n FIR : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/apps/FIR-OMP.cpp b/src/apps/FIR-OMP.cpp index 10928fe76..8f011b920 100644 --- a/src/apps/FIR-OMP.cpp +++ b/src/apps/FIR-OMP.cpp @@ -13,7 +13,7 @@ #include #include -namespace rajaperf +namespace rajaperf { namespace apps { @@ -37,7 +37,7 @@ void FIR::runOpenMPVariant(VariantID vid) auto fir_lam = [=](Index_type i) { FIR_BODY; }; - + switch ( vid ) { case Base_OpenMP : { @@ -87,12 +87,12 @@ void FIR::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n FIR : Unknown variant id = " << vid << std::endl; + getCout() << "\n FIR : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/apps/FIR-OMPTarget.cpp b/src/apps/FIR-OMPTarget.cpp index 3103cfd67..0306d8378 100644 --- a/src/apps/FIR-OMPTarget.cpp +++ b/src/apps/FIR-OMPTarget.cpp @@ -17,7 +17,7 @@ #include #include -namespace rajaperf +namespace rajaperf { namespace apps { @@ -94,7 +94,7 @@ void FIR::runOpenMPTargetVariant(VariantID vid) FIR_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n FIR : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n FIR : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/apps/FIR-Seq.cpp b/src/apps/FIR-Seq.cpp index cd10b7069..27d2789ad 100644 --- a/src/apps/FIR-Seq.cpp +++ b/src/apps/FIR-Seq.cpp @@ -13,7 +13,7 @@ #include #include -namespace rajaperf +namespace rajaperf { namespace apps { @@ -35,7 +35,7 @@ void FIR::runSeqVariant(VariantID vid) auto fir_lam = [=](Index_type i) { FIR_BODY; }; - + switch ( vid ) { case Base_Seq : { @@ -51,7 +51,7 @@ void FIR::runSeqVariant(VariantID vid) stopTimer(); break; - } + } #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { @@ -78,14 +78,14 @@ void FIR::runSeqVariant(VariantID vid) RAJA::RangeSegment(ibegin, iend), fir_lam); } - stopTimer(); + stopTimer(); break; } #endif // RUN_RAJA_SEQ default : { - std::cout << "\n FIR : Unknown variant id = " << vid << 
std::endl; + getCout() << "\n FIR : Unknown variant id = " << vid << std::endl; } } diff --git a/src/apps/HALOEXCHANGE-Cuda.cpp b/src/apps/HALOEXCHANGE-Cuda.cpp index 4633b4a7f..e093b57b7 100644 --- a/src/apps/HALOEXCHANGE-Cuda.cpp +++ b/src/apps/HALOEXCHANGE-Cuda.cpp @@ -166,7 +166,7 @@ void HALOEXCHANGE::runCudaVariant(VariantID vid) HALOEXCHANGE_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n HALOEXCHANGE : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n HALOEXCHANGE : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/apps/HALOEXCHANGE-Hip.cpp b/src/apps/HALOEXCHANGE-Hip.cpp index fd6fac040..a6e1b1ef7 100644 --- a/src/apps/HALOEXCHANGE-Hip.cpp +++ b/src/apps/HALOEXCHANGE-Hip.cpp @@ -168,7 +168,7 @@ void HALOEXCHANGE::runHipVariant(VariantID vid) HALOEXCHANGE_DATA_TEARDOWN_HIP; } else { - std::cout << "\n HALOEXCHANGE : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n HALOEXCHANGE : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/apps/HALOEXCHANGE-OMP.cpp b/src/apps/HALOEXCHANGE-OMP.cpp index 70e9419a8..7cac3ca3c 100644 --- a/src/apps/HALOEXCHANGE-OMP.cpp +++ b/src/apps/HALOEXCHANGE-OMP.cpp @@ -158,12 +158,12 @@ void HALOEXCHANGE::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n HALOEXCHANGE : Unknown variant id = " << vid << std::endl; + getCout() << "\n HALOEXCHANGE : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/apps/HALOEXCHANGE-OMPTarget.cpp b/src/apps/HALOEXCHANGE-OMPTarget.cpp index cfb10f7ec..22fe54522 100644 --- a/src/apps/HALOEXCHANGE-OMPTarget.cpp +++ b/src/apps/HALOEXCHANGE-OMPTarget.cpp @@ -146,7 +146,7 @@ void HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid) HALOEXCHANGE_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n HALOEXCHANGE : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n HALOEXCHANGE : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/apps/HALOEXCHANGE-Seq.cpp b/src/apps/HALOEXCHANGE-Seq.cpp index 7ebace6f7..7a5ae5e17 100644 --- a/src/apps/HALOEXCHANGE-Seq.cpp +++ b/src/apps/HALOEXCHANGE-Seq.cpp @@ -154,7 +154,7 @@ void HALOEXCHANGE::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n HALOEXCHANGE : Unknown variant id = " << vid << std::endl; + getCout() << "\n HALOEXCHANGE : Unknown variant id = " << vid << std::endl; } } diff --git a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp index 114ed61ba..21a00b545 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp @@ -267,7 +267,7 @@ void HALOEXCHANGE_FUSED::runCudaVariant(VariantID vid) HALOEXCHANGE_FUSED_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n HALOEXCHANGE_FUSED : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n HALOEXCHANGE_FUSED : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp index 1288f9429..60b925422 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp @@ -270,7 +270,7 @@ void HALOEXCHANGE_FUSED::runHipVariant(VariantID vid) HALOEXCHANGE_FUSED_DATA_TEARDOWN_HIP; } else { - std::cout << "\n HALOEXCHANGE_FUSED : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n HALOEXCHANGE_FUSED : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/apps/HALOEXCHANGE_FUSED-OMP.cpp b/src/apps/HALOEXCHANGE_FUSED-OMP.cpp index 
71a7fe22f..0654cd6dc 100644 --- a/src/apps/HALOEXCHANGE_FUSED-OMP.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-OMP.cpp @@ -297,7 +297,7 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n HALOEXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; + getCout() << "\n HALOEXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; } } diff --git a/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp b/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp index 8f8199026..77323855c 100644 --- a/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp @@ -261,7 +261,7 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid) HALOEXCHANGE_FUSED_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n HALOEXCHANGE_FUSED : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n HALOEXCHANGE_FUSED : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/apps/HALOEXCHANGE_FUSED-Seq.cpp b/src/apps/HALOEXCHANGE_FUSED-Seq.cpp index 4bb5207b7..80eab8629 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Seq.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Seq.cpp @@ -229,7 +229,7 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n HALOEXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; + getCout() << "\n HALOEXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; } } diff --git a/src/apps/LTIMES-Cuda.cpp b/src/apps/LTIMES-Cuda.cpp index f1f47b5a2..e59bec74e 100644 --- a/src/apps/LTIMES-Cuda.cpp +++ b/src/apps/LTIMES-Cuda.cpp @@ -49,14 +49,14 @@ constexpr size_t m_block_sz = 32; deallocCudaDeviceData(psidat); __global__ void ltimes(Real_ptr phidat, Real_ptr elldat, Real_ptr psidat, - Index_type num_d, + Index_type num_d, Index_type num_m, Index_type num_g, Index_type num_z) { Index_type m = blockIdx.x * blockDim.x + threadIdx.x; Index_type g = blockIdx.y * blockDim.y + threadIdx.y; Index_type z = blockIdx.z * blockDim.z + threadIdx.z; - if (m < num_m && g < num_g && z < num_z) { + if (m < num_m && g < num_g && z < num_z) { for (Index_type d = 0; d < num_d; ++d ) { LTIMES_BODY; } @@ -94,7 +94,7 @@ void LTIMES::runCudaVariant(VariantID vid) LTIMES_NBLOCKS_CUDA; ltimes<<>>(phidat, elldat, psidat, - num_d, + num_d, num_m, num_g, num_z); cudaErrchk( cudaGetLastError() ); @@ -139,9 +139,9 @@ void LTIMES::runCudaVariant(VariantID vid) RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_z_direct, RAJA::statement::Tile<2, RAJA::tile_fixed, - RAJA::cuda_block_y_direct, + RAJA::cuda_block_y_direct, RAJA::statement::Tile<3, RAJA::tile_fixed, - RAJA::cuda_block_x_direct, + RAJA::cuda_block_x_direct, RAJA::statement::For<1, RAJA::cuda_thread_z_direct, //z RAJA::statement::For<2, RAJA::cuda_thread_y_direct, //g RAJA::statement::For<3, RAJA::cuda_thread_x_direct, //m @@ -174,7 +174,7 @@ void LTIMES::runCudaVariant(VariantID vid) LTIMES_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n LTIMES : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n LTIMES : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/apps/LTIMES-Hip.cpp b/src/apps/LTIMES-Hip.cpp index 4d28aa028..66da9b051 100644 --- a/src/apps/LTIMES-Hip.cpp +++ b/src/apps/LTIMES-Hip.cpp @@ -93,10 +93,10 @@ void LTIMES::runHipVariant(VariantID vid) LTIMES_THREADS_PER_BLOCK_HIP; LTIMES_NBLOCKS_HIP; - hipLaunchKernelGGL((ltimes), + hipLaunchKernelGGL((ltimes), dim3(nblocks), dim3(nthreads_per_block), 0, 0, phidat, elldat, psidat, - num_d, + num_d, num_m, num_g, num_z); hipErrchk( 
hipGetLastError() ); @@ -115,7 +115,7 @@ void LTIMES::runHipVariant(VariantID vid) LTIMES_THREADS_PER_BLOCK_HIP; LTIMES_NBLOCKS_HIP; - auto ltimes_lambda = + auto ltimes_lambda = [=] __device__ (Index_type z, Index_type g, Index_type m) { for (Index_type d = 0; d < num_d; ++d ) { LTIMES_BODY; @@ -179,7 +179,7 @@ void LTIMES::runHipVariant(VariantID vid) LTIMES_DATA_TEARDOWN_HIP; } else { - std::cout << "\n LTIMES : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n LTIMES : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/apps/LTIMES-OMP.cpp b/src/apps/LTIMES-OMP.cpp index 397d2bc11..5ba4671a5 100644 --- a/src/apps/LTIMES-OMP.cpp +++ b/src/apps/LTIMES-OMP.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace apps { @@ -42,7 +42,7 @@ void LTIMES::runOpenMPVariant(VariantID vid) } } } - } + } } stopTimer(); @@ -52,7 +52,7 @@ void LTIMES::runOpenMPVariant(VariantID vid) case Lambda_OpenMP : { - auto ltimes_base_lam = [=](Index_type d, Index_type z, + auto ltimes_base_lam = [=](Index_type d, Index_type z, Index_type g, Index_type m) { LTIMES_BODY; }; @@ -85,7 +85,7 @@ void LTIMES::runOpenMPVariant(VariantID vid) LTIMES_BODY_RAJA; }; - using EXEC_POL = + using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::omp_parallel_for_exec, // z RAJA::statement::For<2, RAJA::loop_exec, // g @@ -93,7 +93,7 @@ void LTIMES::runOpenMPVariant(VariantID vid) RAJA::statement::For<0, RAJA::loop_exec, // d RAJA::statement::Lambda<0> > - > + > > > >; @@ -104,7 +104,7 @@ void LTIMES::runOpenMPVariant(VariantID vid) RAJA::kernel( RAJA::make_tuple(IDRange(0, num_d), IZRange(0, num_z), IGRange(0, num_g), - IMRange(0, num_m)), + IMRange(0, num_m)), ltimes_lam ); @@ -115,12 +115,12 @@ void LTIMES::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n LTIMES : Unknown variant id = " << vid << std::endl; + getCout() << "\n LTIMES : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/apps/LTIMES-OMPTarget.cpp b/src/apps/LTIMES-OMPTarget.cpp index 07f93643b..656900895 100644 --- a/src/apps/LTIMES-OMPTarget.cpp +++ b/src/apps/LTIMES-OMPTarget.cpp @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace apps { @@ -72,7 +72,7 @@ void LTIMES::runOpenMPTargetVariant(VariantID vid) LTIMES_VIEWS_RANGES_RAJA; - using EXEC_POL = + using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::Collapse, // z, g, m @@ -91,7 +91,7 @@ void LTIMES::runOpenMPTargetVariant(VariantID vid) IMRange(0, num_m)), [=] (ID d, IZ z, IG g, IM m) { LTIMES_BODY_RAJA; - }); + }); } stopTimer(); @@ -99,7 +99,7 @@ void LTIMES::runOpenMPTargetVariant(VariantID vid) LTIMES_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n LTIMES : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n LTIMES : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/apps/LTIMES-Seq.cpp b/src/apps/LTIMES-Seq.cpp index a6b4c6fe8..efa06701e 100644 --- a/src/apps/LTIMES-Seq.cpp +++ b/src/apps/LTIMES-Seq.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace apps { @@ -45,12 +45,12 @@ void LTIMES::runSeqVariant(VariantID vid) stopTimer(); break; - } + } #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { - auto ltimes_base_lam = [=](Index_type d, Index_type z, + auto ltimes_base_lam = [=](Index_type d, Index_type z, Index_type g, Index_type m) { LTIMES_BODY; }; @@ -83,7 +83,7 @@ void LTIMES::runSeqVariant(VariantID vid) }; - using EXEC_POL = + using EXEC_POL 
= RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::loop_exec, // z RAJA::statement::For<2, RAJA::loop_exec, // g @@ -94,27 +94,27 @@ void LTIMES::runSeqVariant(VariantID vid) > > > - >; + >; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - + RAJA::kernel( RAJA::make_tuple(IDRange(0, num_d), IZRange(0, num_z), IGRange(0, num_g), - IMRange(0, num_m)), + IMRange(0, num_m)), ltimes_lam ); } - stopTimer(); + stopTimer(); break; } #endif // RUN_RAJA_SEQ default : { - std::cout << "\n LTIMES : Unknown variant id = " << vid << std::endl; + getCout() << "\n LTIMES : Unknown variant id = " << vid << std::endl; } } diff --git a/src/apps/LTIMES_NOVIEW-Cuda.cpp b/src/apps/LTIMES_NOVIEW-Cuda.cpp index 1a7403ece..eb56fdf79 100644 --- a/src/apps/LTIMES_NOVIEW-Cuda.cpp +++ b/src/apps/LTIMES_NOVIEW-Cuda.cpp @@ -173,7 +173,7 @@ void LTIMES_NOVIEW::runCudaVariant(VariantID vid) LTIMES_NOVIEW_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n LTIMES_NOVIEW : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n LTIMES_NOVIEW : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/apps/LTIMES_NOVIEW-Hip.cpp b/src/apps/LTIMES_NOVIEW-Hip.cpp index 8f36737d8..31ab12979 100644 --- a/src/apps/LTIMES_NOVIEW-Hip.cpp +++ b/src/apps/LTIMES_NOVIEW-Hip.cpp @@ -93,10 +93,10 @@ void LTIMES_NOVIEW::runHipVariant(VariantID vid) LTIMES_NOVIEW_THREADS_PER_BLOCK_HIP; LTIMES_NOVIEW_NBLOCKS_HIP; - hipLaunchKernelGGL((ltimes_noview), - dim3(nblocks), dim3(nthreads_per_block), 0, 0, + hipLaunchKernelGGL((ltimes_noview), + dim3(nblocks), dim3(nthreads_per_block), 0, 0, phidat, elldat, psidat, - num_d, + num_d, num_m, num_g, num_z); hipErrchk( hipGetLastError() ); @@ -115,15 +115,15 @@ void LTIMES_NOVIEW::runHipVariant(VariantID vid) LTIMES_NOVIEW_THREADS_PER_BLOCK_HIP; LTIMES_NOVIEW_NBLOCKS_HIP; - auto ltimes_noview_lambda = + auto ltimes_noview_lambda = [=] __device__ (Index_type z, Index_type g, Index_type m) { for (Index_type d = 0; d < num_d; ++d ) { LTIMES_NOVIEW_BODY; } }; - hipLaunchKernelGGL((ltimes_noview_lam), - dim3(nblocks), dim3(nthreads_per_block), 0, 0, + hipLaunchKernelGGL((ltimes_noview_lam), + dim3(nblocks), dim3(nthreads_per_block), 0, 0, num_m, num_g, num_z, ltimes_noview_lambda); hipErrchk( hipGetLastError() ); @@ -178,7 +178,7 @@ void LTIMES_NOVIEW::runHipVariant(VariantID vid) LTIMES_NOVIEW_DATA_TEARDOWN_HIP; } else { - std::cout << "\n LTIMES_NOVIEW : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n LTIMES_NOVIEW : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/apps/LTIMES_NOVIEW-OMP.cpp b/src/apps/LTIMES_NOVIEW-OMP.cpp index f47fec499..7f6fedca2 100644 --- a/src/apps/LTIMES_NOVIEW-OMP.cpp +++ b/src/apps/LTIMES_NOVIEW-OMP.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace apps { @@ -25,8 +25,8 @@ void LTIMES_NOVIEW::runOpenMPVariant(VariantID vid) const Index_type run_reps = getRunReps(); LTIMES_NOVIEW_DATA_SETUP; - - auto ltimesnoview_lam = [=](Index_type d, Index_type z, + + auto ltimesnoview_lam = [=](Index_type d, Index_type z, Index_type g, Index_type m) { LTIMES_NOVIEW_BODY; }; @@ -47,7 +47,7 @@ void LTIMES_NOVIEW::runOpenMPVariant(VariantID vid) } } } - } + } } stopTimer(); @@ -109,12 +109,12 @@ void LTIMES_NOVIEW::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n LTIMES_NOVIEW : Unknown variant id = " << vid << std::endl; + getCout() << "\n LTIMES_NOVIEW : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git 
a/src/apps/LTIMES_NOVIEW-OMPTarget.cpp b/src/apps/LTIMES_NOVIEW-OMPTarget.cpp index 3734b889f..efe3a6ce3 100644 --- a/src/apps/LTIMES_NOVIEW-OMPTarget.cpp +++ b/src/apps/LTIMES_NOVIEW-OMPTarget.cpp @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace apps { @@ -97,7 +97,7 @@ void LTIMES_NOVIEW::runOpenMPTargetVariant(VariantID vid) LTIMES_NOVIEW_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n LTIMES_NOVIEW : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n LTIMES_NOVIEW : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/apps/LTIMES_NOVIEW-Seq.cpp b/src/apps/LTIMES_NOVIEW-Seq.cpp index 7d98e5c53..0f2458ac3 100644 --- a/src/apps/LTIMES_NOVIEW-Seq.cpp +++ b/src/apps/LTIMES_NOVIEW-Seq.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace apps { @@ -23,8 +23,8 @@ void LTIMES_NOVIEW::runSeqVariant(VariantID vid) const Index_type run_reps = getRunReps(); LTIMES_NOVIEW_DATA_SETUP; - - auto ltimesnoview_lam = [=](Index_type d, Index_type z, + + auto ltimesnoview_lam = [=](Index_type d, Index_type z, Index_type g, Index_type m) { LTIMES_NOVIEW_BODY; }; @@ -50,7 +50,7 @@ void LTIMES_NOVIEW::runSeqVariant(VariantID vid) stopTimer(); break; - } + } #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { @@ -100,14 +100,14 @@ void LTIMES_NOVIEW::runSeqVariant(VariantID vid) ); } - stopTimer(); + stopTimer(); break; } #endif // RUN_RAJA_SEQ default : { - std::cout << "\n LTIMES_NOVIEW : Unknown variant id = " << vid << std::endl; + getCout() << "\n LTIMES_NOVIEW : Unknown variant id = " << vid << std::endl; } } diff --git a/src/apps/MASS3DPA-Cuda.cpp b/src/apps/MASS3DPA-Cuda.cpp index 533ee5dd8..6354e01f4 100644 --- a/src/apps/MASS3DPA-Cuda.cpp +++ b/src/apps/MASS3DPA-Cuda.cpp @@ -197,7 +197,7 @@ void MASS3DPA::runCudaVariant(VariantID vid) { [&](int qx) { MASS3DPA_4 } - ); // RAJA::expt::loop + ); // RAJA::expt::loop } ); // RAJA::expt::loop @@ -233,7 +233,7 @@ void MASS3DPA::runCudaVariant(VariantID vid) { [&](int dx) { MASS3DPA_7 } - ); // RAJA::expt::loop + ); // RAJA::expt::loop } ); // RAJA::expt::loop @@ -277,7 +277,7 @@ void MASS3DPA::runCudaVariant(VariantID vid) { default: { - std::cout << "\n MASS3DPA : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n MASS3DPA : Unknown Cuda variant id = " << vid << std::endl; break; } } diff --git a/src/apps/MASS3DPA-Hip.cpp b/src/apps/MASS3DPA-Hip.cpp index 12caf28c1..185ad7bf6 100644 --- a/src/apps/MASS3DPA-Hip.cpp +++ b/src/apps/MASS3DPA-Hip.cpp @@ -169,7 +169,7 @@ void MASS3DPA::runHipVariant(VariantID vid) { [&](int dx) { MASS3DPA_1 } - ); // RAJA::expt::loop + ); // RAJA::expt::loop RAJA::expt::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), [&](int dx) { @@ -279,7 +279,7 @@ void MASS3DPA::runHipVariant(VariantID vid) { default: { - std::cout << "\n MASS3DPA : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n MASS3DPA : Unknown Hip variant id = " << vid << std::endl; break; } } diff --git a/src/apps/MASS3DPA-OMP.cpp b/src/apps/MASS3DPA-OMP.cpp index 3aa09157c..95342832c 100644 --- a/src/apps/MASS3DPA-OMP.cpp +++ b/src/apps/MASS3DPA-OMP.cpp @@ -197,7 +197,7 @@ void MASS3DPA::runOpenMPVariant(VariantID vid) { MASS3DPA_6 } ); // RAJA::expt::loop - } + } ); // RAJA::expt::loop ctx.teamSync(); @@ -236,8 +236,8 @@ void MASS3DPA::runOpenMPVariant(VariantID vid) { } ); // RAJA::expt::loop - } // lambda (e) - ); // RAJA::expt::loop + } // lambda (e) + ); // RAJA::expt::loop } // outer lambda (ctx) ); // // RAJA::expt::launch @@ 
-249,11 +249,11 @@ void MASS3DPA::runOpenMPVariant(VariantID vid) { } default: - std::cout << "\n MASS3DPA : Unknown OpenMP variant id = " << vid + getCout() << "\n MASS3DPA : Unknown OpenMP variant id = " << vid << std::endl; } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/apps/MASS3DPA-OMPTarget.cpp b/src/apps/MASS3DPA-OMPTarget.cpp index 6b61fa056..3ba732420 100644 --- a/src/apps/MASS3DPA-OMPTarget.cpp +++ b/src/apps/MASS3DPA-OMPTarget.cpp @@ -27,7 +27,7 @@ void MASS3DPA::runOpenMPTargetVariant(VariantID vid) { default: { - std::cout << "\n MASS3DPA : Unknown OpenMPTarget variant id = " << vid << std::endl; + getCout() << "\n MASS3DPA : Unknown OpenMPTarget variant id = " << vid << std::endl; break; } } diff --git a/src/apps/MASS3DPA-Seq.cpp b/src/apps/MASS3DPA-Seq.cpp index 66bd41db3..710826042 100644 --- a/src/apps/MASS3DPA-Seq.cpp +++ b/src/apps/MASS3DPA-Seq.cpp @@ -128,7 +128,7 @@ void MASS3DPA::runSeqVariant(VariantID vid) { RAJA::expt::HOST, RAJA::expt::Grid(), [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NE), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NE), [&](int e) { MASS3DPA_0_CPU @@ -247,7 +247,7 @@ void MASS3DPA::runSeqVariant(VariantID vid) { #endif // RUN_RAJA_SEQ default: - std::cout << "\n MASS3DPA : Unknown Seq variant id = " << vid << std::endl; + getCout() << "\n MASS3DPA : Unknown Seq variant id = " << vid << std::endl; } } diff --git a/src/apps/PRESSURE-Cuda.cpp b/src/apps/PRESSURE-Cuda.cpp index b0d5ab615..325ef4730 100644 --- a/src/apps/PRESSURE-Cuda.cpp +++ b/src/apps/PRESSURE-Cuda.cpp @@ -133,7 +133,7 @@ void PRESSURE::runCudaVariant(VariantID vid) PRESSURE_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n PRESSURE : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n PRESSURE : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/apps/PRESSURE-Hip.cpp b/src/apps/PRESSURE-Hip.cpp index 646fbc703..2c2f584ef 100644 --- a/src/apps/PRESSURE-Hip.cpp +++ b/src/apps/PRESSURE-Hip.cpp @@ -126,7 +126,7 @@ void PRESSURE::runHipVariant(VariantID vid) PRESSURE_DATA_TEARDOWN_HIP; } else { - std::cout << "\n PRESSURE : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n PRESSURE : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/apps/PRESSURE-OMP.cpp b/src/apps/PRESSURE-OMP.cpp index 0d7182dfd..1393ff89b 100644 --- a/src/apps/PRESSURE-OMP.cpp +++ b/src/apps/PRESSURE-OMP.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace apps { @@ -34,7 +34,7 @@ void PRESSURE::runOpenMPVariant(VariantID vid) auto pressure_lam2 = [=](Index_type i) { PRESSURE_BODY2; }; - + switch ( vid ) { case Base_OpenMP : { @@ -111,12 +111,12 @@ void PRESSURE::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n PRESSURE : Unknown variant id = " << vid << std::endl; + getCout() << "\n PRESSURE : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/apps/PRESSURE-OMPTarget.cpp b/src/apps/PRESSURE-OMPTarget.cpp index 643302b46..90212cb30 100644 --- a/src/apps/PRESSURE-OMPTarget.cpp +++ b/src/apps/PRESSURE-OMPTarget.cpp @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace apps { @@ -50,9 +50,9 @@ void PRESSURE::runOpenMPTargetVariant(VariantID vid) const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); - + PRESSURE_DATA_SETUP; - + if ( vid == Base_OpenMPTarget ) { 
PRESSURE_DATA_SETUP_OMP_TARGET; @@ -61,13 +61,13 @@ void PRESSURE::runOpenMPTargetVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp target is_device_ptr(compression, bvc) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type i = ibegin; i < iend; ++i ) { PRESSURE_BODY1; } #pragma omp target is_device_ptr(bvc, p_new, e_old, vnewc) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type i = ibegin; i < iend; ++i ) { PRESSURE_BODY2; } @@ -104,7 +104,7 @@ void PRESSURE::runOpenMPTargetVariant(VariantID vid) PRESSURE_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n PRESSURE : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n PRESSURE : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/apps/PRESSURE-Seq.cpp b/src/apps/PRESSURE-Seq.cpp index 77ce1200e..1b56941a3 100644 --- a/src/apps/PRESSURE-Seq.cpp +++ b/src/apps/PRESSURE-Seq.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace apps { @@ -32,7 +32,7 @@ void PRESSURE::runSeqVariant(VariantID vid) auto pressure_lam2 = [=](Index_type i) { PRESSURE_BODY2; }; - + switch ( vid ) { case Base_Seq : { @@ -52,7 +52,7 @@ void PRESSURE::runSeqVariant(VariantID vid) stopTimer(); break; - } + } #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { @@ -90,14 +90,14 @@ void PRESSURE::runSeqVariant(VariantID vid) }); // end sequential region (for single-source code) } - stopTimer(); + stopTimer(); break; } #endif // RUN_RAJA_SEQ default : { - std::cout << "\n PRESSURE : Unknown variant id = " << vid << std::endl; + getCout() << "\n PRESSURE : Unknown variant id = " << vid << std::endl; } } diff --git a/src/apps/VOL3D-Cuda.cpp b/src/apps/VOL3D-Cuda.cpp index 79db31282..dc7e47414 100644 --- a/src/apps/VOL3D-Cuda.cpp +++ b/src/apps/VOL3D-Cuda.cpp @@ -122,7 +122,7 @@ void VOL3D::runCudaVariant(VariantID vid) VOL3D_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n VOL3D : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n VOL3D : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/apps/VOL3D-Hip.cpp b/src/apps/VOL3D-Hip.cpp index 978c794ce..80a6a9a71 100644 --- a/src/apps/VOL3D-Hip.cpp +++ b/src/apps/VOL3D-Hip.cpp @@ -122,7 +122,7 @@ void VOL3D::runHipVariant(VariantID vid) VOL3D_DATA_TEARDOWN_HIP; } else { - std::cout << "\n VOL3D : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n VOL3D : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/apps/VOL3D-OMP.cpp b/src/apps/VOL3D-OMP.cpp index 90b84f857..276fda6ee 100644 --- a/src/apps/VOL3D-OMP.cpp +++ b/src/apps/VOL3D-OMP.cpp @@ -14,7 +14,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace apps { @@ -45,7 +45,7 @@ void VOL3D::runOpenMPVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - #pragma omp parallel for + #pragma omp parallel for for (Index_type i = ibegin ; i < iend ; ++i ) { VOL3D_BODY; } @@ -87,12 +87,12 @@ void VOL3D::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n VOL3D : Unknown variant id = " << vid << std::endl; + getCout() << "\n VOL3D : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); 
#endif } diff --git a/src/apps/VOL3D-OMPTarget.cpp b/src/apps/VOL3D-OMPTarget.cpp index 6a8de52c8..38771ae62 100644 --- a/src/apps/VOL3D-OMPTarget.cpp +++ b/src/apps/VOL3D-OMPTarget.cpp @@ -18,7 +18,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace apps { @@ -68,7 +68,7 @@ void VOL3D::runOpenMPTargetVariant(VariantID vid) y0,y1,y2,y3,y4,y5,y6,y7, \ z0,z1,z2,z3,z4,z5,z6,z7, \ vol) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type i = ibegin ; i < iend ; ++i ) { VOL3D_BODY; } @@ -76,7 +76,7 @@ void VOL3D::runOpenMPTargetVariant(VariantID vid) } stopTimer(); - VOL3D_DATA_TEARDOWN_OMP_TARGET; + VOL3D_DATA_TEARDOWN_OMP_TARGET; } else if ( vid == RAJA_OpenMPTarget ) { @@ -98,10 +98,10 @@ void VOL3D::runOpenMPTargetVariant(VariantID vid) } stopTimer(); - VOL3D_DATA_TEARDOWN_OMP_TARGET; + VOL3D_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n VOL3D : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n VOL3D : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/apps/VOL3D-Seq.cpp b/src/apps/VOL3D-Seq.cpp index e748c3ffd..f2fec0ef8 100644 --- a/src/apps/VOL3D-Seq.cpp +++ b/src/apps/VOL3D-Seq.cpp @@ -14,7 +14,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace apps { @@ -51,7 +51,7 @@ void VOL3D::runSeqVariant(VariantID vid) stopTimer(); break; - } + } #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { @@ -78,14 +78,14 @@ void VOL3D::runSeqVariant(VariantID vid) RAJA::RangeSegment(ibegin, iend), vol3d_lam); } - stopTimer(); + stopTimer(); break; } #endif // RUN_RAJA_SEQ default : { - std::cout << "\n VOL3D : Unknown variant id = " << vid << std::endl; + getCout() << "\n VOL3D : Unknown variant id = " << vid << std::endl; } } diff --git a/src/apps/WIP-COUPLE.cpp b/src/apps/WIP-COUPLE.cpp index 51bb4fa2f..b849f70f9 100644 --- a/src/apps/WIP-COUPLE.cpp +++ b/src/apps/WIP-COUPLE.cpp @@ -173,7 +173,7 @@ void COUPLE::runKernel(VariantID vid) #endif default : { - std::cout << "\n COUPLE : Unknown variant id = " << vid << std::endl; + getCout() << "\n COUPLE : Unknown variant id = " << vid << std::endl; } } diff --git a/src/basic/DAXPY-Cuda.cpp b/src/basic/DAXPY-Cuda.cpp index 7e4f52fed..260435e1a 100644 --- a/src/basic/DAXPY-Cuda.cpp +++ b/src/basic/DAXPY-Cuda.cpp @@ -108,7 +108,7 @@ void DAXPY::runCudaVariant(VariantID vid) DAXPY_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n DAXPY : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n DAXPY : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/basic/DAXPY-Hip.cpp b/src/basic/DAXPY-Hip.cpp index 1ed22ef76..95cd36fe8 100644 --- a/src/basic/DAXPY-Hip.cpp +++ b/src/basic/DAXPY-Hip.cpp @@ -111,7 +111,7 @@ void DAXPY::runHipVariant(VariantID vid) DAXPY_DATA_TEARDOWN_HIP; } else { - std::cout << "\n DAXPY : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n DAXPY : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/basic/DAXPY-OMP.cpp b/src/basic/DAXPY-OMP.cpp index 5a06f5b46..f28c83c7b 100644 --- a/src/basic/DAXPY-OMP.cpp +++ b/src/basic/DAXPY-OMP.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -81,12 +81,12 @@ void DAXPY::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n DAXPY : Unknown variant id = " << vid << std::endl; + getCout() << "\n DAXPY : Unknown variant id = " << vid << std::endl; 
} } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/basic/DAXPY-OMPTarget.cpp b/src/basic/DAXPY-OMPTarget.cpp index 286003a5d..930438bbc 100644 --- a/src/basic/DAXPY-OMPTarget.cpp +++ b/src/basic/DAXPY-OMPTarget.cpp @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -81,9 +81,9 @@ void DAXPY::runOpenMPTargetVariant(VariantID vid) stopTimer(); DAXPY_DATA_TEARDOWN_OMP_TARGET; - + } else { - std::cout << "\n DAXPY : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n DAXPY : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/basic/DAXPY-Seq.cpp b/src/basic/DAXPY-Seq.cpp index 325297cd5..2eb2fc690 100644 --- a/src/basic/DAXPY-Seq.cpp +++ b/src/basic/DAXPY-Seq.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -79,7 +79,7 @@ void DAXPY::runSeqVariant(VariantID vid) #endif default : { - std::cout << "\n DAXPY : Unknown variant id = " << vid << std::endl; + getCout() << "\n DAXPY : Unknown variant id = " << vid << std::endl; } } diff --git a/src/basic/IF_QUAD-Cuda.cpp b/src/basic/IF_QUAD-Cuda.cpp index 8c7f9fa11..8790bbcbb 100644 --- a/src/basic/IF_QUAD-Cuda.cpp +++ b/src/basic/IF_QUAD-Cuda.cpp @@ -115,7 +115,7 @@ void IF_QUAD::runCudaVariant(VariantID vid) IF_QUAD_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n IF_QUAD : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n IF_QUAD : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/basic/IF_QUAD-Hip.cpp b/src/basic/IF_QUAD-Hip.cpp index 49557e3e8..7e903e086 100644 --- a/src/basic/IF_QUAD-Hip.cpp +++ b/src/basic/IF_QUAD-Hip.cpp @@ -118,7 +118,7 @@ void IF_QUAD::runHipVariant(VariantID vid) IF_QUAD_DATA_TEARDOWN_HIP; } else { - std::cout << "\n IF_QUAD : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n IF_QUAD : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/basic/IF_QUAD-OMP.cpp b/src/basic/IF_QUAD-OMP.cpp index 659d8a12a..517814c6c 100644 --- a/src/basic/IF_QUAD-OMP.cpp +++ b/src/basic/IF_QUAD-OMP.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -81,12 +81,12 @@ void IF_QUAD::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n IF_QUAD : Unknown variant id = " << vid << std::endl; + getCout() << "\n IF_QUAD : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/basic/IF_QUAD-OMPTarget.cpp b/src/basic/IF_QUAD-OMPTarget.cpp index 0a16fccc8..e44711ecb 100644 --- a/src/basic/IF_QUAD-OMPTarget.cpp +++ b/src/basic/IF_QUAD-OMPTarget.cpp @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -61,7 +61,7 @@ void IF_QUAD::runOpenMPTargetVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp target is_device_ptr(a, b, c, x1, x2) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type i = ibegin; i < iend; ++i ) { IF_QUAD_BODY; } @@ -89,7 +89,7 @@ void IF_QUAD::runOpenMPTargetVariant(VariantID vid) IF_QUAD_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n IF_QUAD : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n IF_QUAD : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/basic/IF_QUAD-Seq.cpp b/src/basic/IF_QUAD-Seq.cpp index 051e513b0..a5e9a9c6c 
100644 --- a/src/basic/IF_QUAD-Seq.cpp +++ b/src/basic/IF_QUAD-Seq.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -47,7 +47,7 @@ void IF_QUAD::runSeqVariant(VariantID vid) break; } -#if defined(RUN_RAJA_SEQ) +#if defined(RUN_RAJA_SEQ) case Lambda_Seq : { startTimer(); @@ -79,7 +79,7 @@ void IF_QUAD::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n IF_QUAD : Unknown variant id = " << vid << std::endl; + getCout() << "\n IF_QUAD : Unknown variant id = " << vid << std::endl; } } diff --git a/src/basic/INIT3-Cuda.cpp b/src/basic/INIT3-Cuda.cpp index cee3e46af..fd9ae6a35 100644 --- a/src/basic/INIT3-Cuda.cpp +++ b/src/basic/INIT3-Cuda.cpp @@ -117,7 +117,7 @@ void INIT3::runCudaVariant(VariantID vid) INIT3_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n INIT3 : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n INIT3 : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/basic/INIT3-Hip.cpp b/src/basic/INIT3-Hip.cpp index 51e0f2b54..9c7ce1ad3 100644 --- a/src/basic/INIT3-Hip.cpp +++ b/src/basic/INIT3-Hip.cpp @@ -119,7 +119,7 @@ void INIT3::runHipVariant(VariantID vid) INIT3_DATA_TEARDOWN_HIP; } else { - std::cout << "\n INIT3 : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n INIT3 : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/basic/INIT3-OMP.cpp b/src/basic/INIT3-OMP.cpp index 7d05f9af4..b5bd688e0 100644 --- a/src/basic/INIT3-OMP.cpp +++ b/src/basic/INIT3-OMP.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -81,12 +81,12 @@ void INIT3::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n INIT3 : Unknown variant id = " << vid << std::endl; + getCout() << "\n INIT3 : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/basic/INIT3-OMPTarget.cpp b/src/basic/INIT3-OMPTarget.cpp index 7d3f9ce05..ff7d69e63 100644 --- a/src/basic/INIT3-OMPTarget.cpp +++ b/src/basic/INIT3-OMPTarget.cpp @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -89,9 +89,9 @@ void INIT3::runOpenMPTargetVariant(VariantID vid) stopTimer(); INIT3_DATA_TEARDOWN_OMP_TARGET; - + } else { - std::cout << "\n INIT3 : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n INIT3 : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/basic/INIT3-Seq.cpp b/src/basic/INIT3-Seq.cpp index b4c481632..66b8f5a7c 100644 --- a/src/basic/INIT3-Seq.cpp +++ b/src/basic/INIT3-Seq.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -23,7 +23,7 @@ void INIT3::runSeqVariant(VariantID vid) const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); - + INIT3_DATA_SETUP; auto init3_lam = [=](Index_type i) { @@ -79,7 +79,7 @@ void INIT3::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n INIT3 : Unknown variant id = " << vid << std::endl; + getCout() << "\n INIT3 : Unknown variant id = " << vid << std::endl; } } diff --git a/src/basic/INIT_VIEW1D-Cuda.cpp b/src/basic/INIT_VIEW1D-Cuda.cpp index af70a9980..5612bbc79 100644 --- a/src/basic/INIT_VIEW1D-Cuda.cpp +++ b/src/basic/INIT_VIEW1D-Cuda.cpp @@ -108,7 +108,7 @@ void INIT_VIEW1D::runCudaVariant(VariantID vid) INIT_VIEW1D_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n INIT_VIEW1D : Unknown Cuda variant id = " << vid << std::endl; 
+ getCout() << "\n INIT_VIEW1D : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/basic/INIT_VIEW1D-Hip.cpp b/src/basic/INIT_VIEW1D-Hip.cpp index 1136b0d93..316fe8fda 100644 --- a/src/basic/INIT_VIEW1D-Hip.cpp +++ b/src/basic/INIT_VIEW1D-Hip.cpp @@ -111,7 +111,7 @@ void INIT_VIEW1D::runHipVariant(VariantID vid) INIT_VIEW1D_DATA_TEARDOWN_HIP; } else { - std::cout << "\n INIT_VIEW1D : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n INIT_VIEW1D : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/basic/INIT_VIEW1D-OMP.cpp b/src/basic/INIT_VIEW1D-OMP.cpp index b36e7d44b..42cc23c39 100644 --- a/src/basic/INIT_VIEW1D-OMP.cpp +++ b/src/basic/INIT_VIEW1D-OMP.cpp @@ -87,12 +87,12 @@ void INIT_VIEW1D::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n INIT_VIEW1D : Unknown variant id = " << vid << std::endl; + getCout() << "\n INIT_VIEW1D : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/basic/INIT_VIEW1D-OMPTarget.cpp b/src/basic/INIT_VIEW1D-OMPTarget.cpp index 705d2fb6e..8ec0d5c13 100644 --- a/src/basic/INIT_VIEW1D-OMPTarget.cpp +++ b/src/basic/INIT_VIEW1D-OMPTarget.cpp @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -47,13 +47,13 @@ void INIT_VIEW1D::runOpenMPTargetVariant(VariantID vid) if ( vid == Base_OpenMPTarget ) { - INIT_VIEW1D_DATA_SETUP_OMP_TARGET; + INIT_VIEW1D_DATA_SETUP_OMP_TARGET; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp target is_device_ptr(a) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type i = ibegin; i < iend; ++i ) { INIT_VIEW1D_BODY; } @@ -83,7 +83,7 @@ void INIT_VIEW1D::runOpenMPTargetVariant(VariantID vid) INIT_VIEW1D_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n INIT_VIEW1D : Unknown OMP Targetvariant id = " << vid << std::endl; + getCout() << "\n INIT_VIEW1D : Unknown OMP Targetvariant id = " << vid << std::endl; } } diff --git a/src/basic/INIT_VIEW1D-Seq.cpp b/src/basic/INIT_VIEW1D-Seq.cpp index 2cfa4514c..419e1698b 100644 --- a/src/basic/INIT_VIEW1D-Seq.cpp +++ b/src/basic/INIT_VIEW1D-Seq.cpp @@ -85,7 +85,7 @@ void INIT_VIEW1D::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n INIT_VIEW1D : Unknown variant id = " << vid << std::endl; + getCout() << "\n INIT_VIEW1D : Unknown variant id = " << vid << std::endl; } } diff --git a/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp b/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp index 95c9e175e..8a5f62e2b 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp @@ -111,7 +111,7 @@ void INIT_VIEW1D_OFFSET::runCudaVariant(VariantID vid) INIT_VIEW1D_OFFSET_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n INIT_VIEW1D_OFFSET : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n INIT_VIEW1D_OFFSET : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp index b2e24a703..d0353c15c 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp @@ -112,7 +112,7 @@ void INIT_VIEW1D_OFFSET::runHipVariant(VariantID vid) INIT_VIEW1D_OFFSET_DATA_TEARDOWN_HIP; } else { - std::cout << "\n INIT_VIEW1D_OFFSET : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n 
INIT_VIEW1D_OFFSET : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/basic/INIT_VIEW1D_OFFSET-OMP.cpp b/src/basic/INIT_VIEW1D_OFFSET-OMP.cpp index feb271d31..1e07407aa 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-OMP.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-OMP.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -87,12 +87,12 @@ void INIT_VIEW1D_OFFSET::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n INIT_VIEW1D_OFFSET : Unknown variant id = " << vid << std::endl; + getCout() << "\n INIT_VIEW1D_OFFSET : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp b/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp index e419e7fca..9b3c3cdae 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -83,7 +83,7 @@ void INIT_VIEW1D_OFFSET::runOpenMPTargetVariant(VariantID vid) INIT_VIEW1D_OFFSET_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n INIT_VIEW1D_OFFSET : Unknown OMP Targetvariant id = " << vid << std::endl; + getCout() << "\n INIT_VIEW1D_OFFSET : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/basic/INIT_VIEW1D_OFFSET-Seq.cpp b/src/basic/INIT_VIEW1D_OFFSET-Seq.cpp index 12297cdaf..f53872d14 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Seq.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Seq.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -85,7 +85,7 @@ void INIT_VIEW1D_OFFSET::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n INIT_VIEW1D_OFFSET : Unknown variant id = " << vid << std::endl; + getCout() << "\n INIT_VIEW1D_OFFSET : Unknown variant id = " << vid << std::endl; } } diff --git a/src/basic/MAT_MAT_SHARED-Cuda.cpp b/src/basic/MAT_MAT_SHARED-Cuda.cpp index a208c3692..d104b0a72 100644 --- a/src/basic/MAT_MAT_SHARED-Cuda.cpp +++ b/src/basic/MAT_MAT_SHARED-Cuda.cpp @@ -224,29 +224,29 @@ void MAT_MAT_SHARED::runCudaVariant(VariantID vid) { RAJA::expt::Threads(TL_SZ, TL_SZ)), [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Ny), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Ny), [&](Index_type by) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Nx), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Nx), [&](Index_type bx) { MAT_MAT_SHARED_BODY_0 - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type ty) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type tx) { MAT_MAT_SHARED_BODY_1 } ); // RAJA::expt::loop - } + } ); // RAJA::expt::loop for (Index_type k = 0; k < (TL_SZ + N - 1) / TL_SZ; k++) { - + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type ty) { - RAJA::expt::loop(ctx, - RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, + RAJA::RangeSegment(0, TL_SZ), [&](Index_type tx) { MAT_MAT_SHARED_BODY_2 } @@ -258,7 +258,7 @@ void MAT_MAT_SHARED::runCudaVariant(VariantID vid) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type ty) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type tx) { MAT_MAT_SHARED_BODY_3 } @@ -270,7 +270,7 @@ void MAT_MAT_SHARED::runCudaVariant(VariantID vid) { } // for (k) - 
RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type ty) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type tx) { @@ -294,7 +294,7 @@ void MAT_MAT_SHARED::runCudaVariant(VariantID vid) { MAT_MAT_SHARED_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n MAT_MAT_SHARED : Unknown Cuda variant id = " << vid + getCout() << "\n MAT_MAT_SHARED : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/basic/MAT_MAT_SHARED-Hip.cpp b/src/basic/MAT_MAT_SHARED-Hip.cpp index d4ea505e5..dc5667597 100644 --- a/src/basic/MAT_MAT_SHARED-Hip.cpp +++ b/src/basic/MAT_MAT_SHARED-Hip.cpp @@ -264,13 +264,13 @@ void MAT_MAT_SHARED::runHipVariant(VariantID vid) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type tx) { MAT_MAT_SHARED_BODY_3 - } + } ); // RAJA::expt::loop } ); // RAJA::expt::loop ctx.teamSync(); - + } // for (k) RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), @@ -297,7 +297,7 @@ void MAT_MAT_SHARED::runHipVariant(VariantID vid) { MAT_MAT_SHARED_DATA_TEARDOWN_HIP; } else { - std::cout << "\n MAT_MAT_SHARED : Unknown Hip variant id = " << vid + getCout() << "\n MAT_MAT_SHARED : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/basic/MAT_MAT_SHARED-OMP.cpp b/src/basic/MAT_MAT_SHARED-OMP.cpp index c120646f6..e745a5a5e 100644 --- a/src/basic/MAT_MAT_SHARED-OMP.cpp +++ b/src/basic/MAT_MAT_SHARED-OMP.cpp @@ -196,7 +196,7 @@ void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid) { RAJA::expt::launch(RAJA::expt::HOST, RAJA::expt::Grid(), [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Ny), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Ny), [&](Index_type by) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Nx), [&](Index_type bx) { @@ -253,25 +253,25 @@ void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid) { } // lambda (bx) ); // RAJA::expt::loop - } // lambda (by) + } // lambda (by) ); // RAJA::expt::loop } // outer lambda (ctx) - ); // RAJA::expt::launch + ); // RAJA::expt::launch - } // loop over kernel reps + } // loop over kernel reps stopTimer(); break; } default: { - std::cout << "\n MAT_MAT_SHARED : Unknown variant id = " << vid + getCout() << "\n MAT_MAT_SHARED : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/basic/MAT_MAT_SHARED-OMPTarget.cpp b/src/basic/MAT_MAT_SHARED-OMPTarget.cpp index 41925870d..b42b3b9d0 100644 --- a/src/basic/MAT_MAT_SHARED-OMPTarget.cpp +++ b/src/basic/MAT_MAT_SHARED-OMPTarget.cpp @@ -27,7 +27,7 @@ namespace basic { default: { - std::cout << "\n MAT_MAT_SHARED : Unknown OpenMPTarget variant id = " << vid << std::endl; + getCout() << "\n MAT_MAT_SHARED : Unknown OpenMPTarget variant id = " << vid << std::endl; break; } } diff --git a/src/basic/MAT_MAT_SHARED-Seq.cpp b/src/basic/MAT_MAT_SHARED-Seq.cpp index 00119d1b3..b15b4f018 100644 --- a/src/basic/MAT_MAT_SHARED-Seq.cpp +++ b/src/basic/MAT_MAT_SHARED-Seq.cpp @@ -193,9 +193,9 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid) { RAJA::expt::launch(RAJA::expt::HOST, RAJA::expt::Grid(), [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Ny), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Ny), [&](Index_type by) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Nx), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Nx), [&](Index_type bx) { MAT_MAT_SHARED_BODY_0 @@ -203,7 +203,7 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid) { 
RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type ty) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type tx) { MAT_MAT_SHARED_BODY_1 } @@ -231,7 +231,7 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid) { [&](Index_type tx) { MAT_MAT_SHARED_BODY_3 } - ); // RAJA::expt::loop + ); // RAJA::expt::loop } ); // RAJA::expt::loop @@ -265,7 +265,7 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid) { #endif // RUN_RAJA_SEQ default: { - std::cout << "\n MAT_MAT_SHARED : Unknown variant id = " << vid + getCout() << "\n MAT_MAT_SHARED : Unknown variant id = " << vid << std::endl; } } diff --git a/src/basic/MULADDSUB-Cuda.cpp b/src/basic/MULADDSUB-Cuda.cpp index 106d11865..955ebbf67 100644 --- a/src/basic/MULADDSUB-Cuda.cpp +++ b/src/basic/MULADDSUB-Cuda.cpp @@ -117,7 +117,7 @@ void MULADDSUB::runCudaVariant(VariantID vid) MULADDSUB_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n MULADDSUB : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n MULADDSUB : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/basic/MULADDSUB-Hip.cpp b/src/basic/MULADDSUB-Hip.cpp index 729c6cee3..8f7794258 100644 --- a/src/basic/MULADDSUB-Hip.cpp +++ b/src/basic/MULADDSUB-Hip.cpp @@ -119,7 +119,7 @@ void MULADDSUB::runHipVariant(VariantID vid) MULADDSUB_DATA_TEARDOWN_HIP; } else { - std::cout << "\n MULADDSUB : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n MULADDSUB : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/basic/MULADDSUB-OMP.cpp b/src/basic/MULADDSUB-OMP.cpp index 1794a11d7..79d441bcc 100644 --- a/src/basic/MULADDSUB-OMP.cpp +++ b/src/basic/MULADDSUB-OMP.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -81,12 +81,12 @@ void MULADDSUB::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n MULADDSUB : Unknown variant id = " << vid << std::endl; + getCout() << "\n MULADDSUB : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/basic/MULADDSUB-OMPTarget.cpp b/src/basic/MULADDSUB-OMPTarget.cpp index 064628d61..bca0164e3 100644 --- a/src/basic/MULADDSUB-OMPTarget.cpp +++ b/src/basic/MULADDSUB-OMPTarget.cpp @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -91,7 +91,7 @@ void MULADDSUB::runOpenMPTargetVariant(VariantID vid) MULADDSUB_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n MULADDSUB : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n MULADDSUB : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/basic/MULADDSUB-Seq.cpp b/src/basic/MULADDSUB-Seq.cpp index b4651c55f..e6fb9f913 100644 --- a/src/basic/MULADDSUB-Seq.cpp +++ b/src/basic/MULADDSUB-Seq.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -79,7 +79,7 @@ void MULADDSUB::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n MULADDSUB : Unknown variant id = " << vid << std::endl; + getCout() << "\n MULADDSUB : Unknown variant id = " << vid << std::endl; } } diff --git a/src/basic/NESTED_INIT-Cuda.cpp b/src/basic/NESTED_INIT-Cuda.cpp index 306a9a67a..fd68d76cd 100644 --- a/src/basic/NESTED_INIT-Cuda.cpp +++ b/src/basic/NESTED_INIT-Cuda.cpp @@ -59,11 +59,11 @@ __global__ void nested_init(Real_ptr array, template< typename Lambda > __global__ void nested_init_lam(Index_type ni, Index_type nj, Index_type nk, Lambda body) 
-{ +{ Index_type i = blockIdx.x * blockDim.x + threadIdx.x; Index_type j = blockIdx.y * blockDim.y + threadIdx.y; Index_type k = blockIdx.z; - + if ( i < ni && j < nj && k < nk ) { body(i, j, k); } @@ -85,7 +85,7 @@ void NESTED_INIT::runCudaVariant(VariantID vid) NESTED_INIT_THREADS_PER_BLOCK_CUDA; NESTED_INIT_NBLOCKS_CUDA; - + nested_init<<>>(array, ni, nj, nk); cudaErrchk( cudaGetLastError() ); @@ -124,13 +124,13 @@ void NESTED_INIT::runCudaVariant(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::CudaKernelFixedAsync, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_direct, - RAJA::statement::Tile<0, RAJA::tile_fixed, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_direct, RAJA::statement::For<2, RAJA::cuda_block_z_direct, // k RAJA::statement::For<1, RAJA::cuda_thread_y_direct, // j - RAJA::statement::For<0, RAJA::cuda_thread_x_direct, // i + RAJA::statement::For<0, RAJA::cuda_thread_x_direct, // i RAJA::statement::Lambda<0> > > @@ -157,7 +157,7 @@ void NESTED_INIT::runCudaVariant(VariantID vid) NESTED_INIT_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n NESTED_INIT : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n NESTED_INIT : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/basic/NESTED_INIT-Hip.cpp b/src/basic/NESTED_INIT-Hip.cpp index 4038a47a2..cc66556f7 100644 --- a/src/basic/NESTED_INIT-Hip.cpp +++ b/src/basic/NESTED_INIT-Hip.cpp @@ -86,8 +86,8 @@ void NESTED_INIT::runHipVariant(VariantID vid) NESTED_INIT_THREADS_PER_BLOCK_HIP; NESTED_INIT_NBLOCKS_HIP; - hipLaunchKernelGGL((nested_init), - dim3(nblocks), dim3(nthreads_per_block), 0, 0, + hipLaunchKernelGGL((nested_init), + dim3(nblocks), dim3(nthreads_per_block), 0, 0, array, ni, nj, nk); hipErrchk( hipGetLastError() ); @@ -106,12 +106,12 @@ void NESTED_INIT::runHipVariant(VariantID vid) NESTED_INIT_THREADS_PER_BLOCK_HIP; NESTED_INIT_NBLOCKS_HIP; - auto nested_init_lambda = [=] __device__ (Index_type i, Index_type j, + auto nested_init_lambda = [=] __device__ (Index_type i, Index_type j, Index_type k) { NESTED_INIT_BODY; }; - hipLaunchKernelGGL((nested_init_lam< decltype(nested_init_lambda) >), + hipLaunchKernelGGL((nested_init_lam< decltype(nested_init_lambda) >), dim3(nblocks), dim3(nthreads_per_block), 0, 0, ni, nj, nk, nested_init_lambda); hipErrchk( hipGetLastError() ); @@ -142,7 +142,7 @@ void NESTED_INIT::runHipVariant(VariantID vid) > > > - >; + >; startTimer(); @@ -161,7 +161,7 @@ void NESTED_INIT::runHipVariant(VariantID vid) NESTED_INIT_DATA_TEARDOWN_HIP; } else { - std::cout << "\n NESTED_INIT : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n NESTED_INIT : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/basic/NESTED_INIT-OMP.cpp b/src/basic/NESTED_INIT-OMP.cpp index b714712d5..a38521976 100644 --- a/src/basic/NESTED_INIT-OMP.cpp +++ b/src/basic/NESTED_INIT-OMP.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -24,7 +24,7 @@ namespace basic void NESTED_INIT::runOpenMPVariant(VariantID vid) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - + const Index_type run_reps = getRunReps(); NESTED_INIT_DATA_SETUP; @@ -94,7 +94,7 @@ void NESTED_INIT::runOpenMPVariant(VariantID vid) > >; #else - using EXEC_POL = + using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::For<2, RAJA::omp_parallel_for_exec, // k RAJA::statement::For<1, RAJA::loop_exec, // j @@ -122,12 +122,12 @@ void NESTED_INIT::runOpenMPVariant(VariantID vid) } default : { - 
std::cout << "\n NESTED_INIT : Unknown variant id = " << vid << std::endl; + getCout() << "\n NESTED_INIT : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/basic/NESTED_INIT-OMPTarget.cpp b/src/basic/NESTED_INIT-OMPTarget.cpp index 435df40c1..b140ede6a 100644 --- a/src/basic/NESTED_INIT-OMPTarget.cpp +++ b/src/basic/NESTED_INIT-OMPTarget.cpp @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -46,14 +46,14 @@ void NESTED_INIT::runOpenMPTargetVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp target is_device_ptr(array) device( did ) - #pragma omp teams distribute parallel for schedule(static, 1) collapse(3) + #pragma omp teams distribute parallel for schedule(static, 1) collapse(3) for (Index_type k = 0; k < nk; ++k ) { for (Index_type j = 0; j < nj; ++j ) { for (Index_type i = 0; i < ni; ++i ) { NESTED_INIT_BODY; } } - } + } } stopTimer(); @@ -64,7 +64,7 @@ void NESTED_INIT::runOpenMPTargetVariant(VariantID vid) NESTED_INIT_DATA_SETUP_OMP_TARGET; - using EXEC_POL = + using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::Collapse, // k, j, i @@ -87,8 +87,8 @@ void NESTED_INIT::runOpenMPTargetVariant(VariantID vid) NESTED_INIT_DATA_TEARDOWN_OMP_TARGET; - } else { - std::cout << "\n NESTED_INIT : Unknown variant id = " << vid << std::endl; + } else { + getCout() << "\n NESTED_INIT : Unknown variant id = " << vid << std::endl; } } diff --git a/src/basic/NESTED_INIT-Seq.cpp b/src/basic/NESTED_INIT-Seq.cpp index f79cc9603..578a544e0 100644 --- a/src/basic/NESTED_INIT-Seq.cpp +++ b/src/basic/NESTED_INIT-Seq.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -71,7 +71,7 @@ void NESTED_INIT::runSeqVariant(VariantID vid) case RAJA_Seq : { - using EXEC_POL = + using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::For<2, RAJA::loop_exec, // k RAJA::statement::For<1, RAJA::loop_exec, // j @@ -99,7 +99,7 @@ void NESTED_INIT::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n NESTED_INIT : Unknown variant id = " << vid << std::endl; + getCout() << "\n NESTED_INIT : Unknown variant id = " << vid << std::endl; } } diff --git a/src/basic/PI_ATOMIC-Cuda.cpp b/src/basic/PI_ATOMIC-Cuda.cpp index 57522fed3..889af65a8 100644 --- a/src/basic/PI_ATOMIC-Cuda.cpp +++ b/src/basic/PI_ATOMIC-Cuda.cpp @@ -123,7 +123,7 @@ void PI_ATOMIC::runCudaVariant(VariantID vid) PI_ATOMIC_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n PI_ATOMIC : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n PI_ATOMIC : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/basic/PI_ATOMIC-Hip.cpp b/src/basic/PI_ATOMIC-Hip.cpp index 0910a4198..9b5bead53 100644 --- a/src/basic/PI_ATOMIC-Hip.cpp +++ b/src/basic/PI_ATOMIC-Hip.cpp @@ -125,7 +125,7 @@ void PI_ATOMIC::runHipVariant(VariantID vid) PI_ATOMIC_DATA_TEARDOWN_HIP; } else { - std::cout << "\n PI_ATOMIC : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n PI_ATOMIC : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/basic/PI_ATOMIC-OMP.cpp b/src/basic/PI_ATOMIC-OMP.cpp index b30352de7..555e22826 100644 --- a/src/basic/PI_ATOMIC-OMP.cpp +++ b/src/basic/PI_ATOMIC-OMP.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -40,7 +40,7 @@ void PI_ATOMIC::runOpenMPVariant(VariantID vid) for (Index_type i = ibegin; i < iend; ++i ) { double x = (double(i) + 0.5) * dx; #pragma omp 
atomic - *pi += dx / (1.0 + x * x); + *pi += dx / (1.0 + x * x); } *pi *= 4.0; @@ -80,7 +80,7 @@ void PI_ATOMIC::runOpenMPVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { *pi = m_pi_init; - RAJA::forall( + RAJA::forall( RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { double x = (double(i) + 0.5) * dx; RAJA::atomicAdd(pi, dx / (1.0 + x * x)); @@ -94,12 +94,12 @@ void PI_ATOMIC::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n PI_ATOMIC : Unknown variant id = " << vid << std::endl; + getCout() << "\n PI_ATOMIC : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/basic/PI_ATOMIC-OMPTarget.cpp b/src/basic/PI_ATOMIC-OMPTarget.cpp index 08cc41167..0d6443423 100644 --- a/src/basic/PI_ATOMIC-OMPTarget.cpp +++ b/src/basic/PI_ATOMIC-OMPTarget.cpp @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -52,7 +52,7 @@ void PI_ATOMIC::runOpenMPTargetVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { initOpenMPDeviceData(pi, &m_pi_init, 1, did, hid); - + #pragma omp target is_device_ptr(pi) device( did ) #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type i = ibegin; i < iend; ++i ) { @@ -84,16 +84,16 @@ void PI_ATOMIC::runOpenMPTargetVariant(VariantID vid) RAJA::atomicAdd(pi, dx / (1.0 + x * x)); }); - getOpenMPDeviceData(m_pi, pi, 1, hid, did); + getOpenMPDeviceData(m_pi, pi, 1, hid, did); *m_pi *= 4.0; } stopTimer(); PI_ATOMIC_DATA_TEARDOWN_OMP_TARGET; - + } else { - std::cout << "\n PI_ATOMIC : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n PI_ATOMIC : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/basic/PI_ATOMIC-Seq.cpp b/src/basic/PI_ATOMIC-Seq.cpp index 941062fed..0fee34737 100644 --- a/src/basic/PI_ATOMIC-Seq.cpp +++ b/src/basic/PI_ATOMIC-Seq.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -73,9 +73,9 @@ void PI_ATOMIC::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - + *pi = m_pi_init; - RAJA::forall( RAJA::RangeSegment(ibegin, iend), + RAJA::forall( RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { double x = (double(i) + 0.5) * dx; RAJA::atomicAdd(pi, dx / (1.0 + x * x)); @@ -90,7 +90,7 @@ void PI_ATOMIC::runSeqVariant(VariantID vid) #endif default : { - std::cout << "\n PI_ATOMIC : Unknown variant id = " << vid << std::endl; + getCout() << "\n PI_ATOMIC : Unknown variant id = " << vid << std::endl; } } diff --git a/src/basic/PI_REDUCE-Cuda.cpp b/src/basic/PI_REDUCE-Cuda.cpp index 0c9d38c13..1c8897fe8 100644 --- a/src/basic/PI_REDUCE-Cuda.cpp +++ b/src/basic/PI_REDUCE-Cuda.cpp @@ -38,7 +38,7 @@ __global__ void pi_reduce(Real_type dx, ppi[ threadIdx.x ] = pi_init; for ( ; i < iend ; i += gridDim.x * blockDim.x ) { double x = (double(i) + 0.5) * dx; - ppi[ threadIdx.x ] += dx / (1.0 + x * x); + ppi[ threadIdx.x ] += dx / (1.0 + x * x); } __syncthreads(); @@ -57,7 +57,7 @@ __global__ void pi_reduce(Real_type dx, if ( threadIdx.x == 0 ) { *dpi += ppi[ 0 ]; } -#endif +#endif } @@ -81,8 +81,8 @@ void PI_REDUCE::runCudaVariant(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); pi_reduce<<>>( dx, - dpi, m_pi_init, + sizeof(Real_type)*block_size>>>( dx, + dpi, m_pi_init, iend ); cudaErrchk( cudaGetLastError() ); @@ -115,7 +115,7 @@ void PI_REDUCE::runCudaVariant(VariantID vid) 
stopTimer(); } else { - std::cout << "\n PI_REDUCE : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n PI_REDUCE : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/basic/PI_REDUCE-Hip.cpp b/src/basic/PI_REDUCE-Hip.cpp index 41a0a8ae9..a7a22482d 100644 --- a/src/basic/PI_REDUCE-Hip.cpp +++ b/src/basic/PI_REDUCE-Hip.cpp @@ -38,7 +38,7 @@ __global__ void pi_reduce(Real_type dx, ppi[ threadIdx.x ] = pi_init; for ( ; i < iend ; i += gridDim.x * blockDim.x ) { double x = (double(i) + 0.5) * dx; - ppi[ threadIdx.x ] += dx / (1.0 + x * x); + ppi[ threadIdx.x ] += dx / (1.0 + x * x); } __syncthreads(); @@ -57,7 +57,7 @@ __global__ void pi_reduce(Real_type dx, if ( threadIdx.x == 0 ) i{ *dpi += ppi[ 0 ]; } -#endif +#endif } @@ -80,7 +80,7 @@ void PI_REDUCE::runHipVariant(VariantID vid) initHipDeviceData(dpi, &m_pi_init, 1); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL( (pi_reduce), dim3(grid_size), dim3(block_size), + hipLaunchKernelGGL( (pi_reduce), dim3(grid_size), dim3(block_size), sizeof(Real_type)*block_size, 0, dx, dpi, m_pi_init, iend ); hipErrchk( hipGetLastError() ); @@ -114,7 +114,7 @@ void PI_REDUCE::runHipVariant(VariantID vid) stopTimer(); } else { - std::cout << "\n PI_REDUCE : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n PI_REDUCE : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/basic/PI_REDUCE-OMP.cpp b/src/basic/PI_REDUCE-OMP.cpp index 3261b8e61..56a7c59cd 100644 --- a/src/basic/PI_REDUCE-OMP.cpp +++ b/src/basic/PI_REDUCE-OMP.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -36,7 +36,7 @@ void PI_REDUCE::runOpenMPVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { Real_type pi = m_pi_init; - + #pragma omp parallel for reduction(+:pi) for (Index_type i = ibegin; i < iend; ++i ) { PI_REDUCE_BODY; @@ -79,11 +79,11 @@ void PI_REDUCE::runOpenMPVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum pi(m_pi_init); - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), + RAJA::ReduceSum pi(m_pi_init); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { PI_REDUCE_BODY; }); @@ -97,12 +97,12 @@ void PI_REDUCE::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n PI_REDUCE : Unknown variant id = " << vid << std::endl; + getCout() << "\n PI_REDUCE : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/basic/PI_REDUCE-OMPTarget.cpp b/src/basic/PI_REDUCE-OMPTarget.cpp index 60eaa4a84..4f4870b5c 100644 --- a/src/basic/PI_REDUCE-OMPTarget.cpp +++ b/src/basic/PI_REDUCE-OMPTarget.cpp @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -43,8 +43,8 @@ void PI_REDUCE::runOpenMPTargetVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { Real_type pi = m_pi_init; - - #pragma omp target device( did ) map(tofrom:pi) + + #pragma omp target device( did ) map(tofrom:pi) #pragma omp teams distribute parallel for reduction(+:pi) \ thread_limit(threads_per_team) schedule(static, 1) for (Index_type i = ibegin; i < iend; ++i ) { @@ -60,11 +60,11 @@ void PI_REDUCE::runOpenMPTargetVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum pi(m_pi_init); + + RAJA::ReduceSum pi(m_pi_init); RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), + 
RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { PI_REDUCE_BODY; }); @@ -75,7 +75,7 @@ void PI_REDUCE::runOpenMPTargetVariant(VariantID vid) stopTimer(); } else { - std::cout << "\n PI_REDUCE : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n PI_REDUCE : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/basic/PI_REDUCE-Seq.cpp b/src/basic/PI_REDUCE-Seq.cpp index 70ee92f79..577ebfb6b 100644 --- a/src/basic/PI_REDUCE-Seq.cpp +++ b/src/basic/PI_REDUCE-Seq.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -76,10 +76,10 @@ void PI_REDUCE::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum pi(m_pi_init); - RAJA::forall( RAJA::RangeSegment(ibegin, iend), + RAJA::ReduceSum pi(m_pi_init); + + RAJA::forall( RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { PI_REDUCE_BODY; }); @@ -94,7 +94,7 @@ void PI_REDUCE::runSeqVariant(VariantID vid) #endif default : { - std::cout << "\n PI_REDUCE : Unknown variant id = " << vid << std::endl; + getCout() << "\n PI_REDUCE : Unknown variant id = " << vid << std::endl; } } diff --git a/src/basic/REDUCE3_INT-Cuda.cpp b/src/basic/REDUCE3_INT-Cuda.cpp index 50481f5b2..a4bba12c6 100644 --- a/src/basic/REDUCE3_INT-Cuda.cpp +++ b/src/basic/REDUCE3_INT-Cuda.cpp @@ -159,7 +159,7 @@ void REDUCE3_INT::runCudaVariant(VariantID vid) REDUCE3_INT_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n REDUCE3_INT : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n REDUCE3_INT : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/basic/REDUCE3_INT-Hip.cpp b/src/basic/REDUCE3_INT-Hip.cpp index ba13fa8af..c8955fc18 100644 --- a/src/basic/REDUCE3_INT-Hip.cpp +++ b/src/basic/REDUCE3_INT-Hip.cpp @@ -159,7 +159,7 @@ void REDUCE3_INT::runHipVariant(VariantID vid) REDUCE3_INT_DATA_TEARDOWN_HIP; } else { - std::cout << "\n REDUCE3_INT : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n REDUCE3_INT : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/basic/REDUCE3_INT-OMP.cpp b/src/basic/REDUCE3_INT-OMP.cpp index f0e853f0a..bb22a3ce8 100644 --- a/src/basic/REDUCE3_INT-OMP.cpp +++ b/src/basic/REDUCE3_INT-OMP.cpp @@ -13,7 +13,7 @@ #include #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -82,7 +82,7 @@ void REDUCE3_INT::runOpenMPVariant(VariantID vid) m_vsum += vsum; m_vmin = RAJA_MIN(m_vmin, vmin); m_vmax = RAJA_MAX(m_vmax, vmax); - + } stopTimer(); @@ -93,7 +93,7 @@ void REDUCE3_INT::runOpenMPVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - + RAJA::ReduceSum vsum(m_vsum_init); RAJA::ReduceMin vmin(m_vmin_init); RAJA::ReduceMax vmax(m_vmax_init); @@ -114,12 +114,12 @@ void REDUCE3_INT::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n REDUCE3_INT : Unknown variant id = " << vid << std::endl; + getCout() << "\n REDUCE3_INT : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/basic/REDUCE3_INT-OMPTarget.cpp b/src/basic/REDUCE3_INT-OMPTarget.cpp index ef11b6f5d..08f184510 100644 --- a/src/basic/REDUCE3_INT-OMPTarget.cpp +++ b/src/basic/REDUCE3_INT-OMPTarget.cpp @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -100,7 +100,7 @@ void REDUCE3_INT::runOpenMPTargetVariant(VariantID vid) REDUCE3_INT_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n REDUCE3_INT : Unknown OMP Target 
variant id = " << vid << std::endl; + getCout() << "\n REDUCE3_INT : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/basic/REDUCE3_INT-Seq.cpp b/src/basic/REDUCE3_INT-Seq.cpp index 04760c9a5..7170cecd7 100644 --- a/src/basic/REDUCE3_INT-Seq.cpp +++ b/src/basic/REDUCE3_INT-Seq.cpp @@ -13,7 +13,7 @@ #include #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -108,7 +108,7 @@ void REDUCE3_INT::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n REDUCE3_INT : Unknown variant id = " << vid << std::endl; + getCout() << "\n REDUCE3_INT : Unknown variant id = " << vid << std::endl; } } diff --git a/src/basic/TRAP_INT-Cuda.cpp b/src/basic/TRAP_INT-Cuda.cpp index 305104c4a..21b95fc81 100644 --- a/src/basic/TRAP_INT-Cuda.cpp +++ b/src/basic/TRAP_INT-Cuda.cpp @@ -149,7 +149,7 @@ void TRAP_INT::runCudaVariant(VariantID vid) TRAP_INT_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n TRAP_INT : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n TRAP_INT : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/basic/TRAP_INT-Hip.cpp b/src/basic/TRAP_INT-Hip.cpp index 40e6158bb..b378a5504 100644 --- a/src/basic/TRAP_INT-Hip.cpp +++ b/src/basic/TRAP_INT-Hip.cpp @@ -148,7 +148,7 @@ void TRAP_INT::runHipVariant(VariantID vid) TRAP_INT_DATA_TEARDOWN_HIP; } else { - std::cout << "\n TRAP_INT : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n TRAP_INT : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/basic/TRAP_INT-OMP.cpp b/src/basic/TRAP_INT-OMP.cpp index cfc449c77..19017fa76 100644 --- a/src/basic/TRAP_INT-OMP.cpp +++ b/src/basic/TRAP_INT-OMP.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -110,12 +110,12 @@ void TRAP_INT::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n TRAP_INT : Unknown variant id = " << vid << std::endl; + getCout() << "\n TRAP_INT : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/basic/TRAP_INT-OMPTarget.cpp b/src/basic/TRAP_INT-OMPTarget.cpp index 636f4090a..ad5717923 100644 --- a/src/basic/TRAP_INT-OMPTarget.cpp +++ b/src/basic/TRAP_INT-OMPTarget.cpp @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -66,8 +66,8 @@ void TRAP_INT::runOpenMPTargetVariant(VariantID vid) Real_type sumx = m_sumx_init; #pragma omp target teams distribute parallel for map(tofrom: sumx) reduction(+:sumx) \ - thread_limit(threads_per_team) schedule(static, 1) - + thread_limit(threads_per_team) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { TRAP_INT_BODY; } @@ -77,7 +77,7 @@ void TRAP_INT::runOpenMPTargetVariant(VariantID vid) } stopTimer(); - #pragma omp target exit data map(delete: x0,xp,y,yp,h) + #pragma omp target exit data map(delete: x0,xp,y,yp,h) } else if ( vid == RAJA_OpenMPTarget ) { @@ -101,7 +101,7 @@ void TRAP_INT::runOpenMPTargetVariant(VariantID vid) TRAP_INT_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n TRAP_INT : Unknown OMP Targetvariant id = " << vid << std::endl; + getCout() << "\n TRAP_INT : Unknown OMP Targetvariant id = " << vid << std::endl; } } diff --git a/src/basic/TRAP_INT-Seq.cpp b/src/basic/TRAP_INT-Seq.cpp index a1d657392..ba411e513 100644 --- a/src/basic/TRAP_INT-Seq.cpp +++ b/src/basic/TRAP_INT-Seq.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace basic { @@ -108,7 +108,7 @@ void TRAP_INT::runSeqVariant(VariantID 
vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n TRAP_INT : Unknown variant id = " << vid << std::endl; + getCout() << "\n TRAP_INT : Unknown variant id = " << vid << std::endl; } } diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index 41b089fa3..49da5c23d 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -50,8 +50,8 @@ void allocAndInitData(Int_ptr& ptr, int len, VariantID vid) */ void allocAndInitData(Real_ptr& ptr, int len, VariantID vid ) { - ptr = - RAJA::allocate_aligned_type(RAJA::DATA_ALIGN, + ptr = + RAJA::allocate_aligned_type(RAJA::DATA_ALIGN, len*sizeof(Real_type)); initData(ptr, len, vid); } @@ -61,8 +61,8 @@ void allocAndInitDataConst(Real_ptr& ptr, int len, Real_type val, { (void) vid; - ptr = - RAJA::allocate_aligned_type(RAJA::DATA_ALIGN, + ptr = + RAJA::allocate_aligned_type(RAJA::DATA_ALIGN, len*sizeof(Real_type)); initDataConst(ptr, len, val, vid); } @@ -95,7 +95,7 @@ void allocAndInitData(Complex_ptr& ptr, int len, VariantID vid) * Free data arrays of given type. */ void deallocData(Int_ptr& ptr) -{ +{ if (ptr) { delete [] ptr; ptr = 0; @@ -103,7 +103,7 @@ void deallocData(Int_ptr& ptr) } void deallocData(Real_ptr& ptr) -{ +{ if (ptr) { RAJA::free_aligned(ptr); ptr = 0; @@ -112,7 +112,7 @@ void deallocData(Real_ptr& ptr) void deallocData(Complex_ptr& ptr) { - if (ptr) { + if (ptr) { delete [] ptr; ptr = 0; } @@ -120,7 +120,7 @@ void deallocData(Complex_ptr& ptr) /* - * \brief Initialize Int_type data array to + * \brief Initialize Int_type data array to * randomly signed positive and negative values. */ void initData(Int_ptr& ptr, int len, VariantID vid) @@ -148,11 +148,11 @@ void initData(Int_ptr& ptr, int len, VariantID vid) ptr[i] = ( signfact < 0.5 ? -1 : 1 ); }; - signfact = Real_type(rand())/RAND_MAX; + signfact = Real_type(rand())/RAND_MAX; Int_type ilo = len * signfact; ptr[ilo] = -58; - signfact = Real_type(rand())/RAND_MAX; + signfact = Real_type(rand())/RAND_MAX; Int_type ihi = len * signfact; ptr[ihi] = 19; @@ -160,11 +160,11 @@ void initData(Int_ptr& ptr, int len, VariantID vid) } /* - * Initialize Real_type data array to non-random - * positive values (0.0, 1.0) based on their array position + * Initialize Real_type data array to non-random + * positive values (0.0, 1.0) based on their array position * (index) and the order in which this method is called. */ -void initData(Real_ptr& ptr, int len, VariantID vid) +void initData(Real_ptr& ptr, int len, VariantID vid) { (void) vid; @@ -172,19 +172,19 @@ void initData(Real_ptr& ptr, int len, VariantID vid) // first touch... #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - if ( vid == Base_OpenMP || + if ( vid == Base_OpenMP || vid == Lambda_OpenMP || vid == RAJA_OpenMP ) { #pragma omp parallel for - for (int i = 0; i < len; ++i) { + for (int i = 0; i < len; ++i) { ptr[i] = factor*(i + 1.1)/(i + 1.12345); }; - } + } #endif for (int i = 0; i < len; ++i) { ptr[i] = factor*(i + 1.1)/(i + 1.12345); - } + } incDataInitCount(); } @@ -193,7 +193,7 @@ void initData(Real_ptr& ptr, int len, VariantID vid) * Initialize Real_type data array to constant values. */ void initDataConst(Real_ptr& ptr, int len, Real_type val, - VariantID vid) + VariantID vid) { // first touch... 
@@ -289,10 +289,10 @@ void initData(Complex_ptr& ptr, int len, VariantID vid) #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) if ( vid == Base_OpenMP || - vid == Lambda_OpenMP || + vid == Lambda_OpenMP || vid == RAJA_OpenMP ) { #pragma omp parallel for - for (int i = 0; i < len; ++i) { + for (int i = 0; i < len; ++i) { ptr[i] = factor*(i + 1.1)/(i + 1.12345); }; } @@ -322,7 +322,7 @@ void initData(Real_type& d, VariantID vid) /* * Calculate and return checksum for data arrays. */ -long double calcChecksum(const Real_ptr ptr, int len, +long double calcChecksum(const Real_ptr ptr, int len, Real_type scale_factor) { long double tchk = 0.0; @@ -330,7 +330,7 @@ long double calcChecksum(const Real_ptr ptr, int len, tchk += (j+1)*ptr[j]*scale_factor; #if 0 // RDH DEBUG if ( (j % 100) == 0 ) { - std::cout << "j : tchk = " << j << " : " << tchk << std::endl; + getCout() << "j : tchk = " << j << " : " << tchk << std::endl; } #endif } @@ -345,7 +345,7 @@ long double calcChecksum(const Complex_ptr ptr, int len, tchk += (j+1)*(real(ptr[j])+imag(ptr[j]))*scale_factor; #if 0 // RDH DEBUG if ( (j % 100) == 0 ) { - std::cout << "j : tchk = " << j << " : " << tchk << std::endl; + getCout() << "j : tchk = " << j << " : " << tchk << std::endl; } #endif } diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index cc32c6cf6..7e3893d98 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -56,7 +56,7 @@ void Executor::setupSuite() return; } - cout << "\nSetting up suite based on input..." << endl; + getCout() << "\nSetting up suite based on input..." << endl; using Slist = list; using Svector = vector; @@ -704,7 +704,7 @@ void Executor::runSuite() return; } - cout << "\n\nRun warmup kernels...\n"; + getCout() << "\n\nRun warmup kernels...\n"; vector warmup_kernels; @@ -714,16 +714,16 @@ void Executor::runSuite() for (size_t ik = 0; ik < warmup_kernels.size(); ++ik) { KernelBase* warmup_kernel = warmup_kernels[ik]; - cout << "Kernel : " << warmup_kernel->getName() << endl; + getCout() << "Kernel : " << warmup_kernel->getName() << endl; for (size_t iv = 0; iv < variant_ids.size(); ++iv) { VariantID vid = variant_ids[iv]; if ( run_params.showProgress() ) { if ( warmup_kernel->hasVariantDefined(vid) ) { - cout << " Running "; + getCout() << " Running "; } else { - cout << " No "; + getCout() << " No "; } - cout << getVariantName(vid) << " variant" << endl; + getCout() << getVariantName(vid) << " variant" << endl; } if ( warmup_kernel->hasVariantDefined(vid) ) { warmup_kernel->execute(vid); @@ -733,18 +733,18 @@ void Executor::runSuite() } - cout << "\n\nRunning specified kernels and variants...\n"; + getCout() << "\n\nRunning specified kernels and variants...\n"; const int npasses = run_params.getNumPasses(); for (int ip = 0; ip < npasses; ++ip) { if ( run_params.showProgress() ) { - std::cout << "\nPass through suite # " << ip << "\n"; + getCout() << "\nPass through suite # " << ip << "\n"; } for (size_t ik = 0; ik < kernels.size(); ++ik) { KernelBase* kernel = kernels[ik]; if ( run_params.showProgress() ) { - std::cout << "\nRun kernel -- " << kernel->getName() << "\n"; + getCout() << "\nRun kernel -- " << kernel->getName() << "\n"; } for (size_t iv = 0; iv < variant_ids.size(); ++iv) { @@ -752,11 +752,11 @@ void Executor::runSuite() KernelBase* kern = kernels[ik]; if ( run_params.showProgress() ) { if ( kern->hasVariantDefined(vid) ) { - cout << " Running "; + getCout() << " Running "; } else { - cout << " No "; + getCout() << " No "; } - cout << getVariantName(vid) << " variant" 
<< endl; + getCout() << getVariantName(vid) << " variant" << endl; } if ( kern->hasVariantDefined(vid) ) { kernels[ik]->execute(vid); @@ -777,7 +777,7 @@ void Executor::outputRunData() return; } - cout << "\n\nGenerate run report files...\n"; + getCout() << "\n\nGenerate run report files...\n"; // // Generate output file prefix (including directory path). @@ -1245,7 +1245,7 @@ string Executor::getReportTitle(CSVRepMode mode) } break; } - default : { cout << "\n Unknown CSV report mode = " << mode << endl; } + default : { getCout() << "\n Unknown CSV report mode = " << mode << endl; } }; return title; } @@ -1269,8 +1269,8 @@ long double Executor::getReportDataEntry(CSVRepMode mode, retval = 0.0; } #if 0 // RDH DEBUG (leave this here, it's useful for debugging!) - cout << "Kernel(iv): " << kern->getName() << "(" << vid << ")" << endl; - cout << "\tref_time, tot_time, retval = " + getCout() << "Kernel(iv): " << kern->getName() << "(" << vid << ")" << endl; + getCout() << "\tref_time, tot_time, retval = " << kern->getTotTime(reference_vid) << " , " << kern->getTotTime(vid) << " , " << retval << endl; @@ -1278,7 +1278,7 @@ long double Executor::getReportDataEntry(CSVRepMode mode, } break; } - default : { cout << "\n Unknown CSV report mode = " << mode << endl; } + default : { getCout() << "\n Unknown CSV report mode = " << mode << endl; } }; return retval; } @@ -1315,12 +1315,12 @@ void Executor::getFOMGroups(vector& fom_groups) } // iterate over variant ids to run #if 0 // RDH DEBUG (leave this here, it's useful for debugging!) - cout << "\nFOMGroups..." << endl; + getCout() << "\nFOMGroups..." << endl; for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { const FOMGroup& group = fom_groups[ifg]; - cout << "\tBase : " << getVariantName(group.base) << endl; + getCout() << "\tBase : " << getVariantName(group.base) << endl; for (size_t iv = 0; iv < group.variants.size(); ++iv) { - cout << "\t\t " << getVariantName(group.variants[iv]) << endl; + getCout() << "\t\t " << getVariantName(group.variants[iv]) << endl; } } #endif diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index 69d195700..ea21e5e68 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -15,7 +15,7 @@ namespace rajaperf { KernelBase::KernelBase(KernelID kid, const RunParams& params) : - run_params(params) + run_params(params) { kernel_id = kid; name = getFullKernelName(kernel_id); @@ -24,7 +24,7 @@ KernelBase::KernelBase(KernelID kid, const RunParams& params) : default_reps = -1; actual_prob_size = -1; - + for (size_t fid = 0; fid < NumFeatures; ++fid) { uses_feature[fid] = false; } @@ -51,17 +51,17 @@ KernelBase::KernelBase(KernelID kid, const RunParams& params) : } } - + KernelBase::~KernelBase() { } Index_type KernelBase::getTargetProblemSize() const -{ +{ Index_type target_size = static_cast(0); if (run_params.getSizeMeaning() == RunParams::SizeMeaning::Factor) { - target_size = + target_size = static_cast(default_prob_size*run_params.getSizeFactor()); } else if (run_params.getSizeMeaning() == RunParams::SizeMeaning::Direct) { target_size = static_cast(run_params.getSize()); @@ -70,23 +70,23 @@ Index_type KernelBase::getTargetProblemSize() const } Index_type KernelBase::getRunReps() const -{ +{ Index_type run_reps = static_cast(0); if (run_params.getInputState() == RunParams::CheckRun) { run_reps = static_cast(run_params.getCheckRunReps()); } else { - run_reps = static_cast(default_reps*run_params.getRepFactor()); + run_reps = static_cast(default_reps*run_params.getRepFactor()); } return 
run_reps; } -void KernelBase::setVariantDefined(VariantID vid) +void KernelBase::setVariantDefined(VariantID vid) { - has_variant_defined[vid] = isVariantAvailable(vid); + has_variant_defined[vid] = isVariantAvailable(vid); } -void KernelBase::execute(VariantID vid) +void KernelBase::execute(VariantID vid) { running_variant = vid; @@ -94,14 +94,14 @@ void KernelBase::execute(VariantID vid) resetDataInitCount(); this->setUp(vid); - - this->runKernel(vid); - this->updateChecksum(vid); + this->runKernel(vid); + + this->updateChecksum(vid); this->tearDown(vid); - running_variant = NumVariants; + running_variant = NumVariants; } void KernelBase::recordExecTime() @@ -178,7 +178,7 @@ void KernelBase::runKernel(VariantID vid) default : { #if 0 - std::cout << "\n " << getName() + getCout() << "\n " << getName() << " : Unknown variant id = " << vid << std::endl; #endif } @@ -195,13 +195,13 @@ void KernelBase::print(std::ostream& os) const os << "\t\t\t actual_prob_size = " << actual_prob_size << std::endl; os << "\t\t\t uses_feature: " << std::endl; for (unsigned j = 0; j < NumFeatures; ++j) { - os << "\t\t\t\t" << getFeatureName(static_cast(j)) - << " : " << uses_feature[j] << std::endl; + os << "\t\t\t\t" << getFeatureName(static_cast(j)) + << " : " << uses_feature[j] << std::endl; } os << "\t\t\t has_variant_defined: " << std::endl; for (unsigned j = 0; j < NumVariants; ++j) { - os << "\t\t\t\t" << getVariantName(static_cast(j)) - << " : " << has_variant_defined[j] << std::endl; + os << "\t\t\t\t" << getVariantName(static_cast(j)) + << " : " << has_variant_defined[j] << std::endl; } os << "\t\t\t its_per_rep = " << its_per_rep << std::endl; os << "\t\t\t kernels_per_rep = " << kernels_per_rep << std::endl; @@ -209,28 +209,28 @@ void KernelBase::print(std::ostream& os) const os << "\t\t\t FLOPs_per_rep = " << FLOPs_per_rep << std::endl; os << "\t\t\t num_exec: " << std::endl; for (unsigned j = 0; j < NumVariants; ++j) { - os << "\t\t\t\t" << getVariantName(static_cast(j)) - << " : " << num_exec[j] << std::endl; + os << "\t\t\t\t" << getVariantName(static_cast(j)) + << " : " << num_exec[j] << std::endl; } os << "\t\t\t min_time: " << std::endl; for (unsigned j = 0; j < NumVariants; ++j) { - os << "\t\t\t\t" << getVariantName(static_cast(j)) - << " : " << min_time[j] << std::endl; + os << "\t\t\t\t" << getVariantName(static_cast(j)) + << " : " << min_time[j] << std::endl; } os << "\t\t\t max_time: " << std::endl; for (unsigned j = 0; j < NumVariants; ++j) { - os << "\t\t\t\t" << getVariantName(static_cast(j)) - << " : " << max_time[j] << std::endl; + os << "\t\t\t\t" << getVariantName(static_cast(j)) + << " : " << max_time[j] << std::endl; } os << "\t\t\t tot_time: " << std::endl; for (unsigned j = 0; j < NumVariants; ++j) { - os << "\t\t\t\t" << getVariantName(static_cast(j)) - << " : " << tot_time[j] << std::endl; + os << "\t\t\t\t" << getVariantName(static_cast(j)) + << " : " << tot_time[j] << std::endl; } os << "\t\t\t checksum: " << std::endl; for (unsigned j = 0; j < NumVariants; ++j) { - os << "\t\t\t\t" << getVariantName(static_cast(j)) - << " : " << checksum[j] << std::endl; + os << "\t\t\t\t" << getVariantName(static_cast(j)) + << " : " << checksum[j] << std::endl; } os << std::endl; } diff --git a/src/common/OutputUtils.cpp b/src/common/OutputUtils.cpp index 0696b078f..5aea14855 100644 --- a/src/common/OutputUtils.cpp +++ b/src/common/OutputUtils.cpp @@ -6,6 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// 
+#include "RAJAPerfSuite.hpp" #include "OutputUtils.hpp" #include @@ -72,7 +73,7 @@ std::string recursiveMkdir(const std::string& in_path) */ if (pos >= 0) { if (!S_ISDIR(status.st_mode)) { - std::cout << "Cannot create directories in path = " << path + getCout() << "Cannot create directories in path = " << path << "\n because some intermediate item in path exists and" << "is NOT a directory" << std::endl; outpath = std::string(); @@ -88,7 +89,7 @@ std::string recursiveMkdir(const std::string& in_path) */ if ( !outpath.empty() && pos < 0) { if (mkdir(path_buf, mode) != 0) { - std::cout << " Cannot create directory = " + getCout() << " Cannot create directory = " << path_buf << std::endl; outpath = std::string(); } @@ -113,7 +114,7 @@ std::string recursiveMkdir(const std::string& in_path) /* make directory if not at end of path */ if (pos < length) { if (mkdir(path_buf, mode) != 0) { - std::cout << " Cannot create directory = " + getCout() << " Cannot create directory = " << path_buf << std::endl; outpath = std::string(); } diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 748fb1325..f6a8a8662 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -10,6 +10,10 @@ #include "RunParams.hpp" +#ifdef RAJA_PERFSUITE_ENABLE_MPI +#include +#endif + // // Basic kernels... // @@ -348,7 +352,7 @@ const std::string& getVariantName(VariantID vid) /*! ******************************************************************************* * - * Return true if variant associated with VariantID enum value is available + * Return true if variant associated with VariantID enum value is available * to run; else false. * ******************************************************************************* @@ -361,22 +365,22 @@ bool isVariantAvailable(VariantID vid) ret_val = true; } #if defined(RUN_RAJA_SEQ) - if ( vid == Lambda_Seq || + if ( vid == Lambda_Seq || vid == RAJA_Seq ) { ret_val = true; } #endif #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - if ( vid == Base_OpenMP || - vid == Lambda_OpenMP || + if ( vid == Base_OpenMP || + vid == Lambda_OpenMP || vid == RAJA_OpenMP ) { ret_val = true; } #endif #if defined(RAJA_ENABLE_TARGET_OPENMP) - if ( vid == Base_OpenMPTarget || + if ( vid == Base_OpenMPTarget || vid == RAJA_OpenMPTarget ) { ret_val = true; } @@ -672,7 +676,7 @@ KernelBase* getKernelObject(KernelID kid, } default: { - std::cout << "\n Unknown Kernel ID = " << kid << std::endl; + getCout() << "\n Unknown Kernel ID = " << kid << std::endl; } } // end switch on kernel id @@ -680,4 +684,39 @@ KernelBase* getKernelObject(KernelID kid, return kernel; } +// subclass of streambuf that ignores overflow +// never printing anything to the underlying stream +struct NullStream : std::streambuf, std::ostream +{ + using Base = std::streambuf; + using int_type = typename Base::int_type; + + NullStream() : std::ostream(this) {} +public: + int_type overflow(int_type c) override { return c; } +}; + +std::ostream* makeNullStream() +{ + return new NullStream(); +} + +std::ostream& getNullStream() +{ + static NullStream null_stream; + return null_stream; +} + +std::ostream& getCout() +{ + int rank = 0; +#ifdef RAJA_PERFSUITE_ENABLE_MPI + MPI_Comm_rank(MPI_COMM_WORLD, &rank); +#endif + if (rank == 0) { + return std::cout; + } + return getNullStream(); +} + } // closing brace for rajaperf namespace diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 4e4d38a66..efa13b41b 100644 --- a/src/common/RAJAPerfSuite.hpp +++ 
b/src/common/RAJAPerfSuite.hpp @@ -17,6 +17,7 @@ #include "rajaperf_config.hpp" #include +#include namespace rajaperf { @@ -32,8 +33,8 @@ class RunParams; * * IMPORTANT: This is only modified when a group is added or removed. * - * ENUM VALUES MUST BE KEPT CONSISTENT (CORRESPONDING ONE-TO-ONE) - * WITH ARRAY OF GROUP NAMES IN IMPLEMENTATION FILE!!! + * ENUM VALUES MUST BE KEPT CONSISTENT (CORRESPONDING ONE-TO-ONE) + * WITH ARRAY OF GROUP NAMES IN IMPLEMENTATION FILE!!! * ******************************************************************************* */ @@ -59,8 +60,8 @@ enum GroupID { * * IMPORTANT: This is only modified when a kernel is added or removed. * - * ENUM VALUES MUST BE KEPT CONSISTENT (CORRESPONDING ONE-TO-ONE) - * WITH ARRAY OF KERNEL NAMES IN IMPLEMENTATION FILE!!! + * ENUM VALUES MUST BE KEPT CONSISTENT (CORRESPONDING ONE-TO-ONE) + * WITH ARRAY OF KERNEL NAMES IN IMPLEMENTATION FILE!!! * ******************************************************************************* */ @@ -158,7 +159,7 @@ enum KernelID { * IMPORTANT: This is only modified when a new variant is added to the suite. * * IT MUST BE KEPT CONSISTENT (CORRESPONDING ONE-TO-ONE) WITH - * ARRAY OF VARIANT NAMES IN IMPLEMENTATION FILE!!! + * ARRAY OF VARIANT NAMES IN IMPLEMENTATION FILE!!! * ******************************************************************************* */ @@ -208,7 +209,7 @@ enum FeatureID { Sort, Scan, - Workgroup, + Workgroup, Reduction, Atomic, @@ -258,12 +259,12 @@ const std::string& getFullKernelName(KernelID kid); * ******************************************************************************* */ -const std::string& getVariantName(VariantID vid); +const std::string& getVariantName(VariantID vid); /*! ******************************************************************************* * - * \brief Return true if variant associated with VariantID enum value is + * \brief Return true if variant associated with VariantID enum value is * available * to run; else false. * ******************************************************************************* @@ -290,6 +291,35 @@ const std::string& getFeatureName(FeatureID vid); */ KernelBase* getKernelObject(KernelID kid, const RunParams& run_params); +/*! + ******************************************************************************* + * + * \brief Return ostream used as cout. + * + * IMPORTANT: May return a non-printing stream when MPI is enabled. + * + ******************************************************************************* + */ +std::ostream& getCout(); + +/*! + ******************************************************************************* + * + * \brief Return non-printing ostream. + * + ******************************************************************************* + */ +std::ostream* makeNullStream(); + +/*! + ******************************************************************************* + * + * \brief Return reference to global non-printing ostream. + * + ******************************************************************************* + */ +std::ostream& getNullStream(); + } // closing brace for rajaperf namespace #endif // closing endif for header file include guard diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index e038863c1..98b23da47 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -156,7 +156,7 @@ void RunParams::print(std::ostream& str) const */ void RunParams::parseCommandLineOptions(int argc, char** argv) { - std::cout << "\n\nReading command line input..." 
<< std::endl; + getCout() << "\n\nReading command line input..." << std::endl; for (int i = 1; i < argc; ++i) { @@ -165,7 +165,7 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) if ( opt == std::string("--help") || opt == std::string("-h") ) { - printHelpMessage(std::cout); + printHelpMessage(getCout()); input_state = InfoRequest; } else if ( opt == std::string("--show-progress") || @@ -176,31 +176,31 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) } else if ( opt == std::string("--print-kernels") || opt == std::string("-pk") ) { - printFullKernelNames(std::cout); + printFullKernelNames(getCout()); input_state = InfoRequest; } else if ( opt == std::string("--print-variants") || opt == std::string("-pv") ) { - printVariantNames(std::cout); + printVariantNames(getCout()); input_state = InfoRequest; } else if ( opt == std::string("--print-features") || opt == std::string("-pf") ) { - printFeatureNames(std::cout); + printFeatureNames(getCout()); input_state = InfoRequest; } else if ( opt == std::string("--print-feature-kernels") || opt == std::string("-pfk") ) { - printFeatureKernels(std::cout); + printFeatureKernels(getCout()); input_state = InfoRequest; } else if ( opt == std::string("--print-kernel-features") || opt == std::string("-pkf") ) { - printKernelFeatures(std::cout); + printKernelFeatures(getCout()); input_state = InfoRequest; } else if ( opt == std::string("--npasses") ) { @@ -209,7 +209,7 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) if ( i < argc ) { npasses = ::atoi( argv[i] ); } else { - std::cout << "\nBad input:" + getCout() << "\nBad input:" << " must give --npasses a value for number of passes (int)" << std::endl; input_state = BadInput; @@ -221,7 +221,7 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) if ( i < argc ) { rep_fact = ::atof( argv[i] ); } else { - std::cout << "\nBad input:" + getCout() << "\nBad input:" << " must give --rep_fact a value (double)" << std::endl; input_state = BadInput; @@ -232,7 +232,7 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) i++; if ( i < argc ) { if (size_meaning == SizeMeaning::Direct) { - std::cout << "\nBad input:" + getCout() << "\nBad input:" << " may only set one of --size and --sizefact" << std::endl; input_state = BadInput; @@ -241,14 +241,14 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) if ( size_factor >= 0.0 ) { size_meaning = SizeMeaning::Factor; } else { - std::cout << "\nBad input:" + getCout() << "\nBad input:" << " must give --sizefact a POSITIVE value (double)" << std::endl; input_state = BadInput; } } } else { - std::cout << "\nBad input:" + getCout() << "\nBad input:" << " must give --sizefact a value (double)" << std::endl; input_state = BadInput; @@ -259,7 +259,7 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) i++; if ( i < argc ) { if (size_meaning == SizeMeaning::Factor) { - std::cout << "\nBad input:" + getCout() << "\nBad input:" << " may only set one of --size and --sizefact" << std::endl; input_state = BadInput; @@ -268,14 +268,14 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) if ( size >= 0.0 ) { size_meaning = SizeMeaning::Direct; } else { - std::cout << "\nBad input:" + getCout() << "\nBad input:" << " must give --size a POSITIVE value (double)" << std::endl; input_state = BadInput; } } } else { - std::cout << "\nBad input:" + getCout() << "\nBad input:" << " must give --size a value (int)" << std::endl; input_state = BadInput; @@ -288,7 +288,7 @@ void 
RunParams::parseCommandLineOptions(int argc, char** argv) if ( i < argc ) { pf_tol = ::atof( argv[i] ); } else { - std::cout << "\nBad input:" + getCout() << "\nBad input:" << " must give --pass-fail-tol (or -pftol) a value (double)" << std::endl; input_state = BadInput; @@ -455,8 +455,8 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) input_state = BadInput; std::string huh(argv[i]); - std::cout << "\nUnknown option: " << huh << std::endl; - std::cout.flush(); + getCout() << "\nUnknown option: " << huh << std::endl; + getCout().flush(); } diff --git a/src/lcals/DIFF_PREDICT-Cuda.cpp b/src/lcals/DIFF_PREDICT-Cuda.cpp index 7be0908b4..94303ca97 100644 --- a/src/lcals/DIFF_PREDICT-Cuda.cpp +++ b/src/lcals/DIFF_PREDICT-Cuda.cpp @@ -91,7 +91,7 @@ void DIFF_PREDICT::runCudaVariant(VariantID vid) DIFF_PREDICT_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n DIFF_PREDICT : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n DIFF_PREDICT : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/lcals/DIFF_PREDICT-Hip.cpp b/src/lcals/DIFF_PREDICT-Hip.cpp index 8bc38e983..850a3ab1f 100644 --- a/src/lcals/DIFF_PREDICT-Hip.cpp +++ b/src/lcals/DIFF_PREDICT-Hip.cpp @@ -91,7 +91,7 @@ void DIFF_PREDICT::runHipVariant(VariantID vid) DIFF_PREDICT_DATA_TEARDOWN_HIP; } else { - std::cout << "\n DIFF_PREDICT : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n DIFF_PREDICT : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/lcals/DIFF_PREDICT-OMP.cpp b/src/lcals/DIFF_PREDICT-OMP.cpp index 0b50b2bb4..ebe91fe92 100644 --- a/src/lcals/DIFF_PREDICT-OMP.cpp +++ b/src/lcals/DIFF_PREDICT-OMP.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -81,12 +81,12 @@ void DIFF_PREDICT::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n DIFF_PREDICT : Unknown variant id = " << vid << std::endl; + getCout() << "\n DIFF_PREDICT : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/lcals/DIFF_PREDICT-OMPTarget.cpp b/src/lcals/DIFF_PREDICT-OMPTarget.cpp index dadb14c81..60e6c45e7 100644 --- a/src/lcals/DIFF_PREDICT-OMPTarget.cpp +++ b/src/lcals/DIFF_PREDICT-OMPTarget.cpp @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -55,7 +55,7 @@ void DIFF_PREDICT::runOpenMPTargetVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp target is_device_ptr(px, cx) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type i = ibegin; i < iend; ++i ) { DIFF_PREDICT_BODY; } @@ -83,7 +83,7 @@ void DIFF_PREDICT::runOpenMPTargetVariant(VariantID vid) DIFF_PREDICT_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n DIFF_PREDICT : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n DIFF_PREDICT : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/lcals/DIFF_PREDICT-Seq.cpp b/src/lcals/DIFF_PREDICT-Seq.cpp index 7329386eb..bf74477e7 100644 --- a/src/lcals/DIFF_PREDICT-Seq.cpp +++ b/src/lcals/DIFF_PREDICT-Seq.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -62,7 +62,7 @@ void DIFF_PREDICT::runSeqVariant(VariantID vid) break; } - + case RAJA_Seq : { startTimer(); @@ -79,7 +79,7 @@ void DIFF_PREDICT::runSeqVariant(VariantID vid) #endif 
// RUN_RAJA_SEQ default : { - std::cout << "\n DIFF_PREDICT : Unknown variant id = " << vid << std::endl; + getCout() << "\n DIFF_PREDICT : Unknown variant id = " << vid << std::endl; } } diff --git a/src/lcals/EOS-Cuda.cpp b/src/lcals/EOS-Cuda.cpp index c66a99545..ac630c028 100644 --- a/src/lcals/EOS-Cuda.cpp +++ b/src/lcals/EOS-Cuda.cpp @@ -95,7 +95,7 @@ void EOS::runCudaVariant(VariantID vid) EOS_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n EOS : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n EOS : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/lcals/EOS-Hip.cpp b/src/lcals/EOS-Hip.cpp index 53f952a25..36e0daf88 100644 --- a/src/lcals/EOS-Hip.cpp +++ b/src/lcals/EOS-Hip.cpp @@ -95,7 +95,7 @@ void EOS::runHipVariant(VariantID vid) EOS_DATA_TEARDOWN_HIP; } else { - std::cout << "\n EOS : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n EOS : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/lcals/EOS-OMP.cpp b/src/lcals/EOS-OMP.cpp index 9654eef46..c5a8c8490 100644 --- a/src/lcals/EOS-OMP.cpp +++ b/src/lcals/EOS-OMP.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -81,12 +81,12 @@ void EOS::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n EOS : Unknown variant id = " << vid << std::endl; + getCout() << "\n EOS : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/lcals/EOS-OMPTarget.cpp b/src/lcals/EOS-OMPTarget.cpp index 189746801..b0f2fe008 100644 --- a/src/lcals/EOS-OMPTarget.cpp +++ b/src/lcals/EOS-OMPTarget.cpp @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -59,7 +59,7 @@ void EOS::runOpenMPTargetVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp target is_device_ptr(x, y, z, u) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type i = ibegin; i < iend; ++i ) { EOS_BODY; } @@ -79,15 +79,15 @@ void EOS::runOpenMPTargetVariant(VariantID vid) RAJA::forall>( RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { EOS_BODY; - }); + }); } stopTimer(); EOS_DATA_TEARDOWN_OMP_TARGET - } else { - std::cout << "\n EOS : Unknown OMP Tagretvariant id = " << vid << std::endl; + } else { + getCout() << "\n EOS : Unknown OMP Tagretvariant id = " << vid << std::endl; } } diff --git a/src/lcals/EOS-Seq.cpp b/src/lcals/EOS-Seq.cpp index a33b776cc..66f308c2b 100644 --- a/src/lcals/EOS-Seq.cpp +++ b/src/lcals/EOS-Seq.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -79,7 +79,7 @@ void EOS::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n EOS : Unknown variant id = " << vid << std::endl; + getCout() << "\n EOS : Unknown variant id = " << vid << std::endl; } } diff --git a/src/lcals/FIRST_DIFF-Cuda.cpp b/src/lcals/FIRST_DIFF-Cuda.cpp index f8330fdfc..d24afedc0 100644 --- a/src/lcals/FIRST_DIFF-Cuda.cpp +++ b/src/lcals/FIRST_DIFF-Cuda.cpp @@ -89,7 +89,7 @@ void FIRST_DIFF::runCudaVariant(VariantID vid) FIRST_DIFF_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n FIRST_DIFF : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n FIRST_DIFF : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/lcals/FIRST_DIFF-Hip.cpp b/src/lcals/FIRST_DIFF-Hip.cpp index 
4ac557fec..1cdf0cd15 100644 --- a/src/lcals/FIRST_DIFF-Hip.cpp +++ b/src/lcals/FIRST_DIFF-Hip.cpp @@ -89,7 +89,7 @@ void FIRST_DIFF::runHipVariant(VariantID vid) FIRST_DIFF_DATA_TEARDOWN_HIP; } else { - std::cout << "\n FIRST_DIFF : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n FIRST_DIFF : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/lcals/FIRST_DIFF-OMP.cpp b/src/lcals/FIRST_DIFF-OMP.cpp index 1c6287d78..ae2a2e995 100644 --- a/src/lcals/FIRST_DIFF-OMP.cpp +++ b/src/lcals/FIRST_DIFF-OMP.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -81,12 +81,12 @@ void FIRST_DIFF::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n FIRST_DIFF : Unknown variant id = " << vid << std::endl; + getCout() << "\n FIRST_DIFF : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/lcals/FIRST_DIFF-OMPTarget.cpp b/src/lcals/FIRST_DIFF-OMPTarget.cpp index 85e022f1c..0688731d4 100644 --- a/src/lcals/FIRST_DIFF-OMPTarget.cpp +++ b/src/lcals/FIRST_DIFF-OMPTarget.cpp @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -46,17 +46,17 @@ void FIRST_DIFF::runOpenMPTargetVariant(VariantID vid) const Index_type iend = getActualProblemSize(); FIRST_DIFF_DATA_SETUP; - + if ( vid == Base_OpenMPTarget ) { FIRST_DIFF_DATA_SETUP_OMP_TARGET; - + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp target is_device_ptr(x, y) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) - + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { FIRST_DIFF_BODY; } @@ -65,11 +65,11 @@ void FIRST_DIFF::runOpenMPTargetVariant(VariantID vid) stopTimer(); FIRST_DIFF_DATA_TEARDOWN_OMP_TARGET; - + } else if ( vid == RAJA_OpenMPTarget ) { FIRST_DIFF_DATA_SETUP_OMP_TARGET; - + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -82,9 +82,9 @@ void FIRST_DIFF::runOpenMPTargetVariant(VariantID vid) stopTimer(); FIRST_DIFF_DATA_TEARDOWN_OMP_TARGET; - - } else { - std::cout << "\n FIRST_DIFF : Unknown OMP Target variant id = " << vid << std::endl; + + } else { + getCout() << "\n FIRST_DIFF : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/lcals/FIRST_DIFF-Seq.cpp b/src/lcals/FIRST_DIFF-Seq.cpp index 3b6cefb9a..62b43af09 100644 --- a/src/lcals/FIRST_DIFF-Seq.cpp +++ b/src/lcals/FIRST_DIFF-Seq.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -79,7 +79,7 @@ void FIRST_DIFF::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n FIRST_DIFF : Unknown variant id = " << vid << std::endl; + getCout() << "\n FIRST_DIFF : Unknown variant id = " << vid << std::endl; } } diff --git a/src/lcals/FIRST_MIN-Cuda.cpp b/src/lcals/FIRST_MIN-Cuda.cpp index ef87159f2..17ef21e59 100644 --- a/src/lcals/FIRST_MIN-Cuda.cpp +++ b/src/lcals/FIRST_MIN-Cuda.cpp @@ -129,7 +129,7 @@ void FIRST_MIN::runCudaVariant(VariantID vid) FIRST_MIN_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n FIRST_MIN : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n FIRST_MIN : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/lcals/FIRST_MIN-Hip.cpp b/src/lcals/FIRST_MIN-Hip.cpp index 9880927e6..9024275d6 100644 --- a/src/lcals/FIRST_MIN-Hip.cpp +++ b/src/lcals/FIRST_MIN-Hip.cpp @@ 
-129,7 +129,7 @@ void FIRST_MIN::runHipVariant(VariantID vid) FIRST_MIN_DATA_TEARDOWN_HIP; } else { - std::cout << "\n FIRST_MIN : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n FIRST_MIN : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/lcals/FIRST_MIN-OMP.cpp b/src/lcals/FIRST_MIN-OMP.cpp index 85e2f7db0..9ebc5f326 100644 --- a/src/lcals/FIRST_MIN-OMP.cpp +++ b/src/lcals/FIRST_MIN-OMP.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -43,7 +43,7 @@ void FIRST_MIN::runOpenMPVariant(VariantID vid) #pragma omp parallel for reduction(minloc:mymin) for (Index_type i = ibegin; i < iend; ++i ) { - FIRST_MIN_BODY; + FIRST_MIN_BODY; } m_minloc = RAJA_MAX(m_minloc, mymin.loc); @@ -97,7 +97,7 @@ void FIRST_MIN::runOpenMPVariant(VariantID vid) FIRST_MIN_BODY_RAJA; }); - m_minloc = RAJA_MAX(m_minloc, loc.getLoc()); + m_minloc = RAJA_MAX(m_minloc, loc.getLoc()); } stopTimer(); @@ -106,12 +106,12 @@ void FIRST_MIN::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n FIRST_MIN : Unknown variant id = " << vid << std::endl; + getCout() << "\n FIRST_MIN : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/lcals/FIRST_MIN-OMPTarget.cpp b/src/lcals/FIRST_MIN-OMPTarget.cpp index 41ac5f225..274e9affe 100644 --- a/src/lcals/FIRST_MIN-OMPTarget.cpp +++ b/src/lcals/FIRST_MIN-OMPTarget.cpp @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -44,11 +44,11 @@ void FIRST_MIN::runOpenMPTargetVariant(VariantID vid) const Index_type iend = getActualProblemSize(); FIRST_MIN_DATA_SETUP; - + if ( vid == Base_OpenMPTarget ) { FIRST_MIN_DATA_SETUP_OMP_TARGET; - + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -70,11 +70,11 @@ void FIRST_MIN::runOpenMPTargetVariant(VariantID vid) stopTimer(); FIRST_MIN_DATA_TEARDOWN_OMP_TARGET; - + } else if ( vid == RAJA_OpenMPTarget ) { FIRST_MIN_DATA_SETUP_OMP_TARGET; - + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -92,9 +92,9 @@ void FIRST_MIN::runOpenMPTargetVariant(VariantID vid) stopTimer(); FIRST_MIN_DATA_TEARDOWN_OMP_TARGET; - - } else { - std::cout << "\n FIRST_MIN : Unknown OMP Target variant id = " << vid << std::endl; + + } else { + getCout() << "\n FIRST_MIN : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/lcals/FIRST_MIN-Seq.cpp b/src/lcals/FIRST_MIN-Seq.cpp index 560b78c34..fec75aadc 100644 --- a/src/lcals/FIRST_MIN-Seq.cpp +++ b/src/lcals/FIRST_MIN-Seq.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -97,7 +97,7 @@ void FIRST_MIN::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n FIRST_MIN : Unknown variant id = " << vid << std::endl; + getCout() << "\n FIRST_MIN : Unknown variant id = " << vid << std::endl; } } diff --git a/src/lcals/FIRST_SUM-Cuda.cpp b/src/lcals/FIRST_SUM-Cuda.cpp index 55d0e2214..a5198111c 100644 --- a/src/lcals/FIRST_SUM-Cuda.cpp +++ b/src/lcals/FIRST_SUM-Cuda.cpp @@ -89,7 +89,7 @@ void FIRST_SUM::runCudaVariant(VariantID vid) FIRST_SUM_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n FIRST_SUM : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n FIRST_SUM : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/lcals/FIRST_SUM-Hip.cpp b/src/lcals/FIRST_SUM-Hip.cpp index 0f2cb2ede..d94a4b1e3 100644 --- a/src/lcals/FIRST_SUM-Hip.cpp +++ b/src/lcals/FIRST_SUM-Hip.cpp @@ -89,7 
+89,7 @@ void FIRST_SUM::runHipVariant(VariantID vid) FIRST_SUM_DATA_TEARDOWN_HIP; } else { - std::cout << "\n FIRST_SUM : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n FIRST_SUM : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/lcals/FIRST_SUM-OMP.cpp b/src/lcals/FIRST_SUM-OMP.cpp index 41e15c1b6..b9905666a 100644 --- a/src/lcals/FIRST_SUM-OMP.cpp +++ b/src/lcals/FIRST_SUM-OMP.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -81,12 +81,12 @@ void FIRST_SUM::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n FIRST_SUM : Unknown variant id = " << vid << std::endl; + getCout() << "\n FIRST_SUM : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/lcals/FIRST_SUM-OMPTarget.cpp b/src/lcals/FIRST_SUM-OMPTarget.cpp index 24c183e3c..19344df4c 100644 --- a/src/lcals/FIRST_SUM-OMPTarget.cpp +++ b/src/lcals/FIRST_SUM-OMPTarget.cpp @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -46,17 +46,17 @@ void FIRST_SUM::runOpenMPTargetVariant(VariantID vid) const Index_type iend = getActualProblemSize(); FIRST_SUM_DATA_SETUP; - + if ( vid == Base_OpenMPTarget ) { FIRST_SUM_DATA_SETUP_OMP_TARGET; - + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp target is_device_ptr(x, y) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) - + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { FIRST_SUM_BODY; } @@ -65,11 +65,11 @@ void FIRST_SUM::runOpenMPTargetVariant(VariantID vid) stopTimer(); FIRST_SUM_DATA_TEARDOWN_OMP_TARGET; - + } else if ( vid == RAJA_OpenMPTarget ) { FIRST_SUM_DATA_SETUP_OMP_TARGET; - + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -82,9 +82,9 @@ void FIRST_SUM::runOpenMPTargetVariant(VariantID vid) stopTimer(); FIRST_SUM_DATA_TEARDOWN_OMP_TARGET; - - } else { - std::cout << "\n FIRST_SUM : Unknown OMP Target variant id = " << vid << std::endl; + + } else { + getCout() << "\n FIRST_SUM : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/lcals/FIRST_SUM-Seq.cpp b/src/lcals/FIRST_SUM-Seq.cpp index 60bb5756d..cbb96a695 100644 --- a/src/lcals/FIRST_SUM-Seq.cpp +++ b/src/lcals/FIRST_SUM-Seq.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -79,7 +79,7 @@ void FIRST_SUM::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n FIRST_SUM : Unknown variant id = " << vid << std::endl; + getCout() << "\n FIRST_SUM : Unknown variant id = " << vid << std::endl; } } diff --git a/src/lcals/GEN_LIN_RECUR-Cuda.cpp b/src/lcals/GEN_LIN_RECUR-Cuda.cpp index 53793af75..40b5b3d15 100644 --- a/src/lcals/GEN_LIN_RECUR-Cuda.cpp +++ b/src/lcals/GEN_LIN_RECUR-Cuda.cpp @@ -116,7 +116,7 @@ void GEN_LIN_RECUR::runCudaVariant(VariantID vid) GEN_LIN_RECUR_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n GEN_LIN_RECUR : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n GEN_LIN_RECUR : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/lcals/GEN_LIN_RECUR-Hip.cpp b/src/lcals/GEN_LIN_RECUR-Hip.cpp index 7d96b27f4..bbe147f2e 100644 --- a/src/lcals/GEN_LIN_RECUR-Hip.cpp +++ b/src/lcals/GEN_LIN_RECUR-Hip.cpp @@ -118,7 +118,7 @@ void GEN_LIN_RECUR::runHipVariant(VariantID vid) 
GEN_LIN_RECUR_DATA_TEARDOWN_HIP; } else { - std::cout << "\n GEN_LIN_RECUR : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n GEN_LIN_RECUR : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/lcals/GEN_LIN_RECUR-OMP.cpp b/src/lcals/GEN_LIN_RECUR-OMP.cpp index c70bdef44..087fc82f3 100644 --- a/src/lcals/GEN_LIN_RECUR-OMP.cpp +++ b/src/lcals/GEN_LIN_RECUR-OMP.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -95,12 +95,12 @@ void GEN_LIN_RECUR::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n GEN_LIN_RECUR : Unknown variant id = " << vid << std::endl; + getCout() << "\n GEN_LIN_RECUR : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/lcals/GEN_LIN_RECUR-OMPTarget.cpp b/src/lcals/GEN_LIN_RECUR-OMPTarget.cpp index e72163afb..7cb0fd72b 100644 --- a/src/lcals/GEN_LIN_RECUR-OMPTarget.cpp +++ b/src/lcals/GEN_LIN_RECUR-OMPTarget.cpp @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -57,7 +57,7 @@ void GEN_LIN_RECUR::runOpenMPTargetVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp target is_device_ptr(b5, stb5, sa, sb) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type k = 0; k < N; ++k ) { GEN_LIN_RECUR_BODY1; } @@ -95,8 +95,8 @@ void GEN_LIN_RECUR::runOpenMPTargetVariant(VariantID vid) GEN_LIN_RECUR_DATA_TEARDOWN_OMP_TARGET - } else { - std::cout << "\n GEN_LIN_RECUR : Unknown OMP Tagretvariant id = " << vid << std::endl; + } else { + getCout() << "\n GEN_LIN_RECUR : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/lcals/GEN_LIN_RECUR-Seq.cpp b/src/lcals/GEN_LIN_RECUR-Seq.cpp index e6f2233b3..f7ed58734 100644 --- a/src/lcals/GEN_LIN_RECUR-Seq.cpp +++ b/src/lcals/GEN_LIN_RECUR-Seq.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -91,7 +91,7 @@ void GEN_LIN_RECUR::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n GEN_LIN_RECUR : Unknown variant id = " << vid << std::endl; + getCout() << "\n GEN_LIN_RECUR : Unknown variant id = " << vid << std::endl; } } diff --git a/src/lcals/HYDRO_1D-Cuda.cpp b/src/lcals/HYDRO_1D-Cuda.cpp index 74f102f5f..4d71069e1 100644 --- a/src/lcals/HYDRO_1D-Cuda.cpp +++ b/src/lcals/HYDRO_1D-Cuda.cpp @@ -93,7 +93,7 @@ void HYDRO_1D::runCudaVariant(VariantID vid) HYDRO_1D_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n HYDRO_1D : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n HYDRO_1D : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/lcals/HYDRO_1D-Hip.cpp b/src/lcals/HYDRO_1D-Hip.cpp index 6c06b2de0..93f3f5042 100644 --- a/src/lcals/HYDRO_1D-Hip.cpp +++ b/src/lcals/HYDRO_1D-Hip.cpp @@ -93,7 +93,7 @@ void HYDRO_1D::runHipVariant(VariantID vid) HYDRO_1D_DATA_TEARDOWN_HIP; } else { - std::cout << "\n HYDRO_1D : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n HYDRO_1D : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/lcals/HYDRO_1D-OMP.cpp b/src/lcals/HYDRO_1D-OMP.cpp index f6e0e2277..94390485f 100644 --- a/src/lcals/HYDRO_1D-OMP.cpp +++ b/src/lcals/HYDRO_1D-OMP.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -82,12 +82,12 @@ void 
HYDRO_1D::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n HYDRO_1D : Unknown variant id = " << vid << std::endl; + getCout() << "\n HYDRO_1D : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/lcals/HYDRO_1D-OMPTarget.cpp b/src/lcals/HYDRO_1D-OMPTarget.cpp index 9adafba0a..c313951b5 100644 --- a/src/lcals/HYDRO_1D-OMPTarget.cpp +++ b/src/lcals/HYDRO_1D-OMPTarget.cpp @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -57,7 +57,7 @@ void HYDRO_1D::runOpenMPTargetVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp target is_device_ptr(x, y, z) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type i = ibegin; i < iend; ++i ) { HYDRO_1D_BODY; } @@ -85,7 +85,7 @@ void HYDRO_1D::runOpenMPTargetVariant(VariantID vid) HYDRO_1D_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n HYDRO_1D : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n HYDRO_1D : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/lcals/HYDRO_1D-Seq.cpp b/src/lcals/HYDRO_1D-Seq.cpp index cdf086ffe..a257b08bb 100644 --- a/src/lcals/HYDRO_1D-Seq.cpp +++ b/src/lcals/HYDRO_1D-Seq.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -79,7 +79,7 @@ void HYDRO_1D::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n HYDRO_1D : Unknown variant id = " << vid << std::endl; + getCout() << "\n HYDRO_1D : Unknown variant id = " << vid << std::endl; } } diff --git a/src/lcals/HYDRO_2D-Cuda.cpp b/src/lcals/HYDRO_2D-Cuda.cpp index 2c0087358..d9afba50d 100644 --- a/src/lcals/HYDRO_2D-Cuda.cpp +++ b/src/lcals/HYDRO_2D-Cuda.cpp @@ -127,7 +127,7 @@ void HYDRO_2D::runCudaVariant(VariantID vid) HYDRO_2D_THREADS_PER_BLOCK_CUDA; HYDRO_2D_NBLOCKS_CUDA; - + hydro_2d1<<>>(zadat, zbdat, zpdat, zqdat, zrdat, zmdat, jn, kn); @@ -159,9 +159,9 @@ void HYDRO_2D::runCudaVariant(VariantID vid) using EXECPOL = RAJA::KernelPolicy< RAJA::statement::CudaKernelFixedAsync, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_y_direct, - RAJA::statement::Tile<1, RAJA::tile_fixed, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_x_direct, RAJA::statement::For<0, RAJA::cuda_thread_y_direct, // k RAJA::statement::For<1, RAJA::cuda_thread_x_direct, // j @@ -172,7 +172,7 @@ void HYDRO_2D::runCudaVariant(VariantID vid) > > >; - + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -203,7 +203,7 @@ void HYDRO_2D::runCudaVariant(VariantID vid) HYDRO_2D_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n HYDRO_2D : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n HYDRO_2D : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/lcals/HYDRO_2D-Hip.cpp b/src/lcals/HYDRO_2D-Hip.cpp index a492999ec..785a94c09 100644 --- a/src/lcals/HYDRO_2D-Hip.cpp +++ b/src/lcals/HYDRO_2D-Hip.cpp @@ -127,14 +127,14 @@ void HYDRO_2D::runHipVariant(VariantID vid) HYDRO_2D_THREADS_PER_BLOCK_HIP; HYDRO_2D_NBLOCKS_HIP; - hipLaunchKernelGGL((hydro_2d1), + hipLaunchKernelGGL((hydro_2d1), dim3(nblocks), dim3(nthreads_per_block), 0, 0, zadat, zbdat, zpdat, zqdat, zrdat, zmdat, jn, kn); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((hydro_2d2), + hipLaunchKernelGGL((hydro_2d2), dim3(nblocks), 
dim3(nthreads_per_block), 0, 0, zudat, zvdat, zadat, zbdat, zzdat, zrdat, @@ -142,7 +142,7 @@ void HYDRO_2D::runHipVariant(VariantID vid) jn, kn); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((hydro_2d3), + hipLaunchKernelGGL((hydro_2d3), dim3(nblocks), dim3(nthreads_per_block), 0, 0, zroutdat, zzoutdat, zrdat, zudat, zzdat, zvdat, @@ -208,7 +208,7 @@ void HYDRO_2D::runHipVariant(VariantID vid) HYDRO_2D_DATA_TEARDOWN_HIP; } else { - std::cout << "\n HYDRO_2D : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n HYDRO_2D : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/lcals/HYDRO_2D-OMP.cpp b/src/lcals/HYDRO_2D-OMP.cpp index 0c2ce4001..d22502523 100644 --- a/src/lcals/HYDRO_2D-OMP.cpp +++ b/src/lcals/HYDRO_2D-OMP.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -147,19 +147,19 @@ void HYDRO_2D::runOpenMPVariant(VariantID vid) RAJA::kernel( RAJA::make_tuple( RAJA::RangeSegment(kbeg, kend), RAJA::RangeSegment(jbeg, jend)), - hydro2d_lam1); + hydro2d_lam1); RAJA::kernel( RAJA::make_tuple( RAJA::RangeSegment(kbeg, kend), RAJA::RangeSegment(jbeg, jend)), - hydro2d_lam2); + hydro2d_lam2); RAJA::kernel( RAJA::make_tuple( RAJA::RangeSegment(kbeg, kend), RAJA::RangeSegment(jbeg, jend)), - hydro2d_lam3); + hydro2d_lam3); - }); // end omp parallel region + }); // end omp parallel region } stopTimer(); @@ -168,12 +168,12 @@ void HYDRO_2D::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n HYDRO_2D : Unknown variant id = " << vid << std::endl; + getCout() << "\n HYDRO_2D : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/lcals/HYDRO_2D-OMPTarget.cpp b/src/lcals/HYDRO_2D-OMPTarget.cpp index ccd813749..ccac11396 100644 --- a/src/lcals/HYDRO_2D-OMPTarget.cpp +++ b/src/lcals/HYDRO_2D-OMPTarget.cpp @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -73,7 +73,7 @@ void HYDRO_2D::runOpenMPTargetVariant(VariantID vid) #pragma omp target is_device_ptr(zadat, zbdat, zpdat, \ zqdat, zrdat, zmdat) device( did ) - #pragma omp teams distribute parallel for schedule(static, 1) collapse(2) + #pragma omp teams distribute parallel for schedule(static, 1) collapse(2) for (Index_type k = kbeg; k < kend; ++k ) { for (Index_type j = jbeg; j < jend; ++j ) { HYDRO_2D_BODY1; @@ -82,7 +82,7 @@ void HYDRO_2D::runOpenMPTargetVariant(VariantID vid) #pragma omp target is_device_ptr(zudat, zvdat, zadat, \ zbdat, zzdat, zrdat) device( did ) - #pragma omp teams distribute parallel for schedule(static, 1) collapse(2) + #pragma omp teams distribute parallel for schedule(static, 1) collapse(2) for (Index_type k = kbeg; k < kend; ++k ) { for (Index_type j = jbeg; j < jend; ++j ) { HYDRO_2D_BODY2; @@ -91,7 +91,7 @@ void HYDRO_2D::runOpenMPTargetVariant(VariantID vid) #pragma omp target is_device_ptr(zroutdat, zzoutdat, \ zrdat, zudat, zzdat, zvdat) device( did ) - #pragma omp teams distribute parallel for schedule(static, 1) collapse(2) + #pragma omp teams distribute parallel for schedule(static, 1) collapse(2) for (Index_type k = kbeg; k < kend; ++k ) { for (Index_type j = jbeg; j < jend; ++j ) { HYDRO_2D_BODY3; @@ -147,7 +147,7 @@ void HYDRO_2D::runOpenMPTargetVariant(VariantID vid) HYDRO_2D_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n HYDRO_2D : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n HYDRO_2D : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/lcals/HYDRO_2D-Seq.cpp 
b/src/lcals/HYDRO_2D-Seq.cpp index 1a4ecd726..023819bec 100644 --- a/src/lcals/HYDRO_2D-Seq.cpp +++ b/src/lcals/HYDRO_2D-Seq.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -128,17 +128,17 @@ void HYDRO_2D::runSeqVariant(VariantID vid) RAJA::kernel( RAJA::make_tuple( RAJA::RangeSegment(kbeg, kend), RAJA::RangeSegment(jbeg, jend)), - hydro2d_lam1); + hydro2d_lam1); RAJA::kernel( RAJA::make_tuple( RAJA::RangeSegment(kbeg, kend), RAJA::RangeSegment(jbeg, jend)), - hydro2d_lam2); + hydro2d_lam2); RAJA::kernel( RAJA::make_tuple( RAJA::RangeSegment(kbeg, kend), RAJA::RangeSegment(jbeg, jend)), - hydro2d_lam3); + hydro2d_lam3); } stopTimer(); @@ -148,7 +148,7 @@ void HYDRO_2D::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n HYDRO_2D : Unknown variant id = " << vid << std::endl; + getCout() << "\n HYDRO_2D : Unknown variant id = " << vid << std::endl; } } diff --git a/src/lcals/INT_PREDICT-Cuda.cpp b/src/lcals/INT_PREDICT-Cuda.cpp index 651f5f862..5772a8e9c 100644 --- a/src/lcals/INT_PREDICT-Cuda.cpp +++ b/src/lcals/INT_PREDICT-Cuda.cpp @@ -94,7 +94,7 @@ void INT_PREDICT::runCudaVariant(VariantID vid) INT_PREDICT_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n INT_PREDICT : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n INT_PREDICT : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/lcals/INT_PREDICT-Hip.cpp b/src/lcals/INT_PREDICT-Hip.cpp index 8d758f70d..7a55f20ca 100644 --- a/src/lcals/INT_PREDICT-Hip.cpp +++ b/src/lcals/INT_PREDICT-Hip.cpp @@ -94,7 +94,7 @@ void INT_PREDICT::runHipVariant(VariantID vid) INT_PREDICT_DATA_TEARDOWN_HIP; } else { - std::cout << "\n INT_PREDICT : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n INT_PREDICT : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/lcals/INT_PREDICT-OMP.cpp b/src/lcals/INT_PREDICT-OMP.cpp index a5f8512af..486d9178e 100644 --- a/src/lcals/INT_PREDICT-OMP.cpp +++ b/src/lcals/INT_PREDICT-OMP.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -81,12 +81,12 @@ void INT_PREDICT::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n INT_PREDICT : Unknown variant id = " << vid << std::endl; + getCout() << "\n INT_PREDICT : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/lcals/INT_PREDICT-OMPTarget.cpp b/src/lcals/INT_PREDICT-OMPTarget.cpp index 44cceb4a7..4fcc54307 100644 --- a/src/lcals/INT_PREDICT-OMPTarget.cpp +++ b/src/lcals/INT_PREDICT-OMPTarget.cpp @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -48,12 +48,12 @@ void INT_PREDICT::runOpenMPTargetVariant(VariantID vid) if ( vid == Base_OpenMPTarget ) { INT_PREDICT_DATA_SETUP_OMP_TARGET; - + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp target is_device_ptr(px) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type i = ibegin; i < iend; ++i ) { INT_PREDICT_BODY; } @@ -62,13 +62,13 @@ void INT_PREDICT::runOpenMPTargetVariant(VariantID vid) stopTimer(); INT_PREDICT_DATA_TEARDOWN_OMP_TARGET; - + } else if ( vid == RAJA_OpenMPTarget ) { INT_PREDICT_DATA_SETUP_OMP_TARGET; startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + for (RepIndex_type irep = 0; irep < run_reps; 
++irep) { RAJA::forall>( RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { @@ -81,7 +81,7 @@ void INT_PREDICT::runOpenMPTargetVariant(VariantID vid) INT_PREDICT_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n INT_PREDICT : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n INT_PREDICT : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/lcals/INT_PREDICT-Seq.cpp b/src/lcals/INT_PREDICT-Seq.cpp index de69a7d4b..61032c6db 100644 --- a/src/lcals/INT_PREDICT-Seq.cpp +++ b/src/lcals/INT_PREDICT-Seq.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -79,7 +79,7 @@ void INT_PREDICT::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n INT_PREDICT : Unknown variant id = " << vid << std::endl; + getCout() << "\n INT_PREDICT : Unknown variant id = " << vid << std::endl; } } diff --git a/src/lcals/PLANCKIAN-Cuda.cpp b/src/lcals/PLANCKIAN-Cuda.cpp index fd46a4fdf..fe210a699 100644 --- a/src/lcals/PLANCKIAN-Cuda.cpp +++ b/src/lcals/PLANCKIAN-Cuda.cpp @@ -98,7 +98,7 @@ void PLANCKIAN::runCudaVariant(VariantID vid) PLANCKIAN_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n PLANCKIAN : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n PLANCKIAN : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/lcals/PLANCKIAN-Hip.cpp b/src/lcals/PLANCKIAN-Hip.cpp index f47d04ce9..c2e566ad8 100644 --- a/src/lcals/PLANCKIAN-Hip.cpp +++ b/src/lcals/PLANCKIAN-Hip.cpp @@ -98,7 +98,7 @@ void PLANCKIAN::runHipVariant(VariantID vid) PLANCKIAN_DATA_TEARDOWN_HIP; } else { - std::cout << "\n PLANCKIAN : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n PLANCKIAN : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/lcals/PLANCKIAN-OMP.cpp b/src/lcals/PLANCKIAN-OMP.cpp index 01c76f5ed..8a890654e 100644 --- a/src/lcals/PLANCKIAN-OMP.cpp +++ b/src/lcals/PLANCKIAN-OMP.cpp @@ -13,7 +13,7 @@ #include #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -82,12 +82,12 @@ void PLANCKIAN::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n PLANCKIAN : Unknown variant id = " << vid << std::endl; + getCout() << "\n PLANCKIAN : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/lcals/PLANCKIAN-OMPTarget.cpp b/src/lcals/PLANCKIAN-OMPTarget.cpp index 61fa12b1d..f6471f7b6 100644 --- a/src/lcals/PLANCKIAN-OMPTarget.cpp +++ b/src/lcals/PLANCKIAN-OMPTarget.cpp @@ -17,7 +17,7 @@ #include #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -57,12 +57,12 @@ void PLANCKIAN::runOpenMPTargetVariant(VariantID vid) if ( vid == Base_OpenMPTarget ) { PLANCKIAN_DATA_SETUP_OMP_TARGET; - + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp target is_device_ptr(x, y, u, v, w) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type i = ibegin; i < iend; ++i ) { PLANCKIAN_BODY; } @@ -70,12 +70,12 @@ void PLANCKIAN::runOpenMPTargetVariant(VariantID vid) } stopTimer(); - PLANCKIAN_DATA_TEARDOWN_OMP_TARGET; + PLANCKIAN_DATA_TEARDOWN_OMP_TARGET; } else if ( vid == RAJA_OpenMPTarget ) { - PLANCKIAN_DATA_SETUP_OMP_TARGET; - + PLANCKIAN_DATA_SETUP_OMP_TARGET; + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -87,10 +87,10 @@ void 
PLANCKIAN::runOpenMPTargetVariant(VariantID vid) } stopTimer(); - PLANCKIAN_DATA_TEARDOWN_OMP_TARGET; + PLANCKIAN_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n PLANCKIAN : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n PLANCKIAN : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/lcals/PLANCKIAN-Seq.cpp b/src/lcals/PLANCKIAN-Seq.cpp index fa5cb565e..88bcc04e0 100644 --- a/src/lcals/PLANCKIAN-Seq.cpp +++ b/src/lcals/PLANCKIAN-Seq.cpp @@ -13,7 +13,7 @@ #include #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -80,7 +80,7 @@ void PLANCKIAN::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n PLANCKIAN : Unknown variant id = " << vid << std::endl; + getCout() << "\n PLANCKIAN : Unknown variant id = " << vid << std::endl; } } diff --git a/src/lcals/TRIDIAG_ELIM-Cuda.cpp b/src/lcals/TRIDIAG_ELIM-Cuda.cpp index b06884f0e..276ec54a0 100644 --- a/src/lcals/TRIDIAG_ELIM-Cuda.cpp +++ b/src/lcals/TRIDIAG_ELIM-Cuda.cpp @@ -93,7 +93,7 @@ void TRIDIAG_ELIM::runCudaVariant(VariantID vid) TRIDIAG_ELIM_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n TRIDIAG_ELIM : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n TRIDIAG_ELIM : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/lcals/TRIDIAG_ELIM-Hip.cpp b/src/lcals/TRIDIAG_ELIM-Hip.cpp index 66ae4bad5..0a16664d0 100644 --- a/src/lcals/TRIDIAG_ELIM-Hip.cpp +++ b/src/lcals/TRIDIAG_ELIM-Hip.cpp @@ -93,7 +93,7 @@ void TRIDIAG_ELIM::runHipVariant(VariantID vid) TRIDIAG_ELIM_DATA_TEARDOWN_HIP; } else { - std::cout << "\n TRIDIAG_ELIM : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n TRIDIAG_ELIM : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/lcals/TRIDIAG_ELIM-OMP.cpp b/src/lcals/TRIDIAG_ELIM-OMP.cpp index b1773ee33..0bd108fb1 100644 --- a/src/lcals/TRIDIAG_ELIM-OMP.cpp +++ b/src/lcals/TRIDIAG_ELIM-OMP.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -81,12 +81,12 @@ void TRIDIAG_ELIM::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n TRIDIAG_ELIM : Unknown variant id = " << vid << std::endl; + getCout() << "\n TRIDIAG_ELIM : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp b/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp index 523b47dd9..39cb585d5 100644 --- a/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp +++ b/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -59,7 +59,7 @@ void TRIDIAG_ELIM::runOpenMPTargetVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp target is_device_ptr(xout, xin, y, z) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type i = ibegin; i < iend; ++i ) { TRIDIAG_ELIM_BODY; } @@ -79,15 +79,15 @@ void TRIDIAG_ELIM::runOpenMPTargetVariant(VariantID vid) RAJA::forall>( RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { TRIDIAG_ELIM_BODY; - }); + }); } stopTimer(); TRIDIAG_ELIM_DATA_TEARDOWN_OMP_TARGET - } else { - std::cout << "\n TRIDIAG_ELIM : Unknown OMP Tagretvariant id = " << vid << std::endl; + } else { + getCout() << "\n TRIDIAG_ELIM : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git 
a/src/lcals/TRIDIAG_ELIM-Seq.cpp b/src/lcals/TRIDIAG_ELIM-Seq.cpp index 60303d353..b3bf160ab 100644 --- a/src/lcals/TRIDIAG_ELIM-Seq.cpp +++ b/src/lcals/TRIDIAG_ELIM-Seq.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace lcals { @@ -79,7 +79,7 @@ void TRIDIAG_ELIM::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n TRIDIAG_ELIM : Unknown variant id = " << vid << std::endl; + getCout() << "\n TRIDIAG_ELIM : Unknown variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_2MM-Cuda.cpp b/src/polybench/POLYBENCH_2MM-Cuda.cpp index f6165d74c..8e546379e 100644 --- a/src/polybench/POLYBENCH_2MM-Cuda.cpp +++ b/src/polybench/POLYBENCH_2MM-Cuda.cpp @@ -82,7 +82,7 @@ __global__ void poly_2mm_1_lam(Index_type ni, Index_type nj, Index_type j = blockIdx.x * blockDim.x + threadIdx.x; if ( i < ni && j < nj ) { - body(i, j); + body(i, j); } } @@ -257,7 +257,7 @@ void POLYBENCH_2MM::runCudaVariant(VariantID vid) POLYBENCH_2MM_TEARDOWN_CUDA; } else { - std::cout << "\n POLYBENCH_2MM : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_2MM : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_2MM-Hip.cpp b/src/polybench/POLYBENCH_2MM-Hip.cpp index c3c9869b4..455ab55b2 100644 --- a/src/polybench/POLYBENCH_2MM-Hip.cpp +++ b/src/polybench/POLYBENCH_2MM-Hip.cpp @@ -130,14 +130,14 @@ void POLYBENCH_2MM::runHipVariant(VariantID vid) POLY_2MM_THREADS_PER_BLOCK_HIP; POLY_2MM_1_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_2mm_1), + hipLaunchKernelGGL((poly_2mm_1), dim3(nblocks1), dim3(nthreads_per_block), 0, 0, tmp, A, B, alpha, ni, nj, nk); hipErrchk( hipGetLastError() ); POLY_2MM_2_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_2mm_2), + hipLaunchKernelGGL((poly_2mm_2), dim3(nblocks2), dim3(nthreads_per_block), 0, 0, tmp, C, D, beta, ni, nl, nj); @@ -165,12 +165,12 @@ void POLYBENCH_2MM::runHipVariant(VariantID vid) POLYBENCH_2MM_BODY3; }; - POLY_2MM_1_NBLOCKS_HIP; + POLY_2MM_1_NBLOCKS_HIP; hipLaunchKernelGGL((poly_2mm_1_lam), dim3(nblocks1), dim3(nthreads_per_block), 0, 0, ni, nj, poly_2mm_1_lambda); hipErrchk( hipGetLastError() ); - + auto poly_2mm_2_lambda = [=] __device__ (Index_type i, Index_type l) { POLYBENCH_2MM_BODY4; for (Index_type j=0; j < nj; ++j) { @@ -264,7 +264,7 @@ void POLYBENCH_2MM::runHipVariant(VariantID vid) POLYBENCH_2MM_TEARDOWN_HIP; } else { - std::cout << "\n POLYBENCH_2MM : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_2MM : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_2MM-OMP.cpp b/src/polybench/POLYBENCH_2MM-OMP.cpp index 2fed3550a..a7778840a 100644 --- a/src/polybench/POLYBENCH_2MM-OMP.cpp +++ b/src/polybench/POLYBENCH_2MM-OMP.cpp @@ -20,7 +20,7 @@ //#undef USE_RAJA_OMP_COLLAPSE -namespace rajaperf +namespace rajaperf { namespace polybench { @@ -45,7 +45,7 @@ void POLYBENCH_2MM::runOpenMPVariant(VariantID vid) #pragma omp parallel for collapse(2) #else #pragma omp parallel for -#endif +#endif for (Index_type i = 0; i < ni; i++ ) { for(Index_type j = 0; j < nj; j++) { POLYBENCH_2MM_BODY1; @@ -60,7 +60,7 @@ void POLYBENCH_2MM::runOpenMPVariant(VariantID vid) #pragma omp parallel for collapse(2) #else #pragma omp parallel for -#endif +#endif for(Index_type i = 0; i < ni; i++) { for(Index_type l = 0; l < nl; l++) { POLYBENCH_2MM_BODY4; @@ -142,7 +142,7 @@ void POLYBENCH_2MM::runOpenMPVariant(VariantID vid) auto poly_2mm_lam1 = [=](Real_type &dot) { POLYBENCH_2MM_BODY1_RAJA; }; - 
auto poly_2mm_lam2 = [=](Index_type i, Index_type j, Index_type k, + auto poly_2mm_lam2 = [=](Index_type i, Index_type j, Index_type k, Real_type &dot) { POLYBENCH_2MM_BODY2_RAJA; }; @@ -153,7 +153,7 @@ void POLYBENCH_2MM::runOpenMPVariant(VariantID vid) auto poly_2mm_lam4 = [=](Real_type &dot) { POLYBENCH_2MM_BODY4_RAJA; }; - auto poly_2mm_lam5 = [=](Index_type i, Index_type l, Index_type j, + auto poly_2mm_lam5 = [=](Index_type i, Index_type l, Index_type j, Real_type &dot) { POLYBENCH_2MM_BODY5_RAJA; }; @@ -192,7 +192,7 @@ void POLYBENCH_2MM::runOpenMPVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_param( + RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment{0, ni}, RAJA::RangeSegment{0, nj}, RAJA::RangeSegment{0, nk}), @@ -203,7 +203,7 @@ void POLYBENCH_2MM::runOpenMPVariant(VariantID vid) poly_2mm_lam3 ); - RAJA::kernel_param( + RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment{0, ni}, RAJA::RangeSegment{0, nl}, RAJA::RangeSegment{0, nj}), @@ -221,12 +221,12 @@ void POLYBENCH_2MM::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n POLYBENCH_2MM : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_2MM : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/polybench/POLYBENCH_2MM-OMPTarget.cpp b/src/polybench/POLYBENCH_2MM-OMPTarget.cpp index ab68c8e2a..ce689e767 100644 --- a/src/polybench/POLYBENCH_2MM-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_2MM-OMPTarget.cpp @@ -29,7 +29,7 @@ namespace polybench allocAndInitOpenMPDeviceData(A, m_A, m_ni * m_nk, did, hid); \ allocAndInitOpenMPDeviceData(B, m_B, m_nk * m_nj, did, hid); \ allocAndInitOpenMPDeviceData(C, m_C, m_nj * m_nl, did, hid); \ - allocAndInitOpenMPDeviceData(D, m_D, m_ni * m_nl, did, hid); + allocAndInitOpenMPDeviceData(D, m_D, m_ni * m_nl, did, hid); #define POLYBENCH_2MM_DATA_TEARDOWN_OMP_TARGET \ @@ -53,9 +53,9 @@ void POLYBENCH_2MM::runOpenMPTargetVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - + #pragma omp target is_device_ptr(tmp,A,B) device( did ) - #pragma omp teams distribute parallel for schedule(static, 1) collapse(2) + #pragma omp teams distribute parallel for schedule(static, 1) collapse(2) for (Index_type i = 0; i < ni; i++ ) { for(Index_type j = 0; j < nj; j++) { POLYBENCH_2MM_BODY1; @@ -75,11 +75,11 @@ void POLYBENCH_2MM::runOpenMPTargetVariant(VariantID vid) POLYBENCH_2MM_BODY5; } POLYBENCH_2MM_BODY6; - } + } } } - stopTimer(); + stopTimer(); POLYBENCH_2MM_DATA_TEARDOWN_OMP_TARGET; @@ -121,7 +121,7 @@ void POLYBENCH_2MM::runOpenMPTargetVariant(VariantID vid) } ); - RAJA::kernel_param( + RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment{0, ni}, RAJA::RangeSegment{0, nl}, RAJA::RangeSegment{0, nj}), @@ -144,7 +144,7 @@ void POLYBENCH_2MM::runOpenMPTargetVariant(VariantID vid) POLYBENCH_2MM_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n POLYBENCH_2MM : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_2MM : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_2MM-Seq.cpp b/src/polybench/POLYBENCH_2MM-Seq.cpp index 36e70e2bc..4eae8f13c 100644 --- a/src/polybench/POLYBENCH_2MM-Seq.cpp +++ b/src/polybench/POLYBENCH_2MM-Seq.cpp @@ -13,7 +13,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { @@ -31,7 +31,7 @@ void POLYBENCH_2MM::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 
0; irep < run_reps; ++irep) { - for (Index_type i = 0; i < ni; i++ ) { + for (Index_type i = 0; i < ni; i++ ) { for (Index_type j = 0; j < nj; j++) { POLYBENCH_2MM_BODY1; for (Index_type k = 0; k < nk; k++) { @@ -114,7 +114,7 @@ void POLYBENCH_2MM::runSeqVariant(VariantID vid) auto poly_2mm_lam1 = [=](Real_type &dot) { POLYBENCH_2MM_BODY1_RAJA; }; - auto poly_2mm_lam2 = [=](Index_type i, Index_type j, Index_type k, + auto poly_2mm_lam2 = [=](Index_type i, Index_type j, Index_type k, Real_type &dot) { POLYBENCH_2MM_BODY2_RAJA; }; @@ -125,7 +125,7 @@ void POLYBENCH_2MM::runSeqVariant(VariantID vid) auto poly_2mm_lam4 = [=](Real_type &dot) { POLYBENCH_2MM_BODY4_RAJA; }; - auto poly_2mm_lam5 = [=](Index_type i, Index_type l, Index_type j, + auto poly_2mm_lam5 = [=](Index_type i, Index_type l, Index_type j, Real_type &dot) { POLYBENCH_2MM_BODY5_RAJA; }; @@ -150,25 +150,25 @@ void POLYBENCH_2MM::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_param( + RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment{0, ni}, RAJA::RangeSegment{0, nj}, RAJA::RangeSegment{0, nk}), RAJA::tuple{0.0}, - poly_2mm_lam1, - poly_2mm_lam2, + poly_2mm_lam1, + poly_2mm_lam2, poly_2mm_lam3 ); - RAJA::kernel_param( + RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment{0, ni}, RAJA::RangeSegment{0, nl}, RAJA::RangeSegment{0, nj}), RAJA::tuple{0.0}, - poly_2mm_lam4, - poly_2mm_lam5, + poly_2mm_lam4, + poly_2mm_lam5, poly_2mm_lam6 ); @@ -180,7 +180,7 @@ void POLYBENCH_2MM::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n POLYBENCH_2MM : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_2MM : Unknown variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_3MM-Cuda.cpp b/src/polybench/POLYBENCH_3MM-Cuda.cpp index 956efb427..e861ed010 100644 --- a/src/polybench/POLYBENCH_3MM-Cuda.cpp +++ b/src/polybench/POLYBENCH_3MM-Cuda.cpp @@ -330,7 +330,7 @@ void POLYBENCH_3MM::runCudaVariant(VariantID vid) POLYBENCH_3MM_TEARDOWN_CUDA; } else { - std::cout << "\n POLYBENCH_3MM : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_3MM : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_3MM-Hip.cpp b/src/polybench/POLYBENCH_3MM-Hip.cpp index 51e8ac53f..717a025da 100644 --- a/src/polybench/POLYBENCH_3MM-Hip.cpp +++ b/src/polybench/POLYBENCH_3MM-Hip.cpp @@ -163,22 +163,22 @@ void POLYBENCH_3MM::runHipVariant(VariantID vid) POLY_3MM_THREADS_PER_BLOCK_HIP; - POLY_3MM_1_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_3mm_1), + POLY_3MM_1_NBLOCKS_HIP; + hipLaunchKernelGGL((poly_3mm_1), dim3(nblocks1) , dim3(nthreads_per_block), 0, 0, E, A, B, ni, nj, nk); hipErrchk( hipGetLastError() ); POLY_3MM_2_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_3mm_2), + hipLaunchKernelGGL((poly_3mm_2), dim3(nblocks2), dim3(nthreads_per_block), 0, 0, F, C, D, nj, nl, nm); hipErrchk( hipGetLastError() ); POLY_3MM_3_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_3mm_3), + hipLaunchKernelGGL((poly_3mm_3), dim3(nblocks3), dim3(nthreads_per_block), 0, 0, G, E, F, ni, nl, nj); @@ -270,7 +270,7 @@ void POLYBENCH_3MM::runHipVariant(VariantID vid) > > > - >; + >; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -341,7 +341,7 @@ void POLYBENCH_3MM::runHipVariant(VariantID vid) POLYBENCH_3MM_TEARDOWN_HIP; } else { - std::cout << "\n POLYBENCH_3MM : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_3MM : Unknown Hip variant id = " << vid 
<< std::endl; } } diff --git a/src/polybench/POLYBENCH_3MM-OMP.cpp b/src/polybench/POLYBENCH_3MM-OMP.cpp index 5c8a595b7..7fe78a498 100644 --- a/src/polybench/POLYBENCH_3MM-OMP.cpp +++ b/src/polybench/POLYBENCH_3MM-OMP.cpp @@ -21,12 +21,12 @@ //#undef USE_RAJA_OMP_COLLAPSE -namespace rajaperf +namespace rajaperf { namespace polybench { - + void POLYBENCH_3MM::runOpenMPVariant(VariantID vid) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -45,7 +45,7 @@ void POLYBENCH_3MM::runOpenMPVariant(VariantID vid) #if defined(USE_OMP_COLLAPSE) #pragma omp parallel for collapse(2) #else - #pragma omp parallel for + #pragma omp parallel for #endif for (Index_type i = 0; i < ni; i++ ) { for (Index_type j = 0; j < nj; j++) { @@ -60,7 +60,7 @@ void POLYBENCH_3MM::runOpenMPVariant(VariantID vid) #if defined(USE_OMP_COLLAPSE) #pragma omp parallel for collapse(2) #else - #pragma omp parallel for + #pragma omp parallel for #endif for (Index_type j = 0; j < nj; j++) { for (Index_type l = 0; l < nl; l++) { @@ -75,7 +75,7 @@ void POLYBENCH_3MM::runOpenMPVariant(VariantID vid) #if defined(USE_OMP_COLLAPSE) #pragma omp parallel for collapse(2) #else - #pragma omp parallel for + #pragma omp parallel for #endif for (Index_type i = 0; i < ni; i++) { for (Index_type l = 0; l < nl; l++) { @@ -181,7 +181,7 @@ void POLYBENCH_3MM::runOpenMPVariant(VariantID vid) auto poly_3mm_lam1 = [=] (Real_type &dot) { POLYBENCH_3MM_BODY1_RAJA; }; - auto poly_3mm_lam2 = [=] (Index_type i, Index_type j, Index_type k, + auto poly_3mm_lam2 = [=] (Index_type i, Index_type j, Index_type k, Real_type &dot) { POLYBENCH_3MM_BODY2_RAJA; }; @@ -192,7 +192,7 @@ void POLYBENCH_3MM::runOpenMPVariant(VariantID vid) auto poly_3mm_lam4 = [=] (Real_type &dot) { POLYBENCH_3MM_BODY4_RAJA; }; - auto poly_3mm_lam5 = [=] (Index_type j, Index_type l, Index_type m, + auto poly_3mm_lam5 = [=] (Index_type j, Index_type l, Index_type m, Real_type &dot) { POLYBENCH_3MM_BODY5_RAJA; }; @@ -203,7 +203,7 @@ void POLYBENCH_3MM::runOpenMPVariant(VariantID vid) auto poly_3mm_lam7 = [=] (Real_type &dot) { POLYBENCH_3MM_BODY7_RAJA; }; - auto poly_3mm_lam8 = [=] (Index_type i, Index_type l, Index_type j, + auto poly_3mm_lam8 = [=] (Index_type i, Index_type l, Index_type j, Real_type &dot) { POLYBENCH_3MM_BODY8_RAJA; }; @@ -285,12 +285,12 @@ void POLYBENCH_3MM::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n POLYBENCH_2MM : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_3MM : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/polybench/POLYBENCH_3MM-OMPTarget.cpp b/src/polybench/POLYBENCH_3MM-OMPTarget.cpp index db965fc18..f7380fabd 100644 --- a/src/polybench/POLYBENCH_3MM-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_3MM-OMPTarget.cpp @@ -31,7 +31,7 @@ namespace polybench allocAndInitOpenMPDeviceData(D, m_D, m_nm * m_nl, did, hid); \ allocAndInitOpenMPDeviceData(E, m_E, m_ni * m_nj, did, hid); \ allocAndInitOpenMPDeviceData(F, m_F, m_nj * m_nl, did, hid); \ - allocAndInitOpenMPDeviceData(G, m_G, m_ni * m_nl, did, hid); + allocAndInitOpenMPDeviceData(G, m_G, m_ni * m_nl, did, hid); #define POLYBENCH_3MM_DATA_TEARDOWN_OMP_TARGET \ @@ -56,7 +56,7 @@ void POLYBENCH_3MM::runOpenMPTargetVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - + #pragma omp target is_device_ptr(A,B,E) device( did ) #pragma omp teams distribute parallel for schedule(static, 1) collapse(2) for (Index_type i = 0; i < ni; i++ ) { @@ -94,7 +94,7 @@ void 
POLYBENCH_3MM::runOpenMPTargetVariant(VariantID vid) } } - stopTimer(); + stopTimer(); POLYBENCH_3MM_DATA_TEARDOWN_OMP_TARGET; @@ -107,7 +107,7 @@ void POLYBENCH_3MM::runOpenMPTargetVariant(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::Collapse, + RAJA::ArgList<0, 1>, RAJA::statement::Lambda<0, RAJA::Params<0>>, RAJA::statement::For<2, RAJA::seq_exec, RAJA::statement::Lambda<1, RAJA::Segs<0,1,2>, RAJA::Params<0>> @@ -118,8 +118,8 @@ void POLYBENCH_3MM::runOpenMPTargetVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::kernel_param( + + RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment{0, ni}, RAJA::RangeSegment{0, nj}, RAJA::RangeSegment{0, nk}), @@ -128,18 +128,18 @@ void POLYBENCH_3MM::runOpenMPTargetVariant(VariantID vid) [=] (Real_type &dot) { POLYBENCH_3MM_BODY1_RAJA; }, - [=] (Index_type i, Index_type j, Index_type k, + [=] (Index_type i, Index_type j, Index_type k, Real_type &dot) { POLYBENCH_3MM_BODY2_RAJA; }, - [=] (Index_type i, Index_type j, + [=] (Index_type i, Index_type j, Real_type &dot) { POLYBENCH_3MM_BODY3_RAJA; } ); - RAJA::kernel_param( + RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment{0, nj}, RAJA::RangeSegment{0, nl}, RAJA::RangeSegment{0, nm}), @@ -148,18 +148,18 @@ void POLYBENCH_3MM::runOpenMPTargetVariant(VariantID vid) [=] (Real_type &dot) { POLYBENCH_3MM_BODY4_RAJA; }, - [=] (Index_type j, Index_type l, Index_type m, + [=] (Index_type j, Index_type l, Index_type m, Real_type &dot) { POLYBENCH_3MM_BODY5_RAJA; }, - [=] (Index_type j, Index_type l, + [=] (Index_type j, Index_type l, Real_type &dot) { POLYBENCH_3MM_BODY6_RAJA; } ); - RAJA::kernel_param( + RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment{0, ni}, RAJA::RangeSegment{0, nl}, RAJA::RangeSegment{0, nj}), @@ -168,16 +168,16 @@ void POLYBENCH_3MM::runOpenMPTargetVariant(VariantID vid) [=] (Real_type &dot) { POLYBENCH_3MM_BODY7_RAJA; }, - [=] (Index_type i, Index_type l, Index_type j, + [=] (Index_type i, Index_type l, Index_type j, Real_type &dot) { POLYBENCH_3MM_BODY8_RAJA; }, - [=] (Index_type i, Index_type l, + [=] (Index_type i, Index_type l, Real_type &dot) { POLYBENCH_3MM_BODY9_RAJA; } - ); + ); } stopTimer(); @@ -185,7 +185,7 @@ void POLYBENCH_3MM::runOpenMPTargetVariant(VariantID vid) POLYBENCH_3MM_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n POLYBENCH_3MM : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_3MM : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_3MM-Seq.cpp b/src/polybench/POLYBENCH_3MM-Seq.cpp index 6af320fd8..0659026cc 100644 --- a/src/polybench/POLYBENCH_3MM-Seq.cpp +++ b/src/polybench/POLYBENCH_3MM-Seq.cpp @@ -14,12 +14,12 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { - + void POLYBENCH_3MM::runSeqVariant(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -143,7 +143,7 @@ void POLYBENCH_3MM::runSeqVariant(VariantID vid) auto poly_3mm_lam1 = [=] (Real_type &dot) { POLYBENCH_3MM_BODY1_RAJA; }; - auto poly_3mm_lam2 = [=] (Index_type i, Index_type j, Index_type k, + auto poly_3mm_lam2 = [=] (Index_type i, Index_type j, Index_type k, Real_type &dot) { POLYBENCH_3MM_BODY2_RAJA; }; @@ -154,7 +154,7 @@ void POLYBENCH_3MM::runSeqVariant(VariantID vid) auto poly_3mm_lam4 = [=] (Real_type &dot) { POLYBENCH_3MM_BODY4_RAJA; }; - auto poly_3mm_lam5 = [=] (Index_type j, Index_type l, Index_type m, + auto poly_3mm_lam5 = [=] (Index_type j, Index_type l, Index_type m, 
Real_type &dot) { POLYBENCH_3MM_BODY5_RAJA; }; @@ -165,7 +165,7 @@ void POLYBENCH_3MM::runSeqVariant(VariantID vid) auto poly_3mm_lam7 = [=] (Real_type &dot) { POLYBENCH_3MM_BODY7_RAJA; }; - auto poly_3mm_lam8 = [=] (Index_type i, Index_type l, Index_type j, + auto poly_3mm_lam8 = [=] (Index_type i, Index_type l, Index_type j, Real_type &dot) { POLYBENCH_3MM_BODY8_RAJA; }; @@ -212,7 +212,7 @@ void POLYBENCH_3MM::runSeqVariant(VariantID vid) poly_3mm_lam5, poly_3mm_lam6 - ); + ); RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment{0, ni}, @@ -234,7 +234,7 @@ void POLYBENCH_3MM::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n POLYBENCH_2MM : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_3MM : Unknown variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_ADI-Cuda.cpp b/src/polybench/POLYBENCH_ADI-Cuda.cpp index 57500408d..2843c77f3 100644 --- a/src/polybench/POLYBENCH_ADI-Cuda.cpp +++ b/src/polybench/POLYBENCH_ADI-Cuda.cpp @@ -243,7 +243,7 @@ void POLYBENCH_ADI::runCudaVariant(VariantID vid) POLYBENCH_ADI_TEARDOWN_CUDA } else { - std::cout << "\n POLYBENCH_ADI : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_ADI : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_ADI-Hip.cpp b/src/polybench/POLYBENCH_ADI-Hip.cpp index 9c65190a4..6c9874006 100644 --- a/src/polybench/POLYBENCH_ADI-Hip.cpp +++ b/src/polybench/POLYBENCH_ADI-Hip.cpp @@ -105,14 +105,14 @@ void POLYBENCH_ADI::runHipVariant(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(n-2, block_size); - hipLaunchKernelGGL((adi1), + hipLaunchKernelGGL((adi1), dim3(grid_size), dim3(block_size), 0, 0, n, a, b, c, d, f, P, Q, U, V); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((adi2), + hipLaunchKernelGGL((adi2), dim3(grid_size), dim3(block_size), 0, 0, n, a, c, d, e, f, @@ -252,7 +252,7 @@ void POLYBENCH_ADI::runHipVariant(VariantID vid) POLYBENCH_ADI_TEARDOWN_HIP } else { - std::cout << "\n POLYBENCH_ADI : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_ADI : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_ADI-OMP.cpp b/src/polybench/POLYBENCH_ADI-OMP.cpp index 465237c43..42edab15b 100644 --- a/src/polybench/POLYBENCH_ADI-OMP.cpp +++ b/src/polybench/POLYBENCH_ADI-OMP.cpp @@ -13,7 +13,7 @@ #include #include -namespace rajaperf +namespace rajaperf { namespace polybench { @@ -34,18 +34,18 @@ void POLYBENCH_ADI::runOpenMPVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - for (Index_type t = 1; t <= tsteps; ++t) { + for (Index_type t = 1; t <= tsteps; ++t) { #pragma omp parallel for for (Index_type i = 1; i < n-1; ++i) { POLYBENCH_ADI_BODY2; for (Index_type j = 1; j < n-1; ++j) { POLYBENCH_ADI_BODY3; - } + } POLYBENCH_ADI_BODY4; for (Index_type k = n-2; k >= 1; --k) { POLYBENCH_ADI_BODY5; - } + } } #pragma omp parallel for @@ -57,7 +57,7 @@ void POLYBENCH_ADI::runOpenMPVariant(VariantID vid) POLYBENCH_ADI_BODY8; for (Index_type k = n-2; k >= 1; --k) { POLYBENCH_ADI_BODY9; - } + } } } // tstep loop @@ -213,12 +213,12 @@ void POLYBENCH_ADI::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\nPOLYBENCH_ADI Unknown variant id = " << vid << std::endl; + getCout() << "\nPOLYBENCH_ADI Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/polybench/POLYBENCH_ADI-OMPTarget.cpp 
b/src/polybench/POLYBENCH_ADI-OMPTarget.cpp index df935ee78..0d04bb597 100644 --- a/src/polybench/POLYBENCH_ADI-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_ADI-OMPTarget.cpp @@ -33,14 +33,14 @@ namespace polybench allocAndInitOpenMPDeviceData(U, m_U, m_n * m_n, did, hid); \ allocAndInitOpenMPDeviceData(V, m_V, m_n * m_n, did, hid); \ allocAndInitOpenMPDeviceData(P, m_P, m_n * m_n, did, hid); \ - allocAndInitOpenMPDeviceData(Q, m_Q, m_n * m_n, did, hid); + allocAndInitOpenMPDeviceData(Q, m_Q, m_n * m_n, did, hid); #define POLYBENCH_ADI_DATA_TEARDOWN_OMP_TARGET \ getOpenMPDeviceData(m_U, U, m_n * m_n, hid, did); \ deallocOpenMPDeviceData(U, did); \ deallocOpenMPDeviceData(V, did); \ deallocOpenMPDeviceData(P, did); \ - deallocOpenMPDeviceData(Q, did); + deallocOpenMPDeviceData(Q, did); void POLYBENCH_ADI::runOpenMPTargetVariant(VariantID vid) @@ -56,7 +56,7 @@ void POLYBENCH_ADI::runOpenMPTargetVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - for (Index_type t = 1; t <= tsteps; ++t) { + for (Index_type t = 1; t <= tsteps; ++t) { #pragma omp target is_device_ptr(P,Q,U,V) device( did ) #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) @@ -64,11 +64,11 @@ void POLYBENCH_ADI::runOpenMPTargetVariant(VariantID vid) POLYBENCH_ADI_BODY2; for (Index_type j = 1; j < n-1; ++j) { POLYBENCH_ADI_BODY3; - } + } POLYBENCH_ADI_BODY4; for (Index_type k = n-2; k >= 1; --k) { POLYBENCH_ADI_BODY5; - } + } } #pragma omp target is_device_ptr(P,Q,U,V) device( did ) @@ -86,10 +86,10 @@ void POLYBENCH_ADI::runOpenMPTargetVariant(VariantID vid) } // tsteps - } // run_reps - stopTimer(); + } // run_reps + stopTimer(); - POLYBENCH_ADI_DATA_TEARDOWN_OMP_TARGET; + POLYBENCH_ADI_DATA_TEARDOWN_OMP_TARGET; } else if ( vid == RAJA_OpenMPTarget ) { @@ -162,9 +162,9 @@ void POLYBENCH_ADI::runOpenMPTargetVariant(VariantID vid) POLYBENCH_ADI_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n POLYBENCH_ADI : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_ADI : Unknown OMP Target variant id = " << vid << std::endl; } -} +} } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_ADI-Seq.cpp b/src/polybench/POLYBENCH_ADI-Seq.cpp index 183d56c4c..b4f8e82c3 100644 --- a/src/polybench/POLYBENCH_ADI-Seq.cpp +++ b/src/polybench/POLYBENCH_ADI-Seq.cpp @@ -13,7 +13,7 @@ #include #include -namespace rajaperf +namespace rajaperf { namespace polybench { @@ -31,17 +31,17 @@ void POLYBENCH_ADI::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - for (Index_type t = 1; t <= tsteps; ++t) { + for (Index_type t = 1; t <= tsteps; ++t) { for (Index_type i = 1; i < n-1; ++i) { POLYBENCH_ADI_BODY2; for (Index_type j = 1; j < n-1; ++j) { POLYBENCH_ADI_BODY3; - } + } POLYBENCH_ADI_BODY4; for (Index_type k = n-2; k >= 1; --k) { POLYBENCH_ADI_BODY5; - } + } } for (Index_type i = 1; i < n-1; ++i) { @@ -52,7 +52,7 @@ void POLYBENCH_ADI::runSeqVariant(VariantID vid) POLYBENCH_ADI_BODY8; for (Index_type k = n-2; k >= 1; --k) { POLYBENCH_ADI_BODY9; - } + } } } // tstep loop @@ -172,9 +172,9 @@ void POLYBENCH_ADI::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - for (Index_type t = 1; t <= tsteps; ++t) { + for (Index_type t = 1; t <= tsteps; ++t) { - RAJA::kernel( + RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment{1, n-1}, RAJA::RangeSegment{1, n-1}, RAJA::RangeStrideSegment{n-2, 0, -1}), @@ -208,7 
+208,7 @@ void POLYBENCH_ADI::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\nPOLYBENCH_ADI Unknown variant id = " << vid << std::endl; + getCout() << "\nPOLYBENCH_ADI Unknown variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_ATAX-Cuda.cpp b/src/polybench/POLYBENCH_ATAX-Cuda.cpp index 58d37fb80..69fcf77a3 100644 --- a/src/polybench/POLYBENCH_ATAX-Cuda.cpp +++ b/src/polybench/POLYBENCH_ATAX-Cuda.cpp @@ -225,7 +225,7 @@ void POLYBENCH_ATAX::runCudaVariant(VariantID vid) POLYBENCH_ATAX_TEARDOWN_CUDA; } else { - std::cout << "\n POLYBENCH_ATAX : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_ATAX : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_ATAX-Hip.cpp b/src/polybench/POLYBENCH_ATAX-Hip.cpp index 6d393a83b..f9d532e48 100644 --- a/src/polybench/POLYBENCH_ATAX-Hip.cpp +++ b/src/polybench/POLYBENCH_ATAX-Hip.cpp @@ -96,12 +96,12 @@ void POLYBENCH_ATAX::runHipVariant(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); - hipLaunchKernelGGL((poly_atax_1), + hipLaunchKernelGGL((poly_atax_1), dim3(grid_size), dim3(block_size), 0, 0, A, x, y, tmp, N); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_atax_2), + hipLaunchKernelGGL((poly_atax_2), dim3(grid_size), dim3(block_size), 0, 0, A, tmp, y, N); hipErrchk( hipGetLastError() ); @@ -232,7 +232,7 @@ void POLYBENCH_ATAX::runHipVariant(VariantID vid) POLYBENCH_ATAX_TEARDOWN_HIP; } else { - std::cout << "\n POLYBENCH_ATAX : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_ATAX : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_ATAX-OMP.cpp b/src/polybench/POLYBENCH_ATAX-OMP.cpp index 8b7bf1113..cb78dd1cc 100644 --- a/src/polybench/POLYBENCH_ATAX-OMP.cpp +++ b/src/polybench/POLYBENCH_ATAX-OMP.cpp @@ -60,19 +60,19 @@ void POLYBENCH_ATAX::runOpenMPVariant(VariantID vid) case Lambda_OpenMP : { - auto poly_atax_base_lam2 = [=] (Index_type i, Index_type j, + auto poly_atax_base_lam2 = [=] (Index_type i, Index_type j, Real_type &dot) { POLYBENCH_ATAX_BODY2; }; - auto poly_atax_base_lam3 = [=] (Index_type i, + auto poly_atax_base_lam3 = [=] (Index_type i, Real_type &dot) { POLYBENCH_ATAX_BODY3; }; - auto poly_atax_base_lam5 = [=] (Index_type i, Index_type j , + auto poly_atax_base_lam5 = [=] (Index_type i, Index_type j , Real_type &dot) { POLYBENCH_ATAX_BODY5; }; - auto poly_atax_base_lam6 = [=] (Index_type j, + auto poly_atax_base_lam6 = [=] (Index_type j, Real_type &dot) { POLYBENCH_ATAX_BODY6; }; @@ -148,10 +148,10 @@ void POLYBENCH_ATAX::runOpenMPVariant(VariantID vid) > >; - + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - + RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment{0, N}, RAJA::RangeSegment{0, N}), @@ -172,21 +172,21 @@ void POLYBENCH_ATAX::runOpenMPVariant(VariantID vid) poly_atax_lam5, poly_atax_lam6 - ); + ); } stopTimer(); - + break; } default : { - std::cout << "\n POLYBENCH_ATAX : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_ATAX : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp b/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp index 69b50d8cb..7f9b96a75 100644 --- a/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { @@ 
-66,7 +66,7 @@ void POLYBENCH_ATAX::runOpenMPTargetVariant(VariantID vid) } POLYBENCH_ATAX_BODY3; } - + #pragma omp target is_device_ptr(y,tmp,A) device( did ) #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type j = 0; j < N; ++j ) { @@ -153,7 +153,7 @@ void POLYBENCH_ATAX::runOpenMPTargetVariant(VariantID vid) POLYBENCH_ATAX_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n POLYBENCH_ATAX : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_ATAX : Unknown OMP Target variant id = " << vid << std::endl; } } @@ -162,4 +162,4 @@ void POLYBENCH_ATAX::runOpenMPTargetVariant(VariantID vid) } // end namespace rajaperf #endif // RAJA_ENABLE_TARGET_OPENMP - + diff --git a/src/polybench/POLYBENCH_ATAX-Seq.cpp b/src/polybench/POLYBENCH_ATAX-Seq.cpp index f4bb51937..5f6d018b6 100644 --- a/src/polybench/POLYBENCH_ATAX-Seq.cpp +++ b/src/polybench/POLYBENCH_ATAX-Seq.cpp @@ -57,19 +57,19 @@ void POLYBENCH_ATAX::runSeqVariant(VariantID vid) #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { - auto poly_atax_base_lam2 = [=] (Index_type i, Index_type j, + auto poly_atax_base_lam2 = [=] (Index_type i, Index_type j, Real_type &dot) { POLYBENCH_ATAX_BODY2; }; - auto poly_atax_base_lam3 = [=] (Index_type i, + auto poly_atax_base_lam3 = [=] (Index_type i, Real_type &dot) { POLYBENCH_ATAX_BODY3; }; - auto poly_atax_base_lam5 = [=] (Index_type i, Index_type j , + auto poly_atax_base_lam5 = [=] (Index_type i, Index_type j , Real_type &dot) { POLYBENCH_ATAX_BODY5; }; - auto poly_atax_base_lam6 = [=] (Index_type j, + auto poly_atax_base_lam6 = [=] (Index_type j, Real_type &dot) { POLYBENCH_ATAX_BODY6; }; @@ -148,8 +148,8 @@ void POLYBENCH_ATAX::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_param( - RAJA::make_tuple(RAJA::RangeSegment{0, N}, + RAJA::kernel_param( + RAJA::make_tuple(RAJA::RangeSegment{0, N}, RAJA::RangeSegment{0, N}), RAJA::tuple{0.0}, @@ -158,8 +158,8 @@ void POLYBENCH_ATAX::runSeqVariant(VariantID vid) poly_atax_lam3 ); - - RAJA::kernel_param( + + RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment{0, N}, RAJA::RangeSegment{0, N}), RAJA::tuple{0.0}, @@ -178,7 +178,7 @@ void POLYBENCH_ATAX::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n POLYBENCH_ATAX : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_ATAX : Unknown variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp b/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp index a6c67b852..f26754de3 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp @@ -72,7 +72,7 @@ __global__ void poly_fdtd2d_1_lam(Index_type ny, Lambda body) } } -__global__ void poly_fdtd2d_2(Real_ptr ey, Real_ptr hz, +__global__ void poly_fdtd2d_2(Real_ptr ey, Real_ptr hz, Index_type nx, Index_type ny) { Index_type i = blockIdx.y * blockDim.y + threadIdx.y; @@ -91,7 +91,7 @@ __global__ void poly_fdtd2d_2_lam(Index_type nx, Index_type ny, Index_type j = blockIdx.x * blockDim.x + threadIdx.x; if (i > 0 && i < nx && j < ny) { - body(i, j); + body(i, j); } } @@ -296,7 +296,7 @@ void POLYBENCH_FDTD_2D::runCudaVariant(VariantID vid) POLYBENCH_FDTD_2D_TEARDOWN_CUDA; } else { - std::cout << "\n POLYBENCH_FDTD_2D : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_FDTD_2D : Unknown Cuda variant id = " << vid << std::endl; } } diff --git 
a/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp b/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp index b627d84f8..74488952e 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp @@ -157,25 +157,25 @@ void POLYBENCH_FDTD_2D::runHipVariant(VariantID vid) for (t = 0; t < tsteps; ++t) { const size_t grid_size1 = RAJA_DIVIDE_CEILING_INT(ny, block_size); - hipLaunchKernelGGL((poly_fdtd2d_1), - dim3(grid_size1), dim3(block_size), 0, 0, + hipLaunchKernelGGL((poly_fdtd2d_1), + dim3(grid_size1), dim3(block_size), 0, 0, ey, fict, ny, t); hipErrchk( hipGetLastError() ); FDTD_2D_THREADS_PER_BLOCK_HIP; FDTD_2D_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_fdtd2d_2), + hipLaunchKernelGGL((poly_fdtd2d_2), dim3(nblocks234), dim3(nthreads_per_block234), 0, 0, ey, hz, nx, ny); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_fdtd2d_3), + hipLaunchKernelGGL((poly_fdtd2d_3), dim3(nblocks234), dim3(nthreads_per_block234), 0, 0, ex, hz, nx, ny); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_fdtd2d_4), + hipLaunchKernelGGL((poly_fdtd2d_4), dim3(nblocks234), dim3(nthreads_per_block234), 0, 0, hz, ex, ey, nx, ny); hipErrchk( hipGetLastError() ); @@ -210,7 +210,7 @@ void POLYBENCH_FDTD_2D::runHipVariant(VariantID vid) FDTD_2D_THREADS_PER_BLOCK_HIP; FDTD_2D_NBLOCKS_HIP; - auto poly_fdtd2d_2_lambda = + auto poly_fdtd2d_2_lambda = [=] __device__ (Index_type i, Index_type j) { POLYBENCH_FDTD_2D_BODY2; }; @@ -220,7 +220,7 @@ void POLYBENCH_FDTD_2D::runHipVariant(VariantID vid) nx, ny, poly_fdtd2d_2_lambda); hipErrchk( hipGetLastError() ); - auto poly_fdtd2d_3_lambda = + auto poly_fdtd2d_3_lambda = [=] __device__ (Index_type i, Index_type j) { POLYBENCH_FDTD_2D_BODY3; }; @@ -229,8 +229,8 @@ void POLYBENCH_FDTD_2D::runHipVariant(VariantID vid) dim3(nblocks234), dim3(nthreads_per_block234), 0, 0, nx, ny, poly_fdtd2d_3_lambda); hipErrchk( hipGetLastError() ); - - auto poly_fdtd2d_4_lambda = + + auto poly_fdtd2d_4_lambda = [=] __device__ (Index_type i, Index_type j) { POLYBENCH_FDTD_2D_BODY4; }; @@ -314,7 +314,7 @@ void POLYBENCH_FDTD_2D::runHipVariant(VariantID vid) POLYBENCH_FDTD_2D_TEARDOWN_HIP; } else { - std::cout << "\n POLYBENCH_FDTD_2D : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_FDTD_2D : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_FDTD_2D-OMP.cpp b/src/polybench/POLYBENCH_FDTD_2D-OMP.cpp index 157b4b12a..ebd027d6f 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-OMP.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-OMP.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { @@ -146,7 +146,7 @@ void POLYBENCH_FDTD_2D::runOpenMPVariant(VariantID vid) using EXEC_POL1 = RAJA::omp_parallel_for_exec; - using EXEC_POL234 = + using EXEC_POL234 = RAJA::KernelPolicy< RAJA::statement::For<0, RAJA::omp_parallel_for_exec, RAJA::statement::For<1, RAJA::loop_exec, @@ -191,12 +191,12 @@ void POLYBENCH_FDTD_2D::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\nPOLYBENCH_FDTD_2D Unknown variant id = " << vid << std::endl; + getCout() << "\nPOLYBENCH_FDTD_2D Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp b/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp index 0bde775fd..d260754bf 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp @@ -4,7 +4,7 @@ // See the RAJAPerf/LICENSE file for details. 
// // SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "POLYBENCH_FDTD_2D.hpp" @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { @@ -153,7 +153,7 @@ void POLYBENCH_FDTD_2D::runOpenMPTargetVariant(VariantID vid) POLYBENCH_FDTD_2D_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n POLYBENCH_FDTD_2D : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_FDTD_2D : Unknown OMP Target variant id = " << vid << std::endl; } } @@ -162,4 +162,4 @@ void POLYBENCH_FDTD_2D::runOpenMPTargetVariant(VariantID vid) } // end namespace rajaperf #endif // RAJA_ENABLE_TARGET_OPENMP - + diff --git a/src/polybench/POLYBENCH_FDTD_2D-Seq.cpp b/src/polybench/POLYBENCH_FDTD_2D-Seq.cpp index 4e6078778..58461dd8a 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Seq.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Seq.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { @@ -31,7 +31,7 @@ void POLYBENCH_FDTD_2D::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - for (t = 0; t < tsteps; ++t) { + for (t = 0; t < tsteps; ++t) { for (Index_type j = 0; j < ny; j++) { POLYBENCH_FDTD_2D_BODY1; @@ -137,7 +137,7 @@ void POLYBENCH_FDTD_2D::runSeqVariant(VariantID vid) using EXEC_POL1 = RAJA::loop_exec; - using EXEC_POL234 = + using EXEC_POL234 = RAJA::KernelPolicy< RAJA::statement::For<0, RAJA::loop_exec, RAJA::statement::For<1, RAJA::loop_exec, @@ -149,9 +149,9 @@ void POLYBENCH_FDTD_2D::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - for (t = 0; t < tsteps; ++t) { + for (t = 0; t < tsteps; ++t) { - RAJA::forall( RAJA::RangeSegment(0, ny), + RAJA::forall( RAJA::RangeSegment(0, ny), poly_fdtd2d_lam1 ); @@ -184,7 +184,7 @@ void POLYBENCH_FDTD_2D::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\nPOLYBENCH_FDTD_2D Unknown variant id = " << vid << std::endl; + getCout() << "\nPOLYBENCH_FDTD_2D Unknown variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp index bc4d79352..6a49073b8 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp @@ -52,13 +52,13 @@ __global__ void poly_floyd_warshall(Real_ptr pout, Real_ptr pin, Index_type i = blockIdx.y * blockDim.y + threadIdx.y; Index_type j = blockIdx.x * blockDim.x + threadIdx.x; - if ( i < N && j < N ) { + if ( i < N && j < N ) { POLYBENCH_FLOYD_WARSHALL_BODY; } } template< typename Lambda > -__global__ void poly_floyd_warshall_lam(Index_type N, +__global__ void poly_floyd_warshall_lam(Index_type N, Lambda body) { Index_type i = blockIdx.y * blockDim.y + threadIdx.y; @@ -87,7 +87,7 @@ void POLYBENCH_FLOYD_WARSHALL::runCudaVariant(VariantID vid) POLY_FLOYD_WARSHALL_THREADS_PER_BLOCK_CUDA; POLY_FLOYD_WARSHALL_NBLOCKS_CUDA; - + poly_floyd_warshall<<>>(pout, pin, k, N); cudaErrchk( cudaGetLastError() ); @@ -166,7 +166,7 @@ void POLYBENCH_FLOYD_WARSHALL::runCudaVariant(VariantID vid) POLYBENCH_FLOYD_WARSHALL_TEARDOWN_CUDA; } else { - std::cout << "\n POLYBENCH_FLOYD_WARSHALL : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_FLOYD_WARSHALL : Unknown Cuda variant id = " << vid << std::endl; } } diff --git 
a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp index af451b139..853ab1023 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp @@ -109,13 +109,13 @@ void POLYBENCH_FLOYD_WARSHALL::runHipVariant(VariantID vid) for (Index_type k = 0; k < N; ++k) { - auto poly_floyd_warshall_lambda = + auto poly_floyd_warshall_lambda = [=] __device__ (Index_type i, Index_type j) { POLYBENCH_FLOYD_WARSHALL_BODY; }; POLY_FLOYD_WARSHALL_THREADS_PER_BLOCK_HIP; - POLY_FLOYD_WARSHALL_NBLOCKS_HIP; + POLY_FLOYD_WARSHALL_NBLOCKS_HIP; hipLaunchKernelGGL( (poly_floyd_warshall_lam), @@ -172,7 +172,7 @@ void POLYBENCH_FLOYD_WARSHALL::runHipVariant(VariantID vid) POLYBENCH_FLOYD_WARSHALL_TEARDOWN_HIP; } else { - std::cout << "\n POLYBENCH_FLOYD_WARSHALL : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_FLOYD_WARSHALL : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp index 974fa2342..f3cfc0466 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp @@ -18,12 +18,12 @@ //#define USE_RAJA_OMP_COLLAPSE #undef USE_RAJA_OMP_COLLAPSE -namespace rajaperf +namespace rajaperf { namespace polybench { - + void POLYBENCH_FLOYD_WARSHALL::runOpenMPVariant(VariantID vid) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -45,7 +45,7 @@ void POLYBENCH_FLOYD_WARSHALL::runOpenMPVariant(VariantID vid) #else #pragma omp parallel for #endif - for (Index_type i = 0; i < N; ++i) { + for (Index_type i = 0; i < N; ++i) { for (Index_type j = 0; j < N; ++j) { POLYBENCH_FLOYD_WARSHALL_BODY; } @@ -60,7 +60,7 @@ void POLYBENCH_FLOYD_WARSHALL::runOpenMPVariant(VariantID vid) case Lambda_OpenMP : { - auto poly_floydwarshall_base_lam = [=](Index_type k, Index_type i, + auto poly_floydwarshall_base_lam = [=](Index_type k, Index_type i, Index_type j) { POLYBENCH_FLOYD_WARSHALL_BODY; }; @@ -89,9 +89,9 @@ void POLYBENCH_FLOYD_WARSHALL::runOpenMPVariant(VariantID vid) case RAJA_OpenMP : { - POLYBENCH_FLOYD_WARSHALL_VIEWS_RAJA; + POLYBENCH_FLOYD_WARSHALL_VIEWS_RAJA; - auto poly_floydwarshall_lam = [=](Index_type k, Index_type i, + auto poly_floydwarshall_lam = [=](Index_type k, Index_type i, Index_type j) { POLYBENCH_FLOYD_WARSHALL_BODY_RAJA; }; @@ -125,7 +125,7 @@ void POLYBENCH_FLOYD_WARSHALL::runOpenMPVariant(VariantID vid) RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment{0, N}, RAJA::RangeSegment{0, N}, RAJA::RangeSegment{0, N}), - poly_floydwarshall_lam + poly_floydwarshall_lam ); } @@ -135,12 +135,12 @@ void POLYBENCH_FLOYD_WARSHALL::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n POLYBENCH_FLOYD_WARSHALL : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_FLOYD_WARSHALL : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp index d59441b7a..c2e864b93 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp @@ -4,7 +4,7 @@ // See the RAJAPerf/LICENSE file for details. 
// // SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "POLYBENCH_FLOYD_WARSHALL.hpp" @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { @@ -98,7 +98,7 @@ void POLYBENCH_FLOYD_WARSHALL::runOpenMPTargetVariant(VariantID vid) POLYBENCH_FLOYD_WARSHALL_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n POLYBENCH_FLOYD_WARSHALL : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_FLOYD_WARSHALL : Unknown OMP Target variant id = " << vid << std::endl; } } @@ -107,4 +107,4 @@ void POLYBENCH_FLOYD_WARSHALL::runOpenMPTargetVariant(VariantID vid) } // end namespace rajaperf #endif // RAJA_ENABLE_TARGET_OPENMP - + diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp index 1a698dace..cfe5ef88e 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp @@ -12,12 +12,12 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { - + void POLYBENCH_FLOYD_WARSHALL::runSeqVariant(VariantID vid) { const Index_type run_reps= getRunReps(); @@ -31,9 +31,9 @@ void POLYBENCH_FLOYD_WARSHALL::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - for (Index_type k = 0; k < N; ++k) { - for (Index_type i = 0; i < N; ++i) { - for (Index_type j = 0; j < N; ++j) { + for (Index_type k = 0; k < N; ++k) { + for (Index_type i = 0; i < N; ++i) { + for (Index_type j = 0; j < N; ++j) { POLYBENCH_FLOYD_WARSHALL_BODY; } } @@ -49,7 +49,7 @@ void POLYBENCH_FLOYD_WARSHALL::runSeqVariant(VariantID vid) #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { - auto poly_floydwarshall_base_lam = [=](Index_type k, Index_type i, + auto poly_floydwarshall_base_lam = [=](Index_type k, Index_type i, Index_type j) { POLYBENCH_FLOYD_WARSHALL_BODY; }; @@ -73,9 +73,9 @@ void POLYBENCH_FLOYD_WARSHALL::runSeqVariant(VariantID vid) case RAJA_Seq : { - POLYBENCH_FLOYD_WARSHALL_VIEWS_RAJA; + POLYBENCH_FLOYD_WARSHALL_VIEWS_RAJA; - auto poly_floydwarshall_lam = [=](Index_type k, Index_type i, + auto poly_floydwarshall_lam = [=](Index_type k, Index_type i, Index_type j) { POLYBENCH_FLOYD_WARSHALL_BODY_RAJA; }; @@ -97,7 +97,7 @@ void POLYBENCH_FLOYD_WARSHALL::runSeqVariant(VariantID vid) RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment{0, N}, RAJA::RangeSegment{0, N}, RAJA::RangeSegment{0, N}), - poly_floydwarshall_lam + poly_floydwarshall_lam ); } @@ -108,7 +108,7 @@ void POLYBENCH_FLOYD_WARSHALL::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n POLYBENCH_FLOYD_WARSHALL : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_FLOYD_WARSHALL : Unknown variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_GEMM-Cuda.cpp b/src/polybench/POLYBENCH_GEMM-Cuda.cpp index ae586d1f4..994395c39 100644 --- a/src/polybench/POLYBENCH_GEMM-Cuda.cpp +++ b/src/polybench/POLYBENCH_GEMM-Cuda.cpp @@ -93,7 +93,7 @@ void POLYBENCH_GEMM::runCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { POLY_GEMM_THREADS_PER_BLOCK_CUDA; - POLY_GEMM_NBLOCKS_CUDA; + POLY_GEMM_NBLOCKS_CUDA; poly_gemm<<>>(C, A, B, alpha, beta, @@ -192,7 +192,7 @@ void POLYBENCH_GEMM::runCudaVariant(VariantID vid) POLYBENCH_GEMM_TEARDOWN_CUDA; } else { - std::cout << "\n POLYBENCH_GEMM : Unknown Cuda variant id = " 
<< vid << std::endl; + getCout() << "\n POLYBENCH_GEMM : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_GEMM-Hip.cpp b/src/polybench/POLYBENCH_GEMM-Hip.cpp index 2d07f0a86..a4044ccb2 100644 --- a/src/polybench/POLYBENCH_GEMM-Hip.cpp +++ b/src/polybench/POLYBENCH_GEMM-Hip.cpp @@ -93,9 +93,9 @@ void POLYBENCH_GEMM::runHipVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { POLY_GEMM_THREADS_PER_BLOCK_HIP; - POLY_GEMM_NBLOCKS_HIP; + POLY_GEMM_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_gemm), + hipLaunchKernelGGL((poly_gemm), dim3(nblocks), dim3(nthreads_per_block), 0, 0, C, A, B, alpha, beta, ni, nj, nk); @@ -114,7 +114,7 @@ void POLYBENCH_GEMM::runHipVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { POLY_GEMM_THREADS_PER_BLOCK_HIP; - POLY_GEMM_NBLOCKS_HIP; + POLY_GEMM_NBLOCKS_HIP; auto poly_gemm_lambda = [=] __device__ (Index_type i, Index_type j) { POLYBENCH_GEMM_BODY1; @@ -125,7 +125,7 @@ void POLYBENCH_GEMM::runHipVariant(VariantID vid) POLYBENCH_GEMM_BODY4; }; - hipLaunchKernelGGL((poly_gemm_lam), + hipLaunchKernelGGL((poly_gemm_lam), dim3(nblocks), dim3(nthreads_per_block), 0, 0, ni, nj, poly_gemm_lambda); hipErrchk( hipGetLastError() ); @@ -195,7 +195,7 @@ void POLYBENCH_GEMM::runHipVariant(VariantID vid) POLYBENCH_GEMM_TEARDOWN_HIP; } else { - std::cout << "\n POLYBENCH_GEMM : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_GEMM : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_GEMM-OMP.cpp b/src/polybench/POLYBENCH_GEMM-OMP.cpp index ece6d4b22..9195af832 100644 --- a/src/polybench/POLYBENCH_GEMM-OMP.cpp +++ b/src/polybench/POLYBENCH_GEMM-OMP.cpp @@ -13,7 +13,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { @@ -70,7 +70,7 @@ void POLYBENCH_GEMM::runOpenMPVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp parallel for collapse(2) - for (Index_type i = 0; i < ni; ++i ) { + for (Index_type i = 0; i < ni; ++i ) { for (Index_type j = 0; j < nj; ++j ) { POLYBENCH_GEMM_BODY1; poly_gemm_base_lam2(i, j); @@ -123,7 +123,7 @@ void POLYBENCH_GEMM::runOpenMPVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { RAJA::kernel_param( - + RAJA::make_tuple( RAJA::RangeSegment{0, ni}, RAJA::RangeSegment{0, nj}, RAJA::RangeSegment{0, nk} ), @@ -143,12 +143,12 @@ void POLYBENCH_GEMM::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n POLYBENCH_GEMM : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_GEMM : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp b/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp index 63314c8b3..0570e3ad3 100644 --- a/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp @@ -4,7 +4,7 @@ // See the RAJAPerf/LICENSE file for details. 
// // SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "POLYBENCH_GEMM.hpp" @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { @@ -103,11 +103,11 @@ void POLYBENCH_GEMM::runOpenMPTargetVariant(VariantID vid) [=] (Index_type i, Index_type j) { POLYBENCH_GEMM_BODY2_RAJA; }, - [=] (Index_type i, Index_type j, Index_type k, + [=] (Index_type i, Index_type j, Index_type k, Real_type& dot) { POLYBENCH_GEMM_BODY3_RAJA; }, - [=] (Index_type i, Index_type j, + [=] (Index_type i, Index_type j, Real_type& dot) { POLYBENCH_GEMM_BODY4_RAJA; } @@ -119,7 +119,7 @@ void POLYBENCH_GEMM::runOpenMPTargetVariant(VariantID vid) POLYBENCH_GEMM_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n POLYBENCH_GEMM : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_GEMM : Unknown OMP Target variant id = " << vid << std::endl; } } @@ -128,4 +128,4 @@ void POLYBENCH_GEMM::runOpenMPTargetVariant(VariantID vid) } // end namespace rajaperf #endif // RAJA_ENABLE_TARGET_OPENMP - + diff --git a/src/polybench/POLYBENCH_GEMM-Seq.cpp b/src/polybench/POLYBENCH_GEMM-Seq.cpp index fe42374fb..e28973b2b 100644 --- a/src/polybench/POLYBENCH_GEMM-Seq.cpp +++ b/src/polybench/POLYBENCH_GEMM-Seq.cpp @@ -13,7 +13,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { @@ -32,7 +32,7 @@ void POLYBENCH_GEMM::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - for (Index_type i = 0; i < ni; ++i ) { + for (Index_type i = 0; i < ni; ++i ) { for (Index_type j = 0; j < nj; ++j ) { POLYBENCH_GEMM_BODY1; POLYBENCH_GEMM_BODY2; @@ -94,7 +94,7 @@ void POLYBENCH_GEMM::runSeqVariant(VariantID vid) auto poly_gemm_lam2 = [=](Index_type i, Index_type j) { POLYBENCH_GEMM_BODY2_RAJA; }; - auto poly_gemm_lam3 = [=](Index_type i, Index_type j, Index_type k, + auto poly_gemm_lam3 = [=](Index_type i, Index_type j, Index_type k, Real_type& dot) { POLYBENCH_GEMM_BODY3_RAJA; }; @@ -121,7 +121,7 @@ void POLYBENCH_GEMM::runSeqVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { RAJA::kernel_param( - + RAJA::make_tuple( RAJA::RangeSegment{0, ni}, RAJA::RangeSegment{0, nj}, RAJA::RangeSegment{0, nk} ), @@ -142,7 +142,7 @@ void POLYBENCH_GEMM::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n POLYBENCH_GEMM : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_GEMM : Unknown variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_GEMVER-Cuda.cpp b/src/polybench/POLYBENCH_GEMVER-Cuda.cpp index 8d2ddca87..5e2bae18e 100644 --- a/src/polybench/POLYBENCH_GEMVER-Cuda.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Cuda.cpp @@ -250,7 +250,7 @@ void POLYBENCH_GEMVER::runCudaVariant(VariantID vid) > > > - >; + >; using EXEC_POL24 = RAJA::KernelPolicy< @@ -324,7 +324,7 @@ void POLYBENCH_GEMVER::runCudaVariant(VariantID vid) POLYBENCH_GEMVER_TEARDOWN_CUDA; } else { - std::cout << "\n POLYBENCH_GEMVER : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_GEMVER : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_GEMVER-Hip.cpp b/src/polybench/POLYBENCH_GEMVER-Hip.cpp index 469f620a3..4ff4dafc9 100644 --- a/src/polybench/POLYBENCH_GEMVER-Hip.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Hip.cpp @@ -151,24 +151,24 @@ void 
POLYBENCH_GEMVER::runHipVariant(VariantID vid) GEMVER_THREADS_PER_BLOCK_HIP; GEMVER_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_gemmver_1), + hipLaunchKernelGGL((poly_gemmver_1), dim3(nblocks1), dim3(nthreads_per_block1), 0, 0, A, u1, v1, u2, v2, n); hipErrchk( hipGetLastError() ); size_t grid_size = RAJA_DIVIDE_CEILING_INT(m_n, block_size); - hipLaunchKernelGGL((poly_gemmver_2), + hipLaunchKernelGGL((poly_gemmver_2), dim3(grid_size), dim3(block_size), 0, 0, A, x, y, beta, n); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_gemmver_3), + hipLaunchKernelGGL((poly_gemmver_3), dim3(grid_size), dim3(block_size), 0, 0, x, z, n); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_gemmver_4), + hipLaunchKernelGGL((poly_gemmver_4), dim3(grid_size), dim3(block_size), 0, 0, A, x, w, alpha, n); hipErrchk( hipGetLastError() ); @@ -194,7 +194,7 @@ void POLYBENCH_GEMVER::runHipVariant(VariantID vid) hipLaunchKernelGGL(poly_gemmver_1_lam, dim3(nblocks1), dim3(nthreads_per_block1), 0, 0, - n, poly_gemmver_1_lambda); + n, poly_gemmver_1_lambda); hipErrchk( hipGetLastError() ); size_t grid_size = RAJA_DIVIDE_CEILING_INT(n, block_size); @@ -260,7 +260,7 @@ void POLYBENCH_GEMVER::runHipVariant(VariantID vid) > > > - >; + >; using EXEC_POL24 = RAJA::KernelPolicy< @@ -334,7 +334,7 @@ void POLYBENCH_GEMVER::runHipVariant(VariantID vid) POLYBENCH_GEMVER_TEARDOWN_HIP; } else { - std::cout << "\n POLYBENCH_GEMVER : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_GEMVER : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_GEMVER-OMP.cpp b/src/polybench/POLYBENCH_GEMVER-OMP.cpp index ba3ad5457..3b5d911b9 100644 --- a/src/polybench/POLYBENCH_GEMVER-OMP.cpp +++ b/src/polybench/POLYBENCH_GEMVER-OMP.cpp @@ -14,7 +14,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { @@ -49,14 +49,14 @@ void POLYBENCH_GEMVER::runOpenMPVariant(VariantID vid) POLYBENCH_GEMVER_BODY3; } POLYBENCH_GEMVER_BODY4; - } + } - #pragma omp parallel for + #pragma omp parallel for for (Index_type i = 0; i < n; i++ ) { POLYBENCH_GEMVER_BODY5; } - #pragma omp parallel for + #pragma omp parallel for for (Index_type i = 0; i < n; i++ ) { POLYBENCH_GEMVER_BODY6; for (Index_type j = 0; j < n; j++) { @@ -215,7 +215,7 @@ void POLYBENCH_GEMVER::runOpenMPVariant(VariantID vid) poly_gemver_lam7, poly_gemver_lam8 - ); + ); } stopTimer(); @@ -224,12 +224,12 @@ void POLYBENCH_GEMVER::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n POLYBENCH_GEMVER : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_GEMVER : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp b/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp index 7b13712f1..5b256729e 100644 --- a/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp @@ -38,7 +38,7 @@ namespace polybench allocAndInitOpenMPDeviceData(w, m_w, m_n, did, hid); \ allocAndInitOpenMPDeviceData(x, m_x, m_n, did, hid); \ allocAndInitOpenMPDeviceData(y, m_y, m_n, did, hid); \ - allocAndInitOpenMPDeviceData(z, m_z, m_n, did, hid); + allocAndInitOpenMPDeviceData(z, m_z, m_n, did, hid); #define POLYBENCH_GEMVER_DATA_TEARDOWN_OMP_TARGET \ getOpenMPDeviceData(m_w, w, m_n, hid, did); \ @@ -50,9 +50,9 @@ namespace polybench deallocOpenMPDeviceData(w, did); \ deallocOpenMPDeviceData(x, did); \ deallocOpenMPDeviceData(y, did); \ - deallocOpenMPDeviceData(z, did); + 
deallocOpenMPDeviceData(z, did); + - void POLYBENCH_GEMVER::runOpenMPTargetVariant(VariantID vid) { @@ -77,7 +77,7 @@ void POLYBENCH_GEMVER::runOpenMPTargetVariant(VariantID vid) #pragma omp target is_device_ptr(A,x,y) device( did ) #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) - for (Index_type i = 0; i < n; i++) { + for (Index_type i = 0; i < n; i++) { POLYBENCH_GEMVER_BODY2; for (Index_type j = 0; j < n; j++) { POLYBENCH_GEMVER_BODY3; @@ -86,7 +86,7 @@ void POLYBENCH_GEMVER::runOpenMPTargetVariant(VariantID vid) } #pragma omp target is_device_ptr(x,z) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type i = 0; i < n; i++) { POLYBENCH_GEMVER_BODY5; } @@ -102,7 +102,7 @@ void POLYBENCH_GEMVER::runOpenMPTargetVariant(VariantID vid) } } // end run_reps - stopTimer(); + stopTimer(); POLYBENCH_GEMVER_DATA_TEARDOWN_OMP_TARGET; @@ -187,7 +187,7 @@ void POLYBENCH_GEMVER::runOpenMPTargetVariant(VariantID vid) POLYBENCH_GEMVER_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n POLYBENCH_GEMVER : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_GEMVER : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_GEMVER-Seq.cpp b/src/polybench/POLYBENCH_GEMVER-Seq.cpp index 9498d1355..096dd9d56 100644 --- a/src/polybench/POLYBENCH_GEMVER-Seq.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Seq.cpp @@ -14,7 +14,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { @@ -39,7 +39,7 @@ void POLYBENCH_GEMVER::runSeqVariant(VariantID vid) } } - for (Index_type i = 0; i < n; i++ ) { + for (Index_type i = 0; i < n; i++ ) { POLYBENCH_GEMVER_BODY2; for (Index_type j = 0; j < n; j++) { POLYBENCH_GEMVER_BODY3; @@ -47,11 +47,11 @@ void POLYBENCH_GEMVER::runSeqVariant(VariantID vid) POLYBENCH_GEMVER_BODY4; } - for (Index_type i = 0; i < n; i++ ) { + for (Index_type i = 0; i < n; i++ ) { POLYBENCH_GEMVER_BODY5; } - for (Index_type i = 0; i < n; i++ ) { + for (Index_type i = 0; i < n; i++ ) { POLYBENCH_GEMVER_BODY6; for (Index_type j = 0; j < n; j++) { POLYBENCH_GEMVER_BODY7; @@ -71,7 +71,7 @@ void POLYBENCH_GEMVER::runSeqVariant(VariantID vid) auto poly_gemver_base_lam1 = [=](Index_type i, Index_type j) { POLYBENCH_GEMVER_BODY1; }; - auto poly_gemver_base_lam3 = [=](Index_type i, Index_type j, + auto poly_gemver_base_lam3 = [=](Index_type i, Index_type j, Real_type &dot) { POLYBENCH_GEMVER_BODY3; }; @@ -81,7 +81,7 @@ void POLYBENCH_GEMVER::runSeqVariant(VariantID vid) auto poly_gemver_base_lam5 = [=](Index_type i) { POLYBENCH_GEMVER_BODY5; }; - auto poly_gemver_base_lam7 = [=](Index_type i, Index_type j, + auto poly_gemver_base_lam7 = [=](Index_type i, Index_type j, Real_type &dot) { POLYBENCH_GEMVER_BODY7; }; @@ -182,8 +182,8 @@ void POLYBENCH_GEMVER::runSeqVariant(VariantID vid) RAJA::RangeSegment{0, n}), poly_gemver_lam1 ); - - RAJA::kernel_param( + + RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment{0, n}, RAJA::RangeSegment{0, n}), RAJA::tuple{0.0}, @@ -192,12 +192,12 @@ void POLYBENCH_GEMVER::runSeqVariant(VariantID vid) poly_gemver_lam3, poly_gemver_lam4 ); - - RAJA::forall (RAJA::RangeSegment{0, n}, + + RAJA::forall (RAJA::RangeSegment{0, n}, poly_gemver_lam5 ); - RAJA::kernel_param( + RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment{0, n}, RAJA::RangeSegment{0, n}), RAJA::tuple{0.0}, @@ -207,16 
+207,16 @@ void POLYBENCH_GEMVER::runSeqVariant(VariantID vid) poly_gemver_lam8 ); - + } stopTimer(); - + break; } #endif // RUN_RAJA_SEQ default : { - std::cout << "\n POLYBENCH_GEMVER : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_GEMVER : Unknown variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp b/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp index 3fdac4fd8..b9f29ce28 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp @@ -135,7 +135,7 @@ void POLYBENCH_GESUMMV::runCudaVariant(VariantID vid) POLYBENCH_GESUMMV_TEARDOWN_CUDA; } else { - std::cout << "\n POLYBENCH_GESUMMV : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_GESUMMV : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_GESUMMV-Hip.cpp b/src/polybench/POLYBENCH_GESUMMV-Hip.cpp index 1fec5379b..e4e30ce76 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Hip.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Hip.cpp @@ -73,7 +73,7 @@ void POLYBENCH_GESUMMV::runHipVariant(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); - hipLaunchKernelGGL((poly_gesummv), + hipLaunchKernelGGL((poly_gesummv), dim3(grid_size), dim3(block_size),0,0, x, y, A, B, @@ -137,7 +137,7 @@ void POLYBENCH_GESUMMV::runHipVariant(VariantID vid) POLYBENCH_GESUMMV_TEARDOWN_HIP; } else { - std::cout << "\n POLYBENCH_GESUMMV : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_GESUMMV : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_GESUMMV-OMP.cpp b/src/polybench/POLYBENCH_GESUMMV-OMP.cpp index 8e46f7691..4fc4896ed 100644 --- a/src/polybench/POLYBENCH_GESUMMV-OMP.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-OMP.cpp @@ -13,12 +13,12 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { - + void POLYBENCH_GESUMMV::runOpenMPVariant(VariantID vid) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -111,7 +111,7 @@ void POLYBENCH_GESUMMV::runOpenMPVariant(VariantID vid) RAJA::kernel_param( RAJA::make_tuple( RAJA::RangeSegment{0, N}, RAJA::RangeSegment{0, N} ), - RAJA::make_tuple(static_cast(0.0), + RAJA::make_tuple(static_cast(0.0), static_cast(0.0)), poly_gesummv_lam1, @@ -126,12 +126,12 @@ void POLYBENCH_GESUMMV::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n POLYBENCH_GESUMMV : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_GESUMMV : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp b/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp index 5fd39ea33..299d4b347 100644 --- a/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp @@ -4,7 +4,7 @@ // See the RAJAPerf/LICENSE file for details. 
// // SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "POLYBENCH_GESUMMV.hpp" @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { @@ -118,7 +118,7 @@ void POLYBENCH_GESUMMV::runOpenMPTargetVariant(VariantID vid) POLYBENCH_GESUMMV_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n POLYBENCH_GESUMMV : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_GESUMMV : Unknown OMP Target variant id = " << vid << std::endl; } } @@ -127,4 +127,4 @@ void POLYBENCH_GESUMMV::runOpenMPTargetVariant(VariantID vid) } // end namespace rajaperf #endif // RAJA_ENABLE_TARGET_OPENMP - + diff --git a/src/polybench/POLYBENCH_GESUMMV-Seq.cpp b/src/polybench/POLYBENCH_GESUMMV-Seq.cpp index c769da219..4a488029b 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Seq.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Seq.cpp @@ -13,7 +13,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { @@ -31,7 +31,7 @@ void POLYBENCH_GESUMMV::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - for (Index_type i = 0; i < N; ++i ) { + for (Index_type i = 0; i < N; ++i ) { POLYBENCH_GESUMMV_BODY1; for (Index_type j = 0; j < N; ++j ) { POLYBENCH_GESUMMV_BODY2; @@ -49,7 +49,7 @@ void POLYBENCH_GESUMMV::runSeqVariant(VariantID vid) #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { - auto poly_gesummv_base_lam2 = [=](Index_type i, Index_type j, + auto poly_gesummv_base_lam2 = [=](Index_type i, Index_type j, Real_type& tmpdot, Real_type& ydot) { POLYBENCH_GESUMMV_BODY2; }; @@ -82,7 +82,7 @@ void POLYBENCH_GESUMMV::runSeqVariant(VariantID vid) auto poly_gesummv_lam1 = [=](Real_type& tmpdot, Real_type& ydot) { POLYBENCH_GESUMMV_BODY1_RAJA; }; - auto poly_gesummv_lam2 = [=](Index_type i, Index_type j, + auto poly_gesummv_lam2 = [=](Index_type i, Index_type j, Real_type& tmpdot, Real_type& ydot) { POLYBENCH_GESUMMV_BODY2_RAJA; }; @@ -108,7 +108,7 @@ void POLYBENCH_GESUMMV::runSeqVariant(VariantID vid) RAJA::kernel_param( RAJA::make_tuple( RAJA::RangeSegment{0, N}, RAJA::RangeSegment{0, N} ), - RAJA::make_tuple(static_cast(0.0), + RAJA::make_tuple(static_cast(0.0), static_cast(0.0)), poly_gesummv_lam1, @@ -124,7 +124,7 @@ void POLYBENCH_GESUMMV::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n POLYBENCH_GESUMMV : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_GESUMMV : Unknown variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp b/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp index aaad94dcf..17eb3bb88 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp @@ -203,7 +203,7 @@ void POLYBENCH_HEAT_3D::runCudaVariant(VariantID vid) POLYBENCH_HEAT_3D_TEARDOWN_CUDA; } else { - std::cout << "\n POLYBENCH_HEAT_3D : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_HEAT_3D : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp b/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp index c76e9cfe1..048d2ab43 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp @@ -102,13 +102,13 @@ void POLYBENCH_HEAT_3D::runHipVariant(VariantID vid) HEAT_3D_THREADS_PER_BLOCK_HIP; HEAT_3D_NBLOCKS_HIP; - 
hipLaunchKernelGGL((poly_heat_3D_1), + hipLaunchKernelGGL((poly_heat_3D_1), dim3(nblocks), dim3(nthreads_per_block), 0, 0, A, B, N); hipErrchk( hipGetLastError() ); hipLaunchKernelGGL((poly_heat_3D_2), - dim3(nblocks), dim3(nthreads_per_block), 0, 0, + dim3(nblocks), dim3(nthreads_per_block), 0, 0, A, B, N); hipErrchk( hipGetLastError() ); @@ -211,7 +211,7 @@ void POLYBENCH_HEAT_3D::runHipVariant(VariantID vid) POLYBENCH_HEAT_3D_TEARDOWN_HIP; } else { - std::cout << "\n POLYBENCH_HEAT_3D : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_HEAT_3D : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp b/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp index 21d454911..7bf354c65 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp @@ -13,12 +13,12 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { - + void POLYBENCH_HEAT_3D::runOpenMPVariant(VariantID vid) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -149,17 +149,17 @@ void POLYBENCH_HEAT_3D::runOpenMPVariant(VariantID vid) stopTimer(); POLYBENCH_HEAT_3D_DATA_RESET; - + break; } default : { - std::cout << "\n POLYBENCH_HEAT_3D : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_HEAT_3D : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp b/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp index 765a931d2..bc6fe97aa 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp @@ -4,7 +4,7 @@ // See the RAJAPerf/LICENSE file for details. // // SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "POLYBENCH_HEAT_3D.hpp" @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { @@ -121,7 +121,7 @@ void POLYBENCH_HEAT_3D::runOpenMPTargetVariant(VariantID vid) POLYBENCH_HEAT_3D_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n POLYBENCH_HEAT_3D : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_HEAT_3D : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp b/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp index ce2d76435..2a93d39b8 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp @@ -13,7 +13,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { @@ -32,19 +32,19 @@ void POLYBENCH_HEAT_3D::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - for (Index_type t = 0; t < tsteps; ++t) { + for (Index_type t = 0; t < tsteps; ++t) { - for (Index_type i = 1; i < N-1; ++i ) { - for (Index_type j = 1; j < N-1; ++j ) { - for (Index_type k = 1; k < N-1; ++k ) { + for (Index_type i = 1; i < N-1; ++i ) { + for (Index_type j = 1; j < N-1; ++j ) { + for (Index_type k = 1; k < N-1; ++k ) { POLYBENCH_HEAT_3D_BODY1; } } } - for (Index_type i = 1; i < N-1; ++i ) { - for (Index_type j = 1; j < N-1; ++j ) { - for (Index_type k = 1; k < N-1; ++k ) { + for (Index_type i = 1; i < N-1; ++i ) { + for (Index_type j = 1; j < N-1; ++j ) { + for (Index_type k = 1; k < N-1; ++k ) { POLYBENCH_HEAT_3D_BODY2; } } @@ -63,11 +63,11 @@ void 
POLYBENCH_HEAT_3D::runSeqVariant(VariantID vid) #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { - auto poly_heat3d_base_lam1 = [=](Index_type i, Index_type j, + auto poly_heat3d_base_lam1 = [=](Index_type i, Index_type j, Index_type k) { POLYBENCH_HEAT_3D_BODY1; }; - auto poly_heat3d_base_lam2 = [=](Index_type i, Index_type j, + auto poly_heat3d_base_lam2 = [=](Index_type i, Index_type j, Index_type k) { POLYBENCH_HEAT_3D_BODY2; }; @@ -155,7 +155,7 @@ void POLYBENCH_HEAT_3D::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n POLYBENCH_HEAT_3D : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_HEAT_3D : Unknown variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp index 35f104444..c961478c5 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp @@ -116,7 +116,7 @@ void POLYBENCH_JACOBI_1D::runCudaVariant(VariantID vid) POLYBENCH_JACOBI_1D_TEARDOWN_CUDA; } else { - std::cout << "\n POLYBENCH_JACOBI_1D : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_JACOBI_1D : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp index d566cd430..eb7d4e1bb 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp @@ -118,7 +118,7 @@ void POLYBENCH_JACOBI_1D::runHipVariant(VariantID vid) POLYBENCH_JACOBI_1D_TEARDOWN_HIP; } else { - std::cout << "\n POLYBENCH_JACOBI_1D : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_JACOBI_1D : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp b/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp index d709d7d04..683cea5ba 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp @@ -13,12 +13,12 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { - + void POLYBENCH_JACOBI_1D::runOpenMPVariant(VariantID vid) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -114,12 +114,12 @@ void POLYBENCH_JACOBI_1D::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n POLYBENCH_JACOBI_1D : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_JACOBI_1D : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp b/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp index 774289a68..d281ff310 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp @@ -4,7 +4,7 @@ // See the RAJAPerf/LICENSE file for details. 
// // SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "POLYBENCH_JACOBI_1D.hpp" @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { @@ -55,7 +55,7 @@ void POLYBENCH_JACOBI_1D::runOpenMPTargetVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type t = 0; t < tsteps; ++t) { - + #pragma omp target is_device_ptr(A,B) device( did ) #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type i = 1; i < N-1; ++i ) { @@ -101,7 +101,7 @@ void POLYBENCH_JACOBI_1D::runOpenMPTargetVariant(VariantID vid) POLYBENCH_JACOBI_1D_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n POLYBENCH_JACOBI_1D : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_JACOBI_1D : Unknown OMP Target variant id = " << vid << std::endl; } } @@ -110,4 +110,4 @@ void POLYBENCH_JACOBI_1D::runOpenMPTargetVariant(VariantID vid) } // end namespace rajaperf #endif // RAJA_ENABLE_TARGET_OPENMP - + diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp index 3592b3daf..a022c2981 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp @@ -13,12 +13,12 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { - + void POLYBENCH_JACOBI_1D::runSeqVariant(VariantID vid) { const Index_type run_reps= getRunReps(); @@ -39,12 +39,12 @@ void POLYBENCH_JACOBI_1D::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - for (Index_type t = 0; t < tsteps; ++t) { + for (Index_type t = 0; t < tsteps; ++t) { - for (Index_type i = 1; i < N-1; ++i ) { + for (Index_type i = 1; i < N-1; ++i ) { POLYBENCH_JACOBI_1D_BODY1; } - for (Index_type i = 1; i < N-1; ++i ) { + for (Index_type i = 1; i < N-1; ++i ) { POLYBENCH_JACOBI_1D_BODY2; } @@ -95,7 +95,7 @@ void POLYBENCH_JACOBI_1D::runSeqVariant(VariantID vid) poly_jacobi1d_lam1 ); - RAJA::forall ( RAJA::RangeSegment{1, N-1}, + RAJA::forall ( RAJA::RangeSegment{1, N-1}, poly_jacobi1d_lam2 ); @@ -111,7 +111,7 @@ void POLYBENCH_JACOBI_1D::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n POLYBENCH_JACOBI_1D : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_JACOBI_1D : Unknown variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp index a32a9cce6..83b476abf 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp @@ -165,7 +165,7 @@ void POLYBENCH_JACOBI_2D::runCudaVariant(VariantID vid) > > > - >; + >; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -194,7 +194,7 @@ void POLYBENCH_JACOBI_2D::runCudaVariant(VariantID vid) POLYBENCH_JACOBI_2D_TEARDOWN_CUDA; } else { - std::cout << "\n POLYBENCH_JACOBI_2D : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_JACOBI_2D : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp index dd7230205..e58d7ea27 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp @@ -127,7 +127,7 @@ void 
POLYBENCH_JACOBI_2D::runHipVariant(VariantID vid) JACOBI_2D_THREADS_PER_BLOCK_HIP; JACOBI_2D_NBLOCKS_HIP; - auto poly_jacobi_2D_1_lambda = + auto poly_jacobi_2D_1_lambda = [=] __device__ (Index_type i, Index_type j) { POLYBENCH_JACOBI_2D_BODY1; }; @@ -137,7 +137,7 @@ void POLYBENCH_JACOBI_2D::runHipVariant(VariantID vid) N, poly_jacobi_2D_1_lambda); hipErrchk( hipGetLastError() ); - auto poly_jacobi_2D_2_lambda = + auto poly_jacobi_2D_2_lambda = [=] __device__ (Index_type i, Index_type j) { POLYBENCH_JACOBI_2D_BODY2; }; @@ -204,7 +204,7 @@ void POLYBENCH_JACOBI_2D::runHipVariant(VariantID vid) POLYBENCH_JACOBI_2D_TEARDOWN_HIP; } else { - std::cout << "\n POLYBENCH_JACOBI_2D : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_JACOBI_2D : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp b/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp index e8a7b80ea..69e6e1d13 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp @@ -13,7 +13,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { @@ -37,19 +37,19 @@ void POLYBENCH_JACOBI_2D::runOpenMPVariant(VariantID vid) for (Index_type t = 0; t < tsteps; ++t) { #pragma omp parallel for - for (Index_type i = 1; i < N-1; ++i ) { - for (Index_type j = 1; j < N-1; ++j ) { + for (Index_type i = 1; i < N-1; ++i ) { + for (Index_type j = 1; j < N-1; ++j ) { POLYBENCH_JACOBI_2D_BODY1; } } #pragma omp parallel for - for (Index_type i = 1; i < N-1; ++i ) { - for (Index_type j = 1; j < N-1; ++j ) { + for (Index_type i = 1; i < N-1; ++i ) { + for (Index_type j = 1; j < N-1; ++j ) { POLYBENCH_JACOBI_2D_BODY2; } } - + } } @@ -59,7 +59,7 @@ void POLYBENCH_JACOBI_2D::runOpenMPVariant(VariantID vid) break; } - + case Lambda_OpenMP : { auto poly_jacobi2d_base_lam1 = [=](Index_type i, Index_type j) { @@ -146,12 +146,12 @@ void POLYBENCH_JACOBI_2D::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n POLYBENCH_JACOBI_2D : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_JACOBI_2D : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp b/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp index 91d5122cc..36afb8957 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp @@ -4,7 +4,7 @@ // See the RAJAPerf/LICENSE file for details. 
// // SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "POLYBENCH_JACOBI_2D.hpp" @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { @@ -58,14 +58,14 @@ void POLYBENCH_JACOBI_2D::runOpenMPTargetVariant(VariantID vid) POLYBENCH_JACOBI_2D_BODY1; } } - + #pragma omp target is_device_ptr(A,B) device( did ) #pragma omp teams distribute parallel for schedule(static, 1) collapse(2) for (Index_type i = 1; i < N-1; ++i ) { for (Index_type j = 1; j < N-1; ++j ) { POLYBENCH_JACOBI_2D_BODY2; } - } + } } @@ -83,11 +83,11 @@ void POLYBENCH_JACOBI_2D::runOpenMPTargetVariant(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::Collapse, + RAJA::ArgList<0, 1>, RAJA::statement::Lambda<0> >, RAJA::statement::Collapse, + RAJA::ArgList<0, 1>, RAJA::statement::Lambda<1> > >; @@ -115,7 +115,7 @@ void POLYBENCH_JACOBI_2D::runOpenMPTargetVariant(VariantID vid) POLYBENCH_JACOBI_2D_TEARDOWN_CUDA; } else { - std::cout << "\n POLYBENCH_JACOBI_2D : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_JACOBI_2D : Unknown OMP Target variant id = " << vid << std::endl; } } @@ -124,4 +124,4 @@ void POLYBENCH_JACOBI_2D::runOpenMPTargetVariant(VariantID vid) } // end namespace rajaperf #endif // RAJA_ENABLE_TARGET_OPENMP - + diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp index aef47da43..87e8e8e15 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp @@ -13,7 +13,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { @@ -32,15 +32,15 @@ void POLYBENCH_JACOBI_2D::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - for (Index_type t = 0; t < tsteps; ++t) { + for (Index_type t = 0; t < tsteps; ++t) { - for (Index_type i = 1; i < N-1; ++i ) { - for (Index_type j = 1; j < N-1; ++j ) { + for (Index_type i = 1; i < N-1; ++i ) { + for (Index_type j = 1; j < N-1; ++j ) { POLYBENCH_JACOBI_2D_BODY1; } } - for (Index_type i = 1; i < N-1; ++i ) { - for (Index_type j = 1; j < N-1; ++j ) { + for (Index_type i = 1; i < N-1; ++i ) { + for (Index_type j = 1; j < N-1; ++j ) { POLYBENCH_JACOBI_2D_BODY2; } } @@ -142,7 +142,7 @@ void POLYBENCH_JACOBI_2D::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n POLYBENCH_JACOBI_2D : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_JACOBI_2D : Unknown variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_MVT-Cuda.cpp b/src/polybench/POLYBENCH_MVT-Cuda.cpp index 2a59f018f..67c678be7 100644 --- a/src/polybench/POLYBENCH_MVT-Cuda.cpp +++ b/src/polybench/POLYBENCH_MVT-Cuda.cpp @@ -174,7 +174,7 @@ void POLYBENCH_MVT::runCudaVariant(VariantID vid) POLYBENCH_MVT_TEARDOWN_CUDA; } else { - std::cout << "\n POLYBENCH_MVT : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_MVT : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_MVT-Hip.cpp b/src/polybench/POLYBENCH_MVT-Hip.cpp index 00619eee5..488bc8885 100644 --- a/src/polybench/POLYBENCH_MVT-Hip.cpp +++ b/src/polybench/POLYBENCH_MVT-Hip.cpp @@ -172,7 +172,7 @@ void POLYBENCH_MVT::runHipVariant(VariantID vid) POLYBENCH_MVT_TEARDOWN_HIP; } else { - std::cout << "\n POLYBENCH_MVT : Unknown Hip variant id = " 
<< vid << std::endl; + getCout() << "\n POLYBENCH_MVT : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/polybench/POLYBENCH_MVT-OMP.cpp b/src/polybench/POLYBENCH_MVT-OMP.cpp index 10b920848..0023d0684 100644 --- a/src/polybench/POLYBENCH_MVT-OMP.cpp +++ b/src/polybench/POLYBENCH_MVT-OMP.cpp @@ -13,12 +13,12 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { - + void POLYBENCH_MVT::runOpenMPVariant(VariantID vid) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -38,7 +38,7 @@ void POLYBENCH_MVT::runOpenMPVariant(VariantID vid) { #pragma omp for schedule(static) nowait - for (Index_type i = 0; i < N; ++i ) { + for (Index_type i = 0; i < N; ++i ) { POLYBENCH_MVT_BODY1; for (Index_type j = 0; j < N; ++j ) { POLYBENCH_MVT_BODY2; @@ -47,7 +47,7 @@ void POLYBENCH_MVT::runOpenMPVariant(VariantID vid) } #pragma omp for schedule(static) nowait - for (Index_type i = 0; i < N; ++i ) { + for (Index_type i = 0; i < N; ++i ) { POLYBENCH_MVT_BODY4; for (Index_type j = 0; j < N; ++j ) { POLYBENCH_MVT_BODY5; @@ -140,7 +140,7 @@ void POLYBENCH_MVT::runOpenMPVariant(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::For<0, RAJA::omp_for_nowait_static_exec< >, // i - RAJA::statement::Lambda<0, RAJA::Params<0>>, + RAJA::statement::Lambda<0, RAJA::Params<0>>, RAJA::statement::For<1, RAJA::loop_exec, // j RAJA::statement::Lambda<1, RAJA::Segs<0,1>, RAJA::Params<0>> >, @@ -157,22 +157,22 @@ void POLYBENCH_MVT::runOpenMPVariant(VariantID vid) RAJA::make_tuple(RAJA::RangeSegment{0, N}, RAJA::RangeSegment{0, N}), RAJA::tuple{0.0}, - + poly_mvt_lam1, poly_mvt_lam2, poly_mvt_lam3 - + ); RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment{0, N}, RAJA::RangeSegment{0, N}), RAJA::tuple{0.0}, - + poly_mvt_lam4, - poly_mvt_lam5, + poly_mvt_lam5, poly_mvt_lam6 - + ); }); // end omp parallel region @@ -184,12 +184,12 @@ void POLYBENCH_MVT::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n POLYBENCH_MVT : Unknown variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_MVT : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/polybench/POLYBENCH_MVT-OMPTarget.cpp b/src/polybench/POLYBENCH_MVT-OMPTarget.cpp index 4fe035e01..8eb198ea5 100644 --- a/src/polybench/POLYBENCH_MVT-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_MVT-OMPTarget.cpp @@ -4,7 +4,7 @@ // See the RAJAPerf/LICENSE file for details. 
// // SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "POLYBENCH_MVT.hpp" @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { @@ -149,7 +149,7 @@ void POLYBENCH_MVT::runOpenMPTargetVariant(VariantID vid) POLYBENCH_MVT_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n POLYBENCH_MVT : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n POLYBENCH_MVT : Unknown OMP Target variant id = " << vid << std::endl; } } @@ -158,4 +158,4 @@ void POLYBENCH_MVT::runOpenMPTargetVariant(VariantID vid) } // end namespace rajaperf #endif // RAJA_ENABLE_TARGET_OPENMP - + diff --git a/src/polybench/POLYBENCH_MVT-Seq.cpp b/src/polybench/POLYBENCH_MVT-Seq.cpp index fd8be8659..80847c383 100644 --- a/src/polybench/POLYBENCH_MVT-Seq.cpp +++ b/src/polybench/POLYBENCH_MVT-Seq.cpp @@ -13,7 +13,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace polybench { @@ -32,7 +32,7 @@ void POLYBENCH_MVT::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - for (Index_type i = 0; i < N; ++i ) { + for (Index_type i = 0; i < N; ++i ) { POLYBENCH_MVT_BODY1; for (Index_type j = 0; j < N; ++j ) { POLYBENCH_MVT_BODY2; @@ -40,7 +40,7 @@ void POLYBENCH_MVT::runSeqVariant(VariantID vid) POLYBENCH_MVT_BODY3; } - for (Index_type i = 0; i < N; ++i ) { + for (Index_type i = 0; i < N; ++i ) { POLYBENCH_MVT_BODY4; for (Index_type j = 0; j < N; ++j ) { POLYBENCH_MVT_BODY5; @@ -58,19 +58,19 @@ void POLYBENCH_MVT::runSeqVariant(VariantID vid) #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { - auto poly_mvt_base_lam2 = [=] (Index_type i, Index_type j, + auto poly_mvt_base_lam2 = [=] (Index_type i, Index_type j, Real_type &dot) { POLYBENCH_MVT_BODY2; }; - auto poly_mvt_base_lam3 = [=] (Index_type i, + auto poly_mvt_base_lam3 = [=] (Index_type i, Real_type &dot) { POLYBENCH_MVT_BODY3; }; - auto poly_mvt_base_lam5 = [=] (Index_type i, Index_type j, + auto poly_mvt_base_lam5 = [=] (Index_type i, Index_type j, Real_type &dot) { POLYBENCH_MVT_BODY5; }; - auto poly_mvt_base_lam6 = [=] (Index_type i, + auto poly_mvt_base_lam6 = [=] (Index_type i, Real_type &dot) { POLYBENCH_MVT_BODY6; }; @@ -125,9 +125,9 @@ void POLYBENCH_MVT::runSeqVariant(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< - RAJA::statement::For<0, RAJA::loop_exec, // i + RAJA::statement::For<0, RAJA::loop_exec, // i RAJA::statement::Lambda<0, RAJA::Params<0>>, - RAJA::statement::For<1, RAJA::loop_exec, // j + RAJA::statement::For<1, RAJA::loop_exec, // j RAJA::statement::Lambda<1, RAJA::Segs<0,1>, RAJA::Params<0>> >, RAJA::statement::Lambda<2, RAJA::Segs<0>, RAJA::Params<0>> @@ -139,26 +139,26 @@ void POLYBENCH_MVT::runSeqVariant(VariantID vid) RAJA::region( [=]() { - RAJA::kernel_param( + RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment{0, N}, RAJA::RangeSegment{0, N}), RAJA::tuple{0.0}, - + poly_mvt_lam1, poly_mvt_lam2, poly_mvt_lam3 - + ); - RAJA::kernel_param( + RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment{0, N}, RAJA::RangeSegment{0, N}), RAJA::tuple{0.0}, - + poly_mvt_lam4, - poly_mvt_lam5, + poly_mvt_lam5, poly_mvt_lam6 - + ); }); // end sequential region (for single-source code) @@ -171,7 +171,7 @@ void POLYBENCH_MVT::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n POLYBENCH_MVT : Unknown variant id = " << vid << std::endl; + getCout() << "\n 
POLYBENCH_MVT : Unknown variant id = " << vid << std::endl; } } diff --git a/src/stream/ADD-Cuda.cpp b/src/stream/ADD-Cuda.cpp index 4fc0a4f0e..8dab511cd 100644 --- a/src/stream/ADD-Cuda.cpp +++ b/src/stream/ADD-Cuda.cpp @@ -110,7 +110,7 @@ void ADD::runCudaVariant(VariantID vid) ADD_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n ADD : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n ADD : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/stream/ADD-Hip.cpp b/src/stream/ADD-Hip.cpp index 68b671a63..24f8dadf8 100644 --- a/src/stream/ADD-Hip.cpp +++ b/src/stream/ADD-Hip.cpp @@ -112,7 +112,7 @@ void ADD::runHipVariant(VariantID vid) ADD_DATA_TEARDOWN_HIP; } else { - std::cout << "\n ADD : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n ADD : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/stream/ADD-OMP.cpp b/src/stream/ADD-OMP.cpp index c73b5c5c9..137ce77a6 100644 --- a/src/stream/ADD-OMP.cpp +++ b/src/stream/ADD-OMP.cpp @@ -12,15 +12,15 @@ #include -namespace rajaperf +namespace rajaperf { namespace stream { - + void ADD::runOpenMPVariant(VariantID vid) { -#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -81,12 +81,12 @@ void ADD::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n ADD : Unknown variant id = " << vid << std::endl; + getCout() << "\n ADD : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/stream/ADD-OMPTarget.cpp b/src/stream/ADD-OMPTarget.cpp index d83bf1507..9c367b1b0 100644 --- a/src/stream/ADD-OMPTarget.cpp +++ b/src/stream/ADD-OMPTarget.cpp @@ -84,7 +84,7 @@ void ADD::runOpenMPTargetVariant(VariantID vid) ADD_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n ADD : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n ADD : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/stream/ADD-Seq.cpp b/src/stream/ADD-Seq.cpp index 8b670fc0e..89f989d95 100644 --- a/src/stream/ADD-Seq.cpp +++ b/src/stream/ADD-Seq.cpp @@ -12,12 +12,12 @@ #include -namespace rajaperf +namespace rajaperf { namespace stream { - + void ADD::runSeqVariant(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -61,7 +61,7 @@ void ADD::runSeqVariant(VariantID vid) stopTimer(); break; - } + } case RAJA_Seq : { @@ -79,7 +79,7 @@ void ADD::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n ADD : Unknown variant id = " << vid << std::endl; + getCout() << "\n ADD : Unknown variant id = " << vid << std::endl; } } diff --git a/src/stream/COPY-Cuda.cpp b/src/stream/COPY-Cuda.cpp index 62afb9ad8..cb3da418b 100644 --- a/src/stream/COPY-Cuda.cpp +++ b/src/stream/COPY-Cuda.cpp @@ -108,7 +108,7 @@ void COPY::runCudaVariant(VariantID vid) COPY_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n COPY : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n COPY : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/stream/COPY-Hip.cpp b/src/stream/COPY-Hip.cpp index 124f880fc..5541a2339 100644 --- a/src/stream/COPY-Hip.cpp +++ b/src/stream/COPY-Hip.cpp @@ -110,7 +110,7 @@ void COPY::runHipVariant(VariantID vid) COPY_DATA_TEARDOWN_HIP; } else { - std::cout << "\n COPY : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n COPY : Unknown Hip variant id = " << vid << std::endl; } } diff --git 
a/src/stream/COPY-OMP.cpp b/src/stream/COPY-OMP.cpp index 8c023ed3b..fe35d5288 100644 --- a/src/stream/COPY-OMP.cpp +++ b/src/stream/COPY-OMP.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace stream { @@ -81,12 +81,12 @@ void COPY::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n COPY : Unknown variant id = " << vid << std::endl; + getCout() << "\n COPY : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/stream/COPY-OMPTarget.cpp b/src/stream/COPY-OMPTarget.cpp index 8ba9d7ef3..010456eb0 100644 --- a/src/stream/COPY-OMPTarget.cpp +++ b/src/stream/COPY-OMPTarget.cpp @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace stream { @@ -83,7 +83,7 @@ void COPY::runOpenMPTargetVariant(VariantID vid) COPY_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n COPY : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n COPY : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/stream/COPY-Seq.cpp b/src/stream/COPY-Seq.cpp index 72bd9485d..89f9cae33 100644 --- a/src/stream/COPY-Seq.cpp +++ b/src/stream/COPY-Seq.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace stream { @@ -79,7 +79,7 @@ void COPY::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n COPY : Unknown variant id = " << vid << std::endl; + getCout() << "\n COPY : Unknown variant id = " << vid << std::endl; } } diff --git a/src/stream/DOT-Cuda.cpp b/src/stream/DOT-Cuda.cpp index ebeb2ca3a..f4bbb92cf 100644 --- a/src/stream/DOT-Cuda.cpp +++ b/src/stream/DOT-Cuda.cpp @@ -131,7 +131,7 @@ void DOT::runCudaVariant(VariantID vid) DOT_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n DOT : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n DOT : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/stream/DOT-Hip.cpp b/src/stream/DOT-Hip.cpp index 47d4ad9b5..45257f97a 100644 --- a/src/stream/DOT-Hip.cpp +++ b/src/stream/DOT-Hip.cpp @@ -131,7 +131,7 @@ void DOT::runHipVariant(VariantID vid) DOT_DATA_TEARDOWN_HIP; } else { - std::cout << "\n DOT : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n DOT : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/stream/DOT-OMP.cpp b/src/stream/DOT-OMP.cpp index 6d7c6b77d..24a29d9a0 100644 --- a/src/stream/DOT-OMP.cpp +++ b/src/stream/DOT-OMP.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace stream { @@ -95,12 +95,12 @@ void DOT::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n DOT : Unknown variant id = " << vid << std::endl; + getCout() << "\n DOT : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/stream/DOT-OMPTarget.cpp b/src/stream/DOT-OMPTarget.cpp index e74c50acc..1b4cb85cf 100644 --- a/src/stream/DOT-OMPTarget.cpp +++ b/src/stream/DOT-OMPTarget.cpp @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace stream { @@ -90,7 +90,7 @@ void DOT::runOpenMPTargetVariant(VariantID vid) DOT_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n DOT : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n DOT : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/stream/DOT-Seq.cpp b/src/stream/DOT-Seq.cpp index 5baf26592..81cff4c1b 100644 --- a/src/stream/DOT-Seq.cpp +++ b/src/stream/DOT-Seq.cpp @@ -12,7 +12,7 @@ #include -namespace 
rajaperf +namespace rajaperf { namespace stream { @@ -93,7 +93,7 @@ void DOT::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n DOT : Unknown variant id = " << vid << std::endl; + getCout() << "\n DOT : Unknown variant id = " << vid << std::endl; } } diff --git a/src/stream/MUL-Cuda.cpp b/src/stream/MUL-Cuda.cpp index 01ccf4956..2c38a5c7f 100644 --- a/src/stream/MUL-Cuda.cpp +++ b/src/stream/MUL-Cuda.cpp @@ -107,7 +107,7 @@ void MUL::runCudaVariant(VariantID vid) MUL_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n MUL : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n MUL : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/stream/MUL-Hip.cpp b/src/stream/MUL-Hip.cpp index bdb5ca0eb..c2b65d8a0 100644 --- a/src/stream/MUL-Hip.cpp +++ b/src/stream/MUL-Hip.cpp @@ -109,7 +109,7 @@ void MUL::runHipVariant(VariantID vid) MUL_DATA_TEARDOWN_HIP; } else { - std::cout << "\n MUL : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n MUL : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/stream/MUL-OMP.cpp b/src/stream/MUL-OMP.cpp index d5d552f8f..0b7f3cd85 100644 --- a/src/stream/MUL-OMP.cpp +++ b/src/stream/MUL-OMP.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace stream { @@ -81,12 +81,12 @@ void MUL::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n MUL : Unknown variant id = " << vid << std::endl; + getCout() << "\n MUL : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/stream/MUL-OMPTarget.cpp b/src/stream/MUL-OMPTarget.cpp index 53d018d64..8e5f52b35 100644 --- a/src/stream/MUL-OMPTarget.cpp +++ b/src/stream/MUL-OMPTarget.cpp @@ -16,7 +16,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace stream { @@ -82,7 +82,7 @@ void MUL::runOpenMPTargetVariant(VariantID vid) MUL_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n MUL : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n MUL : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/stream/MUL-Seq.cpp b/src/stream/MUL-Seq.cpp index bfb1154ce..69b548e69 100644 --- a/src/stream/MUL-Seq.cpp +++ b/src/stream/MUL-Seq.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace stream { @@ -79,7 +79,7 @@ void MUL::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n MUL : Unknown variant id = " << vid << std::endl; + getCout() << "\n MUL : Unknown variant id = " << vid << std::endl; } } diff --git a/src/stream/TRIAD-Cuda.cpp b/src/stream/TRIAD-Cuda.cpp index 19175d80d..d0908d6ff 100644 --- a/src/stream/TRIAD-Cuda.cpp +++ b/src/stream/TRIAD-Cuda.cpp @@ -110,7 +110,7 @@ void TRIAD::runCudaVariant(VariantID vid) TRIAD_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n TRIAD : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n TRIAD : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/stream/TRIAD-Hip.cpp b/src/stream/TRIAD-Hip.cpp index deaf20d0f..21ed4478c 100644 --- a/src/stream/TRIAD-Hip.cpp +++ b/src/stream/TRIAD-Hip.cpp @@ -112,7 +112,7 @@ void TRIAD::runHipVariant(VariantID vid) TRIAD_DATA_TEARDOWN_HIP; } else { - std::cout << "\n TRIAD : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n TRIAD : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/stream/TRIAD-OMP.cpp b/src/stream/TRIAD-OMP.cpp index 1d2060ccf..9ce330c00 100644 --- a/src/stream/TRIAD-OMP.cpp +++ 
b/src/stream/TRIAD-OMP.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace stream { @@ -81,12 +81,12 @@ void TRIAD::runOpenMPVariant(VariantID vid) } default : { - std::cout << "\n TRIAD : Unknown variant id = " << vid << std::endl; + getCout() << "\n TRIAD : Unknown variant id = " << vid << std::endl; } } -#else +#else RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/stream/TRIAD-OMPTarget.cpp b/src/stream/TRIAD-OMPTarget.cpp index 9d0d67145..404444366 100644 --- a/src/stream/TRIAD-OMPTarget.cpp +++ b/src/stream/TRIAD-OMPTarget.cpp @@ -84,7 +84,7 @@ void TRIAD::runOpenMPTargetVariant(VariantID vid) TRIAD_DATA_TEARDOWN_OMP_TARGET; } else { - std::cout << "\n TRIAD : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n TRIAD : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/stream/TRIAD-Seq.cpp b/src/stream/TRIAD-Seq.cpp index dfbac0188..7d7800556 100644 --- a/src/stream/TRIAD-Seq.cpp +++ b/src/stream/TRIAD-Seq.cpp @@ -12,7 +12,7 @@ #include -namespace rajaperf +namespace rajaperf { namespace stream { @@ -79,7 +79,7 @@ void TRIAD::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n TRIAD : Unknown variant id = " << vid << std::endl; + getCout() << "\n TRIAD : Unknown variant id = " << vid << std::endl; } } From a46c07024102805b1f83a66cbdbf68170d25702a Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 2 Nov 2021 17:38:02 -0700 Subject: [PATCH 164/392] Handle file output Not only rank 0 writes files --- src/common/Executor.cpp | 77 ++++++++++++++++++++--------------------- src/common/Executor.hpp | 10 ++++-- 2 files changed, 45 insertions(+), 42 deletions(-) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index 7e3893d98..5d5b724e9 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -12,6 +12,10 @@ #include "common/KernelBase.hpp" #include "common/OutputUtils.hpp" +#ifdef RAJA_PERFSUITE_ENABLE_MPI +#include +#endif + // Warmup kernels to run first to help reduce startup overheads in timings #include "basic/DAXPY.hpp" #include "basic/REDUCE3_INT.hpp" @@ -789,41 +793,52 @@ void Executor::outputRunData() } out_fprefix = "./" + run_params.getOutputFilePrefix(); - string filename = out_fprefix + "-timing.csv"; - writeCSVReport(filename, CSVRepMode::Timing, 6 /* prec */); + unique_ptr file = openOutputFile(out_fprefix + "-timing.csv"); + writeCSVReport(*file, CSVRepMode::Timing, 6 /* prec */); if ( haveReferenceVariant() ) { - filename = out_fprefix + "-speedup.csv"; - writeCSVReport(filename, CSVRepMode::Speedup, 3 /* prec */); + file = openOutputFile(out_fprefix + "-speedup.csv"); + writeCSVReport(*file, CSVRepMode::Speedup, 3 /* prec */); } - filename = out_fprefix + "-checksum.txt"; - writeChecksumReport(filename); - - filename = out_fprefix + "-fom.csv"; - writeFOMReport(filename); + file = openOutputFile(out_fprefix + "-checksum.txt"); + writeChecksumReport(*file); - filename = out_fprefix + "-kernels.csv"; - ofstream file(filename.c_str(), ios::out | ios::trunc); - if ( !file ) { - cout << " ERROR: Can't open output file " << filename << endl; + { + vector fom_groups; + getFOMGroups(fom_groups); + if (!fom_groups.empty() ) { + file = openOutputFile(out_fprefix + "-fom.csv"); + writeFOMReport(*file, fom_groups); + } } - if ( file ) { + file = openOutputFile(out_fprefix + "-kernels.csv"); + if ( *file ) { bool to_file = true; - writeKernelInfoSummary(file, to_file); + writeKernelInfoSummary(*file, to_file); } } - -void 
Executor::writeCSVReport(const string& filename, CSVRepMode mode, - size_t prec) +unique_ptr Executor::openOutputFile(const string& filename) const { - ofstream file(filename.c_str(), ios::out | ios::trunc); - if ( !file ) { - cout << " ERROR: Can't open output file " << filename << endl; + int rank = 0; +#ifdef RAJA_PERFSUITE_ENABLE_MPI + MPI_Comm_rank(MPI_COMM_WORLD, &rank); +#endif + if (rank == 0) { + unique_ptr file(new ofstream(filename.c_str(), ios::out | ios::trunc)); + if ( !*file ) { + getCout() << " ERROR: Can't open output file " << filename << endl; + } + return file; } + return unique_ptr(makeNullStream()); +} +void Executor::writeCSVReport(ostream& file, CSVRepMode mode, + size_t prec) +{ if ( file ) { // @@ -897,19 +912,8 @@ void Executor::writeCSVReport(const string& filename, CSVRepMode mode, } -void Executor::writeFOMReport(const string& filename) +void Executor::writeFOMReport(ostream& file, vector& fom_groups) { - vector fom_groups; - getFOMGroups(fom_groups); - if (fom_groups.empty() ) { - return; - } - - ofstream file(filename.c_str(), ios::out | ios::trunc); - if ( !file ) { - cout << " ERROR: Can't open output file " << filename << endl; - } - if ( file ) { // @@ -1129,13 +1133,8 @@ void Executor::writeFOMReport(const string& filename) } -void Executor::writeChecksumReport(const string& filename) +void Executor::writeChecksumReport(ostream& file) { - ofstream file(filename.c_str(), ios::out | ios::trunc); - if ( !file ) { - cout << " ERROR: Can't open output file " << filename << endl; - } - if ( file ) { // diff --git a/src/common/Executor.hpp b/src/common/Executor.hpp index 32e978f9a..a2b041590 100644 --- a/src/common/Executor.hpp +++ b/src/common/Executor.hpp @@ -13,6 +13,8 @@ #include "common/RunParams.hpp" #include +#include +#include #include #include @@ -58,19 +60,21 @@ class Executor std::vector variants; }; + std::unique_ptr openOutputFile(const std::string& filename) const; + bool haveReferenceVariant() { return reference_vid < NumVariants; } void writeKernelInfoSummary(std::ostream& str, bool to_file) const; - void writeCSVReport(const std::string& filename, CSVRepMode mode, + void writeCSVReport(std::ostream& file, CSVRepMode mode, size_t prec); std::string getReportTitle(CSVRepMode mode); long double getReportDataEntry(CSVRepMode mode, KernelBase* kern, VariantID vid); - void writeChecksumReport(const std::string& filename); + void writeChecksumReport(std::ostream& file); - void writeFOMReport(const std::string& filename); + void writeFOMReport(std::ostream& file, std::vector& fom_groups); void getFOMGroups(std::vector& fom_groups); RunParams run_params; From 9cd1b0f6f1f89d13ed136665f815f1c6db6b6ae2 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 2 Nov 2021 17:39:39 -0700 Subject: [PATCH 165/392] Handle timing and synchronization Start simply by adding an mpi barrier before starting the timer and before stopping the timer. This ensures the ranks start in sync and all are done before the final time is taken. 
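For illustration, a minimal sketch of the barrier-around-timer pattern this change applies (the function name timedRepetitions and the omitted kernel body are placeholders, not part of the patch; the actual edit is to KernelBase::startTimer/stopTimer in the diff below):

#include <mpi.h>

// Sketch only: assumes MPI_Init has already been called and that
// RAJA_PERFSUITE_ENABLE_MPI is defined for MPI-enabled builds.
void timedRepetitions()
{
#ifdef RAJA_PERFSUITE_ENABLE_MPI
  MPI_Barrier(MPI_COMM_WORLD);  // all ranks enter the timed region together
#endif
  // timer.start();
  // ... run kernel repetitions ...
#ifdef RAJA_PERFSUITE_ENABLE_MPI
  MPI_Barrier(MPI_COMM_WORLD);  // no rank stops the clock until all are done
#endif
  // timer.stop();  // reported time therefore reflects the slowest rank
}
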
--- src/common/KernelBase.hpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 8c2dfb799..6f2e40338 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -15,6 +15,9 @@ #include "common/RunParams.hpp" #include "RAJA/util/Timer.hpp" +#if defined(RAJA_PERFSUITE_ENABLE_MPI) +#include +#endif #if defined(RAJA_ENABLE_CUDA) #include "RAJA/policy/cuda/raja_cudaerrchk.hpp" #endif @@ -118,12 +121,18 @@ class KernelBase void startTimer() { synchronize(); +#ifdef RAJA_PERFSUITE_ENABLE_MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif timer.start(); } void stopTimer() { synchronize(); +#ifdef RAJA_PERFSUITE_ENABLE_MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif timer.stop(); recordExecTime(); } From 2dd503f52dc59d08d611c628646ae19eafd8d79c Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 3 Nov 2021 16:10:45 -0700 Subject: [PATCH 166/392] Remove extra ifdef --- src/RAJAPerfSuiteDriver.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/RAJAPerfSuiteDriver.cpp b/src/RAJAPerfSuiteDriver.cpp index 70cbd604c..b79670a4e 100644 --- a/src/RAJAPerfSuiteDriver.cpp +++ b/src/RAJAPerfSuiteDriver.cpp @@ -19,9 +19,7 @@ int main( int argc, char** argv ) { #ifdef RAJA_PERFSUITE_ENABLE_MPI MPI_Init(&argc, &argv); -#endif -#ifdef RAJA_PERFSUITE_ENABLE_MPI int num_ranks; MPI_Comm_size(MPI_COMM_WORLD, &num_ranks); rajaperf::getCout() << "\n\nRunning with " << num_ranks << " MPI ranks..." << std::endl; From 58f12a7cb5099b39e36024b51796cffca57b719b Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 3 Nov 2021 16:18:21 -0700 Subject: [PATCH 167/392] Add MPI num ranks to Kernel Info file --- src/common/Executor.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index 5d5b724e9..82b751e22 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -605,6 +605,15 @@ void Executor::reportRunSummary(ostream& str) const void Executor::writeKernelInfoSummary(ostream& str, bool to_file) const { + if ( to_file ) { +#ifdef RAJA_PERFSUITE_ENABLE_MPI + int num_ranks; + MPI_Comm_size(MPI_COMM_WORLD, &num_ranks); + str << "Kernels run on " << num_ranks << " MPI ranks" << endl; +#else + str << "Kernels run without MPI" << endl; +#endif + } // // Set up column headers and column widths for kernel summary output. From 7a70e374e99cb28f584dba2f943c6aab10a05358 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 4 Nov 2021 15:25:24 -0700 Subject: [PATCH 168/392] Output Max and stddev of checksum diff with MPI In the ChecksumReport get more stats about the checksum including the max and stddev of the error across ranks. --- src/common/Executor.cpp | 110 +++++++++++++++++++++++++++++++++++++--- src/common/RPTypes.hpp | 6 ++- 2 files changed, 106 insertions(+), 10 deletions(-) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index 82b751e22..681e2b18e 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -1146,6 +1146,11 @@ void Executor::writeChecksumReport(ostream& file) { if ( file ) { +#ifdef RAJA_PERFSUITE_ENABLE_MPI + int num_ranks; + MPI_Comm_size(MPI_COMM_WORLD, &num_ranks); +#endif + // // Set basic table formatting parameters. // @@ -1172,7 +1177,11 @@ void Executor::writeChecksumReport(ostream& file) // Print title. 
// file << equal_line << endl; - file << "Checksum Report " << endl; + file << "Checksum Report "; +#ifdef RAJA_PERFSUITE_ENABLE_MPI + file << "for " << num_ranks << " MPI ranks "; +#endif + file << endl; file << equal_line << endl; // @@ -1181,10 +1190,22 @@ void Executor::writeChecksumReport(ostream& file) file < checksums(variant_ids.size(), 0.0); + std::vector checksums_diff(variant_ids.size(), 0.0); for (size_t iv = 0; iv < variant_ids.size(); ++iv) { VariantID vid = variant_ids[iv]; if ( kern->wasVariantRun(vid) ) { - Checksum_type vcheck_sum = kern->getChecksum(vid); - Checksum_type diff = cksum_ref - kern->getChecksum(vid); + checksums[iv] = kern->getChecksum(vid); + checksums_diff[iv] = cksum_ref - kern->getChecksum(vid); + } + } + +#ifdef RAJA_PERFSUITE_ENABLE_MPI + if (Checksum_MPI_type == MPI_DATATYPE_NULL) { + getCout() << "Checksum_MPI_type is invalid" << endl; + } + + // get stats for checksums + std::vector checksums_sum(variant_ids.size(), 0.0); + MPI_Allreduce(checksums.data(), checksums_sum.data(), variant_ids.size(), + Checksum_MPI_type, MPI_SUM, MPI_COMM_WORLD); + + std::vector checksums_avg(variant_ids.size(), 0.0); + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + checksums_avg[iv] = checksums_sum[iv] / num_ranks; + } + + // get stats for checksums_abs_diff + std::vector checksums_abs_diff(variant_ids.size(), 0.0); + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + checksums_abs_diff[iv] = std::abs(checksums_diff[iv]); + } + std::vector checksums_abs_diff_min(variant_ids.size(), 0.0); + std::vector checksums_abs_diff_max(variant_ids.size(), 0.0); + std::vector checksums_abs_diff_sum(variant_ids.size(), 0.0); + + MPI_Allreduce(checksums_abs_diff.data(), checksums_abs_diff_min.data(), variant_ids.size(), + Checksum_MPI_type, MPI_MIN, MPI_COMM_WORLD); + MPI_Allreduce(checksums_abs_diff.data(), checksums_abs_diff_max.data(), variant_ids.size(), + Checksum_MPI_type, MPI_MAX, MPI_COMM_WORLD); + MPI_Allreduce(checksums_abs_diff.data(), checksums_abs_diff_sum.data(), variant_ids.size(), + Checksum_MPI_type, MPI_SUM, MPI_COMM_WORLD); + + std::vector checksums_abs_diff_avg(variant_ids.size(), 0.0); + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + checksums_abs_diff_avg[iv] = checksums_abs_diff_sum[iv] / num_ranks; + } + + std::vector checksums_abs_diff_diff2avg2(variant_ids.size(), 0.0); + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + checksums_abs_diff_diff2avg2[iv] = (checksums_abs_diff[iv] - checksums_abs_diff_avg[iv]) * + (checksums_abs_diff[iv] - checksums_abs_diff_avg[iv]) ; + } + + std::vector checksums_abs_diff_stddev(variant_ids.size(), 0.0); + MPI_Allreduce(checksums_abs_diff_diff2avg2.data(), checksums_abs_diff_stddev.data(), variant_ids.size(), + Checksum_MPI_type, MPI_SUM, MPI_COMM_WORLD); + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + checksums_abs_diff_stddev[iv] = std::sqrt(checksums_abs_diff_stddev[iv] / num_ranks) ; + } + +#endif + + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + VariantID vid = variant_ids[iv]; + + if ( kern->wasVariantRun(vid) ) { file < Date: Thu, 18 Nov 2021 09:14:49 -0800 Subject: [PATCH 169/392] Update CMakeLists.txt Co-authored-by: David Beckingsale --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c50ef5c38..8d3f1f69c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -81,7 +81,7 @@ set(RAJA_PERFSUITE_VERSION_PATCHLEVEL 0) set(RAJA_PERFSUITE_DEPENDS RAJA) -if (ENABLE_MPI) +if (RAJA_PERFSUITE_ENABLE_MPI) list(APPEND 
RAJA_PERFSUITE_DEPENDS mpi) endif() if (ENABLE_OPENMP) From ef8ffd037953a2b130a0375694431c38a0fe445a Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 19 Nov 2021 10:27:16 -0800 Subject: [PATCH 170/392] Update Readme with MPI discussion --- README.md | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4231202d2..e0ac95b61 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,14 @@ the "Stream" group contains kernels from the Babel Stream benchmark, the "Apps" group contains kernels extracted from real scientific computing applications, and so forth. +The suite can be run as a single process or with multiple processes when +configured with MPI support. Running with MPI in the same configuration used +by an hpc app allows the suite to gather performance data that is more relevant +for that hpc app than performance data gathered running single process. For +example running sequentially with one MPI rank per core vs running sequentially +with a single process yields different performance results on most multi-core +CPUs. + * * * Table of Contents @@ -122,6 +130,28 @@ on the command line if you run CMake directly or edit the script you are running to do this. Then, when the build completes, you can type `make test` to run the RAJA tests. +## Building with MPI + +Some of the provided configurations will build the Performance Suite with +MPI support enabled. For example, + +``` +> ./scripts/blueos_spectrum_nvcc_clang.sh rolling-release 10.2.89 sm_70 10.0.1 +> cd build_lc_blueos-spectrumrolling-release-nvcc10.2.89-sm_70-clang10.0.1 +> make -j +``` + +In general MPI support can be enabled by passing the `-DENABLE_MPI=On` option +to CMake and providing a mpi compiler wrapper via the +`-DMPI_CXX_COMPILER=/path/to/mpic++` option to CMake in addition to other CMake +options. For example, + +``` +> mkdir my-mpi-build +> cd my-mpi-build +> cmake -DENABLE_MPI=On -DMPI_CXX_COMPILER=/path/to/mpic++ ../ +> make -j +``` * * * @@ -169,6 +199,22 @@ input that the code does not know how to parse. Ill-formed input will be noted in the summary output. Hopefully, this will make it easy for users to correct erroneous usage, such as mis-spelled option names. +## Running with MPI + +Running the Suite with MPI is as simple as running any other MPI application. +For example, + +``` +> srun -n 2 ./bin/raja-perf.exe +``` +the entire Suite (all kernels and variants) will execute in their default +configurations on each of the 2 ranks. The kernel information output shows how +each kernel is run on each rank. The total problem size across all MPI ranks +can be calculated by multiplying the number of MPI ranks by the problem +size in the kernel information. Timing is reported on rank 0 and is gathered +by doing an MPI barrier, starting the timer, running the kernel repetitions, +doing an MPI barrier, and then stopping the timer. + ## Important note * The OpenMP target offload variants of the kernels in the Suite are a @@ -206,7 +252,7 @@ All output files are text files. Other than the checksum file, all are in ## Kernel information definitions -Information about kernels that are run is located in the ''RAJAPerf-kernels.csv'' file, which includes the following: +Information about kernels that are run is located in the ''RAJAPerf-kernels.csv'' file. 
This information is for each process individually, so when running with MPI the total problem size aggregated across all ranks is the number of ranks times the problem size shown in the kernel information. Kernel information includes the following: 1. Kernel name -- Format is group name followed by kernel name, separated by an underscore. 2. Feature -- RAJA feature(s) exercised in RAJA variants of kernel. @@ -219,6 +265,9 @@ Information about kernels that are run is located in the ''RAJAPerf-kernels.csv' ### Notes about 'problem size' + * Problem size is always ouput per process/MPI rank. To get the total problem + size across all ranks when running with MPI multiply the problem size by + the number of MPI ranks. * The Suite uses three notions of problem size for each kernel: 'default', 'target', and 'actual'. Default is the 'default' problem size defined for a kernel and the size that will be run if no runtime options are @@ -605,7 +654,7 @@ void Foo::runSeqVariant(VariantID vid) #endif // RUN_RAJA_SEQ default : { - std::cout << "\n : Unknown variant id = " << vid << std::endl; + getCout() << "\n : Unknown variant id = " << vid << std::endl; } } From a9504f178a2cbe08160abbecac18dbc5b807e8c7 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 15 Dec 2021 11:36:40 -0800 Subject: [PATCH 171/392] initial commit for improved threading --- src/apps/DIFFUSION3DPA-Seq.cpp | 376 +++++++++++++++++++++------------ src/apps/DIFFUSION3DPA.hpp | 144 ++++++++++++- 2 files changed, 388 insertions(+), 132 deletions(-) diff --git a/src/apps/DIFFUSION3DPA-Seq.cpp b/src/apps/DIFFUSION3DPA-Seq.cpp index c040bfc04..f2aa2a5e9 100644 --- a/src/apps/DIFFUSION3DPA-Seq.cpp +++ b/src/apps/DIFFUSION3DPA-Seq.cpp @@ -18,6 +18,7 @@ namespace rajaperf { namespace apps { +#define MFEM_SYNC_THREAD void DIFFUSION3DPA::runSeqVariant(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -35,57 +36,103 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { DIFFUSION3DPA_0_CPU; - CPU_FOREACH(dy, y, DPA_D1D) { - CPU_FOREACH(dx, x, DPA_D1D) { - DIFFUSION3DPA_1; - } - CPU_FOREACH(qx, x, DPA_Q1D) { - DIFFUSION3DPA_2; - } - } - - - CPU_FOREACH(dy, y, DPA_D1D) { - CPU_FOREACH(qx, x, DPA_Q1D) { - DIFFUSION3DPA_3; - } - } - - CPU_FOREACH(qy, y, DPA_Q1D) { - CPU_FOREACH(qx, x, DPA_Q1D) { - DIFFUSION3DPA_4; - } - } - - CPU_FOREACH(qy, y, DPA_Q1D) { - CPU_FOREACH(qx, x, DPA_Q1D) { - DIFFUSION3DPA_5; - } - } - - CPU_FOREACH(d, y, DPA_D1D) { - CPU_FOREACH(q, x, DPA_Q1D) { - DIFFUSION3DPA_6; - } - } - - CPU_FOREACH(qy, y, DPA_Q1D) { - CPU_FOREACH(dx, x, DPA_D1D) { - DIFFUSION3DPA_7; - } - } - - CPU_FOREACH(dy, y, DPA_D1D) { - CPU_FOREACH(dx, x, DPA_D1D) { - DIFFUSION3DPA_8; - } - } - - CPU_FOREACH(dy, y, DPA_D1D) { - CPU_FOREACH(dx, x, DPA_D1D) { - DIFFUSION3DPA_9; + CPU_FOREACH(dz,z,DPA_D1D) + { + CPU_FOREACH(dy,y,DPA_D1D) + { + CPU_FOREACH(dx,x,DPA_D1D) + { + NEW_DIFFUSION3DPA_1; + } } } + //if (MFEM_THREAD_ID(z) == 0) + //{ + CPU_FOREACH(dy,y,DPA_D1D) + { + CPU_FOREACH(qx,x,DPA_Q1D) + { + NEW_DIFFUSION3DPA_2; + } + } + //} + MFEM_SYNC_THREAD; + CPU_FOREACH(dz,z,DPA_D1D) + { + CPU_FOREACH(dy,y,DPA_D1D) + { + CPU_FOREACH(qx,x,DPA_Q1D) + { + NEW_DIFFUSION3DPA_3; + } + } + } + MFEM_SYNC_THREAD; + CPU_FOREACH(dz,z,DPA_D1D) + { + CPU_FOREACH(qy,y,DPA_Q1D) + { + CPU_FOREACH(qx,x,DPA_Q1D) + { + NEW_DIFFUSION3DPA_4; + } + } + } + MFEM_SYNC_THREAD; + CPU_FOREACH(qz,z,DPA_Q1D) + { + CPU_FOREACH(qy,y,DPA_Q1D) + { + CPU_FOREACH(qx,x,DPA_Q1D) + { + NEW_DIFFUSION3DPA_5; + } + } + } + MFEM_SYNC_THREAD; + //if (MFEM_THREAD_ID(z) == 0) + //{ + 
CPU_FOREACH(d,y,DPA_D1D) + { + CPU_FOREACH(q,x,DPA_Q1D) + { + NEW_DIFFUSION3DPA_6; + } + } + //} + MFEM_SYNC_THREAD; + CPU_FOREACH(qz,z,DPA_Q1D) + { + CPU_FOREACH(qy,y,DPA_Q1D) + { + CPU_FOREACH(dx,x,DPA_D1D) + { + NEW_DIFFUSION3DPA_7; + } + } + } + MFEM_SYNC_THREAD; + CPU_FOREACH(qz,z,DPA_Q1D) + { + CPU_FOREACH(dy,y,DPA_D1D) + { + CPU_FOREACH(dx,x,DPA_D1D) + { + NEW_DIFFUSION3DPA_8; + } + } + } + MFEM_SYNC_THREAD; + CPU_FOREACH(dz,z,DPA_D1D) + { + CPU_FOREACH(dy,y,DPA_D1D) + { + CPU_FOREACH(dx,x,DPA_D1D) + { + NEW_DIFFUSION3DPA_9; + } + } + } } // element loop } @@ -126,6 +173,13 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { #endif >; + using inner_z = RAJA::expt::LoopPolicy; + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -139,103 +193,165 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { DIFFUSION3DPA_0_CPU; - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int dx) { - DIFFUSION3DPA_1; - } - ); // RAJA::expt::loop - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](int qx) { - DIFFUSION3DPA_2; - } - ); // RAJA::expt::loop - } // lambda (dy) - ); //RAJA::expt::loop + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { - ctx.teamSync(); + NEW_DIFFUSION3DPA_1; - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](int qx) { - DIFFUSION3DPA_3; - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop + } // lambda (dx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](int qx) { - DIFFUSION3DPA_4; - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 0), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { - ctx.teamSync(); + NEW_DIFFUSION3DPA_2; - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](int qx) { - DIFFUSION3DPA_5; - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop + } // lambda (qx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int d) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](int q) { - DIFFUSION3DPA_6; - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + NEW_DIFFUSION3DPA_3; + + } // lambda (qx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int dx) { - DIFFUSION3DPA_7; - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 
DPA_D1D), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + NEW_DIFFUSION3DPA_4; + + } // lambda (qx) + ); // RAJA::expt::loop + } // lambda (qy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int dx) { - DIFFUSION3DPA_8; - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop - - ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int dx) { - DIFFUSION3DPA_9; - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + NEW_DIFFUSION3DPA_5; + + } // lambda (qx) + ); // RAJA::expt::loop + } // lambda (qy) + ); //RAJA::expt::loop + } // lambda (qz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 0), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int d) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int q) { + + NEW_DIFFUSION3DPA_6; + + } // lambda (q) + ); // RAJA::expt::loop + } // lambda (d) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + NEW_DIFFUSION3DPA_7; + + } // lambda (dx) + ); // RAJA::expt::loop + } // lambda (qy) + ); //RAJA::expt::loop + } // lambda (qz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + NEW_DIFFUSION3DPA_8; + + } // lambda (dx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (qz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + NEW_DIFFUSION3DPA_9; + + } // lambda (dx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop } // lambda (e) ); // RAJA::expt::loop diff --git a/src/apps/DIFFUSION3DPA.hpp b/src/apps/DIFFUSION3DPA.hpp index b7a2b1e1d..85b592f5f 100644 --- a/src/apps/DIFFUSION3DPA.hpp +++ b/src/apps/DIFFUSION3DPA.hpp @@ -364,7 +364,6 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) double (*QDD1)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+1); \ double (*QDD2)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2); - #define DIFFUSION3DPA_0_CPU \ constexpr int MQ1 = DPA_Q1D; \ constexpr int MD1 = DPA_D1D; \ @@ -399,6 +398,9 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) s_X[dz][dy][dx] = dpaX_(dx,dy,dz,e); \ } +#define NEW_DIFFUSION3DPA_1 \ + s_X[dz][dy][dx] = dpaX_(dx,dy,dz,e); + #define DIFFUSION3DPA_2 \ const int i = qi(qx,dy,DPA_Q1D); \ const int j = dj(qx,dy,DPA_D1D); 
\ @@ -407,6 +409,14 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) B[i][j] = b(qx,dy); \ G[k][l] = g(qx,dy) * sign(qx,dy); +#define NEW_DIFFUSION3DPA_2 \ + const int i = qi(qx,dy,DPA_Q1D); \ + const int j = dj(qx,dy,DPA_D1D); \ + const int k = qk(qx,dy,DPA_Q1D); \ + const int l = dl(qx,dy,DPA_D1D); \ + B[i][j] = b(qx,dy); \ + G[k][l] = g(qx,dy) * sign(qx,dy); \ + #define DIFFUSION3DPA_3 \ double u[DPA_D1D], v[DPA_D1D]; \ RAJA_UNROLL(MD1) \ @@ -434,6 +444,24 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) DDQ1[dz][dy][qx] = v[dz]; \ } +#define NEW_DIFFUSION3DPA_3 \ + double u = 0.0, v = 0.0; \ + RAJA_UNROLL(MD1) \ + for (int dx = 0; dx < DPA_D1D; ++dx) \ + { \ + const int i = qi(qx,dx,DPA_Q1D); \ + const int j = dj(qx,dx,DPA_D1D); \ + const int k = qk(qx,dx,DPA_Q1D); \ + const int l = dl(qx,dx,DPA_D1D); \ + const double s = sign(qx,dx); \ + const double coords = s_X[dz][dy][dx]; \ + u += coords * B[i][j]; \ + v += coords * G[k][l] * s; \ + } \ + DDQ0[dz][dy][qx] = u; \ + DDQ1[dz][dy][qx] = v; + + #define DIFFUSION3DPA_4 \ double u[DPA_D1D], v[DPA_D1D], w[DPA_D1D]; \ RAJA_UNROLL(MD1) \ @@ -462,6 +490,24 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) DQQ2[dz][qy][qx] = w[dz]; \ } +#define NEW_DIFFUSION3DPA_4 \ + double u = 0.0, v = 0.0, w = 0.0; \ + RAJA_UNROLL(MD1) \ + for (int dy = 0; dy < DPA_D1D; ++dy) \ + { \ + const int i = qi(qy,dy,DPA_Q1D); \ + const int j = dj(qy,dy,DPA_D1D); \ + const int k = qk(qy,dy,DPA_Q1D); \ + const int l = dl(qy,dy,DPA_D1D); \ + const double s = sign(qy,dy); \ + u += DDQ1[dz][dy][qx] * B[i][j]; \ + v += DDQ0[dz][dy][qx] * G[k][l] * s; \ + w += DDQ0[dz][dy][qx] * B[i][j]; \ + } \ + DQQ0[dz][qy][qx] = u; \ + DQQ1[dz][qy][qx] = v; \ + DQQ2[dz][qy][qx] = w; + #define DIFFUSION3DPA_5 \ double u[DPA_Q1D], v[DPA_Q1D], w[DPA_Q1D]; \ RAJA_UNROLL(MQ1) \ @@ -502,6 +548,37 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) QQQ2[qz][qy][qx] = (O31*gX) + (O32*gY) + (O33*gZ); \ } +#define NEW_DIFFUSION3DPA_5 \ + double u = 0.0, v = 0.0, w = 0.0; \ + RAJA_UNROLL(MD1) \ + for (int dz = 0; dz < DPA_D1D; ++dz) \ + { \ + const int i = qi(qz,dz,DPA_Q1D); \ + const int j = dj(qz,dz,DPA_D1D); \ + const int k = qk(qz,dz,DPA_Q1D); \ + const int l = dl(qz,dz,DPA_D1D); \ + const double s = sign(qz,dz); \ + u += DQQ0[dz][qy][qx] * B[i][j]; \ + v += DQQ1[dz][qy][qx] * B[i][j]; \ + w += DQQ2[dz][qy][qx] * G[k][l] * s; \ + } \ + const double O11 = d(qx,qy,qz,0,e); \ + const double O12 = d(qx,qy,qz,1,e); \ + const double O13 = d(qx,qy,qz,2,e); \ + const double O21 = symmetric ? O12 : d(qx,qy,qz,3,e); \ + const double O22 = symmetric ? d(qx,qy,qz,3,e) : d(qx,qy,qz,4,e); \ + const double O23 = symmetric ? d(qx,qy,qz,4,e) : d(qx,qy,qz,5,e); \ + const double O31 = symmetric ? O13 : d(qx,qy,qz,6,e); \ + const double O32 = symmetric ? O23 : d(qx,qy,qz,7,e); \ + const double O33 = symmetric ? 
d(qx,qy,qz,5,e) : d(qx,qy,qz,8,e); \ + const double gX = u; \ + const double gY = v; \ + const double gZ = w; \ + QQQ0[qz][qy][qx] = (O11*gX) + (O12*gY) + (O13*gZ); \ + QQQ1[qz][qy][qx] = (O21*gX) + (O22*gY) + (O23*gZ); \ + QQQ2[qz][qy][qx] = (O31*gX) + (O32*gY) + (O33*gZ); + + #define DIFFUSION3DPA_6 \ const int i = qi(q,d,DPA_Q1D); \ const int j = dj(q,d,DPA_D1D); \ @@ -510,6 +587,15 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) Bt[j][i] = b(q,d); \ Gt[l][k] = g(q,d) * sign(q,d); + +#define NEW_DIFFUSION3DPA_6 \ + const int i = qi(q,d,DPA_Q1D); \ + const int j = dj(q,d,DPA_D1D); \ + const int k = qk(q,d,DPA_Q1D); \ + const int l = dl(q,d,DPA_D1D); \ + Bt[j][i] = b(q,d); \ + Gt[l][k] = g(q,d) * sign(q,d); + #define DIFFUSION3DPA_7 \ double u[DPA_Q1D], v[DPA_Q1D], w[DPA_Q1D]; \ RAJA_UNROLL(MQ1) \ @@ -538,6 +624,24 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) QQD2[qz][qy][dx] = w[qz]; \ } +#define NEW_DIFFUSION3DPA_7 \ + double u = 0.0, v = 0.0, w = 0.0; \ + RAJA_UNROLL(MQ1) \ + for (int qx = 0; qx < DPA_Q1D; ++qx) \ + { \ + const int i = qi(qx,dx,DPA_Q1D); \ + const int j = dj(qx,dx,DPA_D1D); \ + const int k = qk(qx,dx,DPA_Q1D); \ + const int l = dl(qx,dx,DPA_D1D); \ + const double s = sign(qx,dx); \ + u += QQQ0[qz][qy][qx] * Gt[l][k] * s; \ + v += QQQ1[qz][qy][qx] * Bt[j][i]; \ + w += QQQ2[qz][qy][qx] * Bt[j][i]; \ + } \ + QQD0[qz][qy][dx] = u; \ + QQD1[qz][qy][dx] = v; \ + QQD2[qz][qy][dx] = w; + #define DIFFUSION3DPA_8 \ double u[DPA_Q1D], v[DPA_Q1D], w[DPA_Q1D]; \ RAJA_UNROLL(MQ1) \ @@ -564,7 +668,25 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) QDD0[qz][dy][dx] = u[qz]; \ QDD1[qz][dy][dx] = v[qz]; \ QDD2[qz][dy][dx] = w[qz]; \ - } \ + } + +#define NEW_DIFFUSION3DPA_8 \ + double u = 0.0, v = 0.0, w = 0.0; \ + RAJA_UNROLL(DPA_Q1D) \ + for (int qy = 0; qy < DPA_Q1D; ++qy) \ + { \ + const int i = qi(qy,dy,DPA_Q1D); \ + const int j = dj(qy,dy,DPA_D1D); \ + const int k = qk(qy,dy,DPA_Q1D); \ + const int l = dl(qy,dy,DPA_D1D); \ + const double s = sign(qy,dy); \ + u += QQD0[qz][qy][dx] * Bt[j][i]; \ + v += QQD1[qz][qy][dx] * Gt[l][k] * s; \ + w += QQD2[qz][qy][dx] * Bt[j][i]; \ + } \ + QDD0[qz][dy][dx] = u; \ + QDD1[qz][dy][dx] = v; \ + QDD2[qz][dy][dx] = w; #define DIFFUSION3DPA_9 \ double u[DPA_D1D], v[DPA_D1D], w[DPA_D1D]; \ @@ -592,11 +714,28 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) dpaY_(dx,dy,dz,e) += (u[dz] + v[dz] + w[dz]); \ } +#define NEW_DIFFUSION3DPA_9 \ + double u = 0.0, v = 0.0, w = 0.0; \ + RAJA_UNROLL(MQ1) \ + for (int qz = 0; qz < DPA_Q1D; ++qz) \ + { \ + const int i = qi(qz,dz,DPA_Q1D); \ + const int j = dj(qz,dz,DPA_D1D); \ + const int k = qk(qz,dz,DPA_Q1D); \ + const int l = dl(qz,dz,DPA_D1D); \ + const double s = sign(qz,dz); \ + u += QDD0[qz][dy][dx] * Bt[j][i]; \ + v += QDD1[qz][dy][dx] * Bt[j][i]; \ + w += QDD2[qz][dy][dx] * Gt[l][k] * s; \ + } \ + dpaY_(dx,dy,dz,e) += (u + v + w); + #if defined(RAJA_ENABLE_CUDA) using d3d_device_launch = RAJA::expt::cuda_launch_t; using d3d_gpu_block_x_policy = RAJA::cuda_block_x_direct; using d3d_gpu_thread_x_policy = RAJA::cuda_thread_x_loop; using d3d_gpu_thread_y_policy = RAJA::cuda_thread_y_loop; + using d3d_gpu_thread_z_policy = RAJA::cuda_thread_z_loop; #endif #if defined(RAJA_ENABLE_HIP) @@ -604,6 +743,7 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) using d3d_gpu_block_x_policy = RAJA::hip_block_x_direct; using d3d_gpu_thread_x_policy = RAJA::hip_thread_x_loop; using d3d_gpu_thread_y_policy = 
RAJA::hip_thread_y_loop; + using d3d_gpu_thread_z_policy = RAJA::hip_thread_z_loop; #endif namespace rajaperf From d4f9ff97449168256e2cb7c4ea55552717f86dea Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 15 Dec 2021 11:59:00 -0800 Subject: [PATCH 172/392] formatting omp version --- src/apps/DIFFUSION3DPA-OMP.cpp | 375 +++++++++++++++++++++------------ src/apps/DIFFUSION3DPA-Seq.cpp | 116 +++++----- 2 files changed, 303 insertions(+), 188 deletions(-) diff --git a/src/apps/DIFFUSION3DPA-OMP.cpp b/src/apps/DIFFUSION3DPA-OMP.cpp index 05eec0eb6..b0398bf67 100644 --- a/src/apps/DIFFUSION3DPA-OMP.cpp +++ b/src/apps/DIFFUSION3DPA-OMP.cpp @@ -18,6 +18,7 @@ namespace rajaperf { namespace apps { +#define MFEM_SYNC_THREAD void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { @@ -38,57 +39,103 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { DIFFUSION3DPA_0_CPU; - CPU_FOREACH(dy, y, DPA_D1D) { - CPU_FOREACH(dx, x, DPA_D1D) { - DIFFUSION3DPA_1; - } - CPU_FOREACH(qx, x, DPA_Q1D) { - DIFFUSION3DPA_2; - } - } - - - CPU_FOREACH(dy, y, DPA_D1D) { - CPU_FOREACH(qx, x, DPA_Q1D) { - DIFFUSION3DPA_3; - } - } - - CPU_FOREACH(qy, y, DPA_Q1D) { - CPU_FOREACH(qx, x, DPA_Q1D) { - DIFFUSION3DPA_4; - } - } - - CPU_FOREACH(qy, y, DPA_Q1D) { - CPU_FOREACH(qx, x, DPA_Q1D) { - DIFFUSION3DPA_5; - } - } - - CPU_FOREACH(d, y, DPA_D1D) { - CPU_FOREACH(q, x, DPA_Q1D) { - DIFFUSION3DPA_6; - } - } - - CPU_FOREACH(qy, y, DPA_Q1D) { - CPU_FOREACH(dx, x, DPA_D1D) { - DIFFUSION3DPA_7; - } - } - - CPU_FOREACH(dy, y, DPA_D1D) { - CPU_FOREACH(dx, x, DPA_D1D) { - DIFFUSION3DPA_8; - } - } - - CPU_FOREACH(dy, y, DPA_D1D) { - CPU_FOREACH(dx, x, DPA_D1D) { - DIFFUSION3DPA_9; + CPU_FOREACH(dz,z,DPA_D1D) + { + CPU_FOREACH(dy,y,DPA_D1D) + { + CPU_FOREACH(dx,x,DPA_D1D) + { + NEW_DIFFUSION3DPA_1; + } } } + //if (MFEM_THREAD_ID(z) == 0) + //{ + CPU_FOREACH(dy,y,DPA_D1D) + { + CPU_FOREACH(qx,x,DPA_Q1D) + { + NEW_DIFFUSION3DPA_2; + } + } + //} + MFEM_SYNC_THREAD; + CPU_FOREACH(dz,z,DPA_D1D) + { + CPU_FOREACH(dy,y,DPA_D1D) + { + CPU_FOREACH(qx,x,DPA_Q1D) + { + NEW_DIFFUSION3DPA_3; + } + } + } + MFEM_SYNC_THREAD; + CPU_FOREACH(dz,z,DPA_D1D) + { + CPU_FOREACH(qy,y,DPA_Q1D) + { + CPU_FOREACH(qx,x,DPA_Q1D) + { + NEW_DIFFUSION3DPA_4; + } + } + } + MFEM_SYNC_THREAD; + CPU_FOREACH(qz,z,DPA_Q1D) + { + CPU_FOREACH(qy,y,DPA_Q1D) + { + CPU_FOREACH(qx,x,DPA_Q1D) + { + NEW_DIFFUSION3DPA_5; + } + } + } + MFEM_SYNC_THREAD; + //if (MFEM_THREAD_ID(z) == 0) + //{ + CPU_FOREACH(d,y,DPA_D1D) + { + CPU_FOREACH(q,x,DPA_Q1D) + { + NEW_DIFFUSION3DPA_6; + } + } + //} + MFEM_SYNC_THREAD; + CPU_FOREACH(qz,z,DPA_Q1D) + { + CPU_FOREACH(qy,y,DPA_Q1D) + { + CPU_FOREACH(dx,x,DPA_D1D) + { + NEW_DIFFUSION3DPA_7; + } + } + } + MFEM_SYNC_THREAD; + CPU_FOREACH(qz,z,DPA_Q1D) + { + CPU_FOREACH(dy,y,DPA_D1D) + { + CPU_FOREACH(dx,x,DPA_D1D) + { + NEW_DIFFUSION3DPA_8; + } + } + } + MFEM_SYNC_THREAD; + CPU_FOREACH(dz,z,DPA_D1D) + { + CPU_FOREACH(dy,y,DPA_D1D) + { + CPU_FOREACH(dx,x,DPA_D1D) + { + NEW_DIFFUSION3DPA_9; + } + } + } } // element loop } @@ -124,6 +171,12 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { #endif >; + using inner_z = RAJA::expt::LoopPolicy; + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -137,103 +190,165 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { DIFFUSION3DPA_0_CPU; - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int dx) { - DIFFUSION3DPA_1; - } - ); // RAJA::expt::loop - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 
DPA_Q1D), - [&](int qx) { - DIFFUSION3DPA_2; - } - ); // RAJA::expt::loop - } // lambda (dy) - ); //RAJA::expt::loop + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { - ctx.teamSync(); + NEW_DIFFUSION3DPA_1; - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](int qx) { - DIFFUSION3DPA_3; - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop + } // lambda (dx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop - ctx.teamSync(); + ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](int qx) { - DIFFUSION3DPA_4; - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 0), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { - ctx.teamSync(); + NEW_DIFFUSION3DPA_2; + + } // lambda (qx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + NEW_DIFFUSION3DPA_3; + + } // lambda (qx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop + + ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](int qx) { - DIFFUSION3DPA_5; - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + NEW_DIFFUSION3DPA_4; + + } // lambda (qx) + ); // RAJA::expt::loop + } // lambda (qy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int d) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](int q) { - DIFFUSION3DPA_6; - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + NEW_DIFFUSION3DPA_5; + + } // lambda (qx) + ); // RAJA::expt::loop + } // lambda (qy) + ); //RAJA::expt::loop + } // lambda (qz) + ); //RAJA::expt::loop ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int dx) { - DIFFUSION3DPA_7; - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 0), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int d) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int q) { + + NEW_DIFFUSION3DPA_6; + + } // lambda (q) + ); // RAJA::expt::loop + } // lambda (d) + ); //RAJA::expt::loop + } // lambda 
(dz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + NEW_DIFFUSION3DPA_7; + + } // lambda (dx) + ); // RAJA::expt::loop + } // lambda (qy) + ); //RAJA::expt::loop + } // lambda (qz) + ); //RAJA::expt::loop ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int dx) { - DIFFUSION3DPA_8; - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { - ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int dx) { - DIFFUSION3DPA_9; - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop + NEW_DIFFUSION3DPA_8; + + } // lambda (dx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (qz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + NEW_DIFFUSION3DPA_9; + + } // lambda (dx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop } // lambda (e) ); // RAJA::expt::loop diff --git a/src/apps/DIFFUSION3DPA-Seq.cpp b/src/apps/DIFFUSION3DPA-Seq.cpp index f2aa2a5e9..8db5cf796 100644 --- a/src/apps/DIFFUSION3DPA-Seq.cpp +++ b/src/apps/DIFFUSION3DPA-Seq.cpp @@ -207,61 +207,61 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { } // lambda (dy) ); //RAJA::expt::loop } // lambda (dz) - ); //RAJA::expt::loop + ); //RAJA::expt::loop - ctx.teamSync(); + ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 0), - [&](int dz) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](int qx) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 0), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { NEW_DIFFUSION3DPA_2; - } // lambda (qx) - ); // RAJA::expt::loop - } // lambda (dy) - ); //RAJA::expt::loop - } // lambda (dz) - ); //RAJA::expt::loop + } // lambda (qx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop - ctx.teamSync(); + ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int dz) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](int qx) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { NEW_DIFFUSION3DPA_3; - } // lambda (qx) - ); // RAJA::expt::loop - } // lambda (dy) - ); //RAJA::expt::loop - } // lambda (dz) - ); //RAJA::expt::loop + } // lambda (qx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop - 
ctx.teamSync(); + ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int dz) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](int qx) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { NEW_DIFFUSION3DPA_4; - } // lambda (qx) - ); // RAJA::expt::loop - } // lambda (qy) - ); //RAJA::expt::loop - } // lambda (dz) - ); //RAJA::expt::loop + } // lambda (qx) + ); // RAJA::expt::loop + } // lambda (qy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop ctx.teamSync(); @@ -272,86 +272,86 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), [&](int qx) { - NEW_DIFFUSION3DPA_5; + NEW_DIFFUSION3DPA_5; } // lambda (qx) ); // RAJA::expt::loop } // lambda (qy) ); //RAJA::expt::loop } // lambda (qz) - ); //RAJA::expt::loop + ); //RAJA::expt::loop - ctx.teamSync(); + ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 0), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 0), [&](int dz) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int d) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), [&](int q) { - NEW_DIFFUSION3DPA_6; + NEW_DIFFUSION3DPA_6; } // lambda (q) ); // RAJA::expt::loop } // lambda (d) ); //RAJA::expt::loop } // lambda (dz) - ); //RAJA::expt::loop + ); //RAJA::expt::loop - ctx.teamSync(); + ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), [&](int qz) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), [&](int qy) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dx) { - NEW_DIFFUSION3DPA_7; + NEW_DIFFUSION3DPA_7; } // lambda (dx) ); // RAJA::expt::loop } // lambda (qy) ); //RAJA::expt::loop } // lambda (qz) - ); //RAJA::expt::loop + ); //RAJA::expt::loop - ctx.teamSync(); + ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), [&](int qz) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dy) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dx) { - NEW_DIFFUSION3DPA_8; + NEW_DIFFUSION3DPA_8; } // lambda (dx) ); // RAJA::expt::loop } // lambda (dy) ); //RAJA::expt::loop } // lambda (qz) - ); //RAJA::expt::loop + ); //RAJA::expt::loop - ctx.teamSync(); + ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dz) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dy) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dx) { - NEW_DIFFUSION3DPA_9; + NEW_DIFFUSION3DPA_9; } // lambda (dx) ); // RAJA::expt::loop } // lambda (dy) ); //RAJA::expt::loop } // lambda (dz) - ); //RAJA::expt::loop + ); //RAJA::expt::loop } // lambda (e) ); // RAJA::expt::loop From 4c0f8fd5c5cde8fb07aaedfeb68cbe39066efef1 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 15 Dec 2021 13:52:33 -0800 Subject: [PATCH 173/392] about to apply clang format --- src/apps/DIFFUSION3DPA-Cuda.cpp | 350 +++++++++++++++++++++----------- src/apps/DIFFUSION3DPA-Hip.cpp | 350 +++++++++++++++++++++----------- src/apps/DIFFUSION3DPA-Seq.cpp | 4 +- 3 files changed, 464 insertions(+), 240 deletions(-) diff --git a/src/apps/DIFFUSION3DPA-Cuda.cpp 
b/src/apps/DIFFUSION3DPA-Cuda.cpp index e51e77636..2b2ddc514 100644 --- a/src/apps/DIFFUSION3DPA-Cuda.cpp +++ b/src/apps/DIFFUSION3DPA-Cuda.cpp @@ -44,58 +44,104 @@ __global__ void Diffusion3DPA(Index_type NE, const Real_ptr Basis, const Real_pt DIFFUSION3DPA_0_GPU; - GPU_FOREACH_THREAD(dy, y, DPA_D1D) { - GPU_FOREACH_THREAD(dx, x, DPA_D1D) { - DIFFUSION3DPA_1; - } - GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { - DIFFUSION3DPA_2; - } + GPU_FOREACH_THREAD(dz,z,DPA_D1D) + { + GPU_FOREACH_THREAD(dy,y,DPA_D1D) + { + GPU_FOREACH_THREAD(dx,x,DPA_D1D) + { + NEW_DIFFUSION3DPA_1; + } + } } - __syncthreads(); - GPU_FOREACH_THREAD(dy, y, DPA_D1D) { - GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { - DIFFUSION3DPA_3; - } + if (threadIdx.z == 0) + { + GPU_FOREACH_THREAD(dy,y,DPA_D1D) + { + GPU_FOREACH_THREAD(qx,x,DPA_Q1D) + { + NEW_DIFFUSION3DPA_2; + } + } } __syncthreads(); - GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { - GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { - DIFFUSION3DPA_4; - } + GPU_FOREACH_THREAD(dz,z,DPA_D1D) + { + GPU_FOREACH_THREAD(dy,y,DPA_D1D) + { + GPU_FOREACH_THREAD(qx,x,DPA_Q1D) + { + NEW_DIFFUSION3DPA_3; + } + } } __syncthreads(); - GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { - GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { - DIFFUSION3DPA_5; + GPU_FOREACH_THREAD(dz,z,DPA_D1D) + { + GPU_FOREACH_THREAD(qy,y,DPA_Q1D) + { + GPU_FOREACH_THREAD(qx,x,DPA_Q1D) + { + NEW_DIFFUSION3DPA_4; + } } } __syncthreads(); - GPU_FOREACH_THREAD(d, y, DPA_D1D) { - GPU_FOREACH_THREAD(q, x, DPA_Q1D) { - DIFFUSION3DPA_6; + GPU_FOREACH_THREAD(qz,z,DPA_Q1D) + { + GPU_FOREACH_THREAD(qy,y,DPA_Q1D) + { + GPU_FOREACH_THREAD(qx,x,DPA_Q1D) + { + NEW_DIFFUSION3DPA_5; + } } } __syncthreads(); - GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { - GPU_FOREACH_THREAD(dx, x, DPA_D1D) { - DIFFUSION3DPA_7; - } + if (threadIdx.z == 0) + { + GPU_FOREACH_THREAD(d,y,DPA_D1D) + { + GPU_FOREACH_THREAD(q,x,DPA_Q1D) + { + NEW_DIFFUSION3DPA_6; + } + } } __syncthreads(); - GPU_FOREACH_THREAD(dy, y, DPA_D1D) { - GPU_FOREACH_THREAD(dx, x, DPA_D1D) { - DIFFUSION3DPA_8; - } + GPU_FOREACH_THREAD(qz,z,DPA_Q1D) + { + GPU_FOREACH_THREAD(qy,y,DPA_Q1D) + { + GPU_FOREACH_THREAD(dx,x,DPA_D1D) + { + NEW_DIFFUSION3DPA_7; + } + } } __syncthreads(); - GPU_FOREACH_THREAD(dy, y, DPA_D1D) { - GPU_FOREACH_THREAD(dx, x, DPA_D1D) { - DIFFUSION3DPA_9; - } - } - + GPU_FOREACH_THREAD(qz,z,DPA_Q1D) + { + GPU_FOREACH_THREAD(dy,y,DPA_D1D) + { + GPU_FOREACH_THREAD(dx,x,DPA_D1D) + { + NEW_DIFFUSION3DPA_8; + } + } + } + __syncthreads(); + GPU_FOREACH_THREAD(dz,z,DPA_D1D) + { + GPU_FOREACH_THREAD(dy,y,DPA_D1D) + { + GPU_FOREACH_THREAD(dx,x,DPA_D1D) + { + NEW_DIFFUSION3DPA_9; + } + } + } } void DIFFUSION3DPA::runCudaVariant(VariantID vid) { @@ -112,7 +158,7 @@ void DIFFUSION3DPA::runCudaVariant(VariantID vid) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - dim3 nthreads_per_block(DPA_Q1D, DPA_Q1D, 1); + dim3 nthreads_per_block(DPA_Q1D, DPA_Q1D, DPA_Q1D); Diffusion3DPA<<>>(NE, Basis, dBasis, D, X, Y, symmetric); @@ -145,13 +191,17 @@ void DIFFUSION3DPA::runCudaVariant(VariantID vid) { ,RAJA::cuda_thread_y_loop >; + using inner_z = RAJA::expt::LoopPolicy; + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { RAJA::expt::launch( RAJA::expt::DEVICE, RAJA::expt::Grid(RAJA::expt::Teams(NE), - RAJA::expt::Threads(DPA_Q1D, DPA_Q1D, 1)), + RAJA::expt::Threads(DPA_Q1D, DPA_Q1D, DPA_Q1D)), [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NE), @@ -159,103 +209,165 @@ void DIFFUSION3DPA::runCudaVariant(VariantID vid) { DIFFUSION3DPA_0_GPU; - 
RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int dx) { - DIFFUSION3DPA_1; - } - ); // RAJA::expt::loop - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](int qx) { - DIFFUSION3DPA_2; - } - ); // RAJA::expt::loop - } // lambda (dy) - ); //RAJA::expt::loop + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { - ctx.teamSync(); + NEW_DIFFUSION3DPA_1; - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](int qx) { - DIFFUSION3DPA_3; - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop + } // lambda (dx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop - ctx.teamSync(); + ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](int qx) { - DIFFUSION3DPA_4; - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 0), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { - ctx.teamSync(); + NEW_DIFFUSION3DPA_2; + + } // lambda (qx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop + + ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](int qx) { - DIFFUSION3DPA_5; - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + NEW_DIFFUSION3DPA_3; + + } // lambda (qx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + NEW_DIFFUSION3DPA_4; + + } // lambda (qx) + ); // RAJA::expt::loop + } // lambda (qy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int d) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](int q) { - DIFFUSION3DPA_6; - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + NEW_DIFFUSION3DPA_5; + + } // lambda (qx) + ); // RAJA::expt::loop + } // lambda (qy) + ); //RAJA::expt::loop + } // lambda (qz) + ); //RAJA::expt::loop ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int dx) { - DIFFUSION3DPA_7; - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 0), + [&](int dz) { + RAJA::expt::loop(ctx, 
RAJA::RangeSegment(0, DPA_D1D), + [&](int d) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int q) { + + NEW_DIFFUSION3DPA_6; + + } // lambda (q) + ); // RAJA::expt::loop + } // lambda (d) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + NEW_DIFFUSION3DPA_7; + + } // lambda (dx) + ); // RAJA::expt::loop + } // lambda (qy) + ); //RAJA::expt::loop + } // lambda (qz) + ); //RAJA::expt::loop ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int dx) { - DIFFUSION3DPA_8; - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { - ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int dx) { - DIFFUSION3DPA_9; - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop + NEW_DIFFUSION3DPA_8; + + } // lambda (dx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (qz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + NEW_DIFFUSION3DPA_9; + + } // lambda (dx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop } // lambda (e) ); // RAJA::expt::loop diff --git a/src/apps/DIFFUSION3DPA-Hip.cpp b/src/apps/DIFFUSION3DPA-Hip.cpp index fc4e2183b..d3919fa11 100644 --- a/src/apps/DIFFUSION3DPA-Hip.cpp +++ b/src/apps/DIFFUSION3DPA-Hip.cpp @@ -44,58 +44,104 @@ __global__ void Diffusion3DPA(Index_type NE, const Real_ptr Basis, const Real_pt DIFFUSION3DPA_0_GPU; - GPU_FOREACH_THREAD(dy, y, DPA_D1D) { - GPU_FOREACH_THREAD(dx, x, DPA_D1D) { - DIFFUSION3DPA_1; - } - GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { - DIFFUSION3DPA_2; - } + GPU_FOREACH_THREAD(dz,z,DPA_D1D) + { + GPU_FOREACH_THREAD(dy,y,DPA_D1D) + { + GPU_FOREACH_THREAD(dx,x,DPA_D1D) + { + NEW_DIFFUSION3DPA_1; + } + } } - __syncthreads(); - GPU_FOREACH_THREAD(dy, y, DPA_D1D) { - GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { - DIFFUSION3DPA_3; - } + if (threadIdx.z == 0) + { + GPU_FOREACH_THREAD(dy,y,DPA_D1D) + { + GPU_FOREACH_THREAD(qx,x,DPA_Q1D) + { + NEW_DIFFUSION3DPA_2; + } + } } __syncthreads(); - GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { - GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { - DIFFUSION3DPA_4; - } + GPU_FOREACH_THREAD(dz,z,DPA_D1D) + { + GPU_FOREACH_THREAD(dy,y,DPA_D1D) + { + GPU_FOREACH_THREAD(qx,x,DPA_Q1D) + { + NEW_DIFFUSION3DPA_3; + } + } } __syncthreads(); - GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { - GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { - DIFFUSION3DPA_5; + GPU_FOREACH_THREAD(dz,z,DPA_D1D) + { + GPU_FOREACH_THREAD(qy,y,DPA_Q1D) + { + GPU_FOREACH_THREAD(qx,x,DPA_Q1D) + { + NEW_DIFFUSION3DPA_4; + } } } __syncthreads(); - GPU_FOREACH_THREAD(d, y, DPA_D1D) { - GPU_FOREACH_THREAD(q, x, DPA_Q1D) { - DIFFUSION3DPA_6; + GPU_FOREACH_THREAD(qz,z,DPA_Q1D) + { + GPU_FOREACH_THREAD(qy,y,DPA_Q1D) + { + 
GPU_FOREACH_THREAD(qx,x,DPA_Q1D) + { + NEW_DIFFUSION3DPA_5; + } } } __syncthreads(); - GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { - GPU_FOREACH_THREAD(dx, x, DPA_D1D) { - DIFFUSION3DPA_7; - } + if (threadIdx.z == 0) + { + GPU_FOREACH_THREAD(d,y,DPA_D1D) + { + GPU_FOREACH_THREAD(q,x,DPA_Q1D) + { + NEW_DIFFUSION3DPA_6; + } + } } __syncthreads(); - GPU_FOREACH_THREAD(dy, y, DPA_D1D) { - GPU_FOREACH_THREAD(dx, x, DPA_D1D) { - DIFFUSION3DPA_8; - } + GPU_FOREACH_THREAD(qz,z,DPA_Q1D) + { + GPU_FOREACH_THREAD(qy,y,DPA_Q1D) + { + GPU_FOREACH_THREAD(dx,x,DPA_D1D) + { + NEW_DIFFUSION3DPA_7; + } + } } __syncthreads(); - GPU_FOREACH_THREAD(dy, y, DPA_D1D) { - GPU_FOREACH_THREAD(dx, x, DPA_D1D) { - DIFFUSION3DPA_9; - } - } - + GPU_FOREACH_THREAD(qz,z,DPA_Q1D) + { + GPU_FOREACH_THREAD(dy,y,DPA_D1D) + { + GPU_FOREACH_THREAD(dx,x,DPA_D1D) + { + NEW_DIFFUSION3DPA_8; + } + } + } + __syncthreads(); + GPU_FOREACH_THREAD(dz,z,DPA_D1D) + { + GPU_FOREACH_THREAD(dy,y,DPA_D1D) + { + GPU_FOREACH_THREAD(dx,x,DPA_D1D) + { + NEW_DIFFUSION3DPA_9; + } + } + } } void DIFFUSION3DPA::runHipVariant(VariantID vid) { @@ -110,7 +156,7 @@ void DIFFUSION3DPA::runHipVariant(VariantID vid) { DIFFUSION3DPA_DATA_SETUP_HIP; dim3 grid_size(NE); - dim3 block_size(DPA_Q1D, DPA_Q1D, 1); + dim3 block_size(DPA_Q1D, DPA_Q1D, DPA_Q1D); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -148,13 +194,17 @@ void DIFFUSION3DPA::runHipVariant(VariantID vid) { ,RAJA::hip_thread_y_loop >; + using inner_y = RAJA::expt::LoopPolicy; + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { RAJA::expt::launch( RAJA::expt::DEVICE, RAJA::expt::Grid(RAJA::expt::Teams(NE), - RAJA::expt::Threads(DPA_Q1D, DPA_Q1D, 1)), + RAJA::expt::Threads(DPA_Q1D, DPA_Q1D, DPA_Q1D)), [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NE), @@ -162,103 +212,165 @@ void DIFFUSION3DPA::runHipVariant(VariantID vid) { DIFFUSION3DPA_0_GPU; - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int dx) { - DIFFUSION3DPA_1; - } - ); // RAJA::expt::loop - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](int qx) { - DIFFUSION3DPA_2; - } - ); // RAJA::expt::loop - } // lambda (dy) - ); //RAJA::expt::loop + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { - ctx.teamSync(); + NEW_DIFFUSION3DPA_1; - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](int qx) { - DIFFUSION3DPA_3; - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop + } // lambda (dx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop - ctx.teamSync(); + ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](int qx) { - DIFFUSION3DPA_4; - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 0), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { - ctx.teamSync(); + NEW_DIFFUSION3DPA_2; + + } // lambda (qx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (dz) + ); 
//RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + NEW_DIFFUSION3DPA_3; + + } // lambda (qx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + NEW_DIFFUSION3DPA_4; - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](int qx) { - DIFFUSION3DPA_5; - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop + } // lambda (qx) + ); // RAJA::expt::loop + } // lambda (qy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int d) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](int q) { - DIFFUSION3DPA_6; - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + NEW_DIFFUSION3DPA_5; + + } // lambda (qx) + ); // RAJA::expt::loop + } // lambda (qy) + ); //RAJA::expt::loop + } // lambda (qz) + ); //RAJA::expt::loop ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](int qy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int dx) { - DIFFUSION3DPA_7; - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 0), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int d) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int q) { + + NEW_DIFFUSION3DPA_6; + + } // lambda (q) + ); // RAJA::expt::loop + } // lambda (d) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + NEW_DIFFUSION3DPA_7; + + } // lambda (dx) + ); // RAJA::expt::loop + } // lambda (qy) + ); //RAJA::expt::loop + } // lambda (qz) + ); //RAJA::expt::loop ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int dx) { - DIFFUSION3DPA_8; - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { - ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int dy) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](int dx) { - DIFFUSION3DPA_9; - } - ); // RAJA::expt::loop - } - ); // RAJA::expt::loop + NEW_DIFFUSION3DPA_8; + + } // lambda (dx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (qz) + ); //RAJA::expt::loop + + ctx.teamSync(); + + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 
DPA_D1D), + [&](int dz) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + NEW_DIFFUSION3DPA_9; + + } // lambda (dx) + ); // RAJA::expt::loop + } // lambda (dy) + ); //RAJA::expt::loop + } // lambda (dz) + ); //RAJA::expt::loop } // lambda (e) ); // RAJA::expt::loop diff --git a/src/apps/DIFFUSION3DPA-Seq.cpp b/src/apps/DIFFUSION3DPA-Seq.cpp index 8db5cf796..2f5ecfb10 100644 --- a/src/apps/DIFFUSION3DPA-Seq.cpp +++ b/src/apps/DIFFUSION3DPA-Seq.cpp @@ -211,7 +211,7 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 0), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 1), [&](int dz) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dy) { @@ -283,7 +283,7 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 0), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 1), [&](int dz) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int d) { From 5c0d7734020bfaa0d3bfae0eedfeca2a1615669b Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 15 Dec 2021 15:14:19 -0800 Subject: [PATCH 174/392] faster Diffusion kernels based on Tom Stitt's work --- src/apps/DIFFUSION3DPA-Cuda.cpp | 222 ++++++++---------- src/apps/DIFFUSION3DPA-Hip.cpp | 222 ++++++++---------- src/apps/DIFFUSION3DPA-OMP.cpp | 207 ++++++++--------- src/apps/DIFFUSION3DPA-OMPTarget.cpp | 4 +- src/apps/DIFFUSION3DPA-Seq.cpp | 180 ++++++--------- src/apps/DIFFUSION3DPA.hpp | 328 +++++++++++---------------- 6 files changed, 488 insertions(+), 675 deletions(-) diff --git a/src/apps/DIFFUSION3DPA-Cuda.cpp b/src/apps/DIFFUSION3DPA-Cuda.cpp index 2b2ddc514..fd4872bbc 100644 --- a/src/apps/DIFFUSION3DPA-Cuda.cpp +++ b/src/apps/DIFFUSION3DPA-Cuda.cpp @@ -22,126 +22,100 @@ namespace rajaperf { namespace apps { -#define DIFFUSION3DPA_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(Basis, m_B, DPA_Q1D *DPA_D1D); \ - allocAndInitCudaDeviceData(dBasis, m_G, DPA_Q1D *DPA_D1D); \ - allocAndInitCudaDeviceData(D, m_D, DPA_Q1D *DPA_Q1D *DPA_Q1D *SYM *m_NE); \ - allocAndInitCudaDeviceData(X, m_X, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); \ +#define DIFFUSION3DPA_DATA_SETUP_CUDA \ + allocAndInitCudaDeviceData(Basis, m_B, DPA_Q1D *DPA_D1D); \ + allocAndInitCudaDeviceData(dBasis, m_G, DPA_Q1D *DPA_D1D); \ + allocAndInitCudaDeviceData(D, m_D, DPA_Q1D *DPA_Q1D *DPA_Q1D *SYM *m_NE); \ + allocAndInitCudaDeviceData(X, m_X, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); \ allocAndInitCudaDeviceData(Y, m_Y, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); -#define DIFFUSION3DPA_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_Y, Y, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); \ - deallocCudaDeviceData(Basis); \ - deallocCudaDeviceData(dBasis); \ - deallocCudaDeviceData(D); \ - deallocCudaDeviceData(X); \ +#define DIFFUSION3DPA_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_Y, Y, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); \ + deallocCudaDeviceData(Basis); \ + deallocCudaDeviceData(dBasis); \ + deallocCudaDeviceData(D); \ + deallocCudaDeviceData(X); \ deallocCudaDeviceData(Y); -__global__ void Diffusion3DPA(Index_type NE, const Real_ptr Basis, const Real_ptr dBasis, - const Real_ptr D, const Real_ptr X, Real_ptr Y, bool symmetric) { +__global__ void Diffusion3DPA(Index_type NE, const Real_ptr Basis, + const Real_ptr dBasis, const Real_ptr D, + const Real_ptr X, Real_ptr Y, bool symmetric) { const int e = blockIdx.x; DIFFUSION3DPA_0_GPU; - GPU_FOREACH_THREAD(dz,z,DPA_D1D) - { - 
GPU_FOREACH_THREAD(dy,y,DPA_D1D) - { - GPU_FOREACH_THREAD(dx,x,DPA_D1D) - { - NEW_DIFFUSION3DPA_1; - } + GPU_FOREACH_THREAD(dz, z, DPA_D1D) { + GPU_FOREACH_THREAD(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD(dx, x, DPA_D1D) { + DIFFUSION3DPA_1; } + } } - if (threadIdx.z == 0) - { - GPU_FOREACH_THREAD(dy,y,DPA_D1D) - { - GPU_FOREACH_THREAD(qx,x,DPA_Q1D) - { - NEW_DIFFUSION3DPA_2; - } + if (threadIdx.z == 0) { + GPU_FOREACH_THREAD(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { + DIFFUSION3DPA_2; } + } } __syncthreads(); - GPU_FOREACH_THREAD(dz,z,DPA_D1D) - { - GPU_FOREACH_THREAD(dy,y,DPA_D1D) - { - GPU_FOREACH_THREAD(qx,x,DPA_Q1D) - { - NEW_DIFFUSION3DPA_3; - } + GPU_FOREACH_THREAD(dz, z, DPA_D1D) { + GPU_FOREACH_THREAD(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { + DIFFUSION3DPA_3; } + } } __syncthreads(); - GPU_FOREACH_THREAD(dz,z,DPA_D1D) - { - GPU_FOREACH_THREAD(qy,y,DPA_Q1D) - { - GPU_FOREACH_THREAD(qx,x,DPA_Q1D) - { - NEW_DIFFUSION3DPA_4; + GPU_FOREACH_THREAD(dz, z, DPA_D1D) { + GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { + GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { + DIFFUSION3DPA_4; } } } __syncthreads(); - GPU_FOREACH_THREAD(qz,z,DPA_Q1D) - { - GPU_FOREACH_THREAD(qy,y,DPA_Q1D) - { - GPU_FOREACH_THREAD(qx,x,DPA_Q1D) - { - NEW_DIFFUSION3DPA_5; - } + GPU_FOREACH_THREAD(qz, z, DPA_Q1D) { + GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { + GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { + DIFFUSION3DPA_5; + } } } __syncthreads(); - if (threadIdx.z == 0) - { - GPU_FOREACH_THREAD(d,y,DPA_D1D) - { - GPU_FOREACH_THREAD(q,x,DPA_Q1D) - { - NEW_DIFFUSION3DPA_6; - } + if (threadIdx.z == 0) { + GPU_FOREACH_THREAD(d, y, DPA_D1D) { + GPU_FOREACH_THREAD(q, x, DPA_Q1D) { + DIFFUSION3DPA_6; } + } } __syncthreads(); - GPU_FOREACH_THREAD(qz,z,DPA_Q1D) - { - GPU_FOREACH_THREAD(qy,y,DPA_Q1D) - { - GPU_FOREACH_THREAD(dx,x,DPA_D1D) - { - NEW_DIFFUSION3DPA_7; - } + GPU_FOREACH_THREAD(qz, z, DPA_Q1D) { + GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { + GPU_FOREACH_THREAD(dx, x, DPA_D1D) { + DIFFUSION3DPA_7; } + } } __syncthreads(); - GPU_FOREACH_THREAD(qz,z,DPA_Q1D) - { - GPU_FOREACH_THREAD(dy,y,DPA_D1D) - { - GPU_FOREACH_THREAD(dx,x,DPA_D1D) - { - NEW_DIFFUSION3DPA_8; - } - } + GPU_FOREACH_THREAD(qz, z, DPA_Q1D) { + GPU_FOREACH_THREAD(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD(dx, x, DPA_D1D) { + DIFFUSION3DPA_8; } - __syncthreads(); - GPU_FOREACH_THREAD(dz,z,DPA_D1D) - { - GPU_FOREACH_THREAD(dy,y,DPA_D1D) - { - GPU_FOREACH_THREAD(dx,x,DPA_D1D) - { - NEW_DIFFUSION3DPA_9; - } - } + } + } + __syncthreads(); + GPU_FOREACH_THREAD(dz, z, DPA_D1D) { + GPU_FOREACH_THREAD(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD(dx, x, DPA_D1D) { + DIFFUSION3DPA_9; } + } + } } void DIFFUSION3DPA::runCudaVariant(VariantID vid) { @@ -160,9 +134,10 @@ void DIFFUSION3DPA::runCudaVariant(VariantID vid) { dim3 nthreads_per_block(DPA_Q1D, DPA_Q1D, DPA_Q1D); - Diffusion3DPA<<>>(NE, Basis, dBasis, D, X, Y, symmetric); + Diffusion3DPA<<>>(NE, Basis, dBasis, D, X, Y, + symmetric); - cudaErrchk( cudaGetLastError() ); + cudaErrchk(cudaGetLastError()); } stopTimer(); @@ -175,39 +150,35 @@ void DIFFUSION3DPA::runCudaVariant(VariantID vid) { DIFFUSION3DPA_DATA_SETUP_CUDA; - using launch_policy = RAJA::expt::LaunchPolicy - >; + using launch_policy = + RAJA::expt::LaunchPolicy>; - using outer_x = RAJA::expt::LoopPolicy; + using outer_x = + RAJA::expt::LoopPolicy; - using inner_x = RAJA::expt::LoopPolicy; + using inner_x = + RAJA::expt::LoopPolicy; - using inner_y = RAJA::expt::LoopPolicy; + using inner_y = + RAJA::expt::LoopPolicy; - using inner_z = RAJA::expt::LoopPolicy; + using 
inner_z = + RAJA::expt::LoopPolicy; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { RAJA::expt::launch( - RAJA::expt::DEVICE, - RAJA::expt::Grid(RAJA::expt::Teams(NE), - RAJA::expt::Threads(DPA_Q1D, DPA_Q1D, DPA_Q1D)), - [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + RAJA::expt::DEVICE, + RAJA::expt::Grid(RAJA::expt::Teams(NE), + RAJA::expt::Threads(DPA_Q1D, DPA_Q1D, DPA_Q1D)), + [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NE), [&](int e) { - DIFFUSION3DPA_0_GPU; + DIFFUSION3DPA_0_GPU; RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dz) { @@ -216,7 +187,7 @@ void DIFFUSION3DPA::runCudaVariant(VariantID vid) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dx) { - NEW_DIFFUSION3DPA_1; + DIFFUSION3DPA_1; } // lambda (dx) ); // RAJA::expt::loop @@ -227,14 +198,14 @@ void DIFFUSION3DPA::runCudaVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 0), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 1), [&](int dz) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dy) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), [&](int qx) { - NEW_DIFFUSION3DPA_2; + DIFFUSION3DPA_2; } // lambda (qx) ); // RAJA::expt::loop @@ -252,7 +223,7 @@ void DIFFUSION3DPA::runCudaVariant(VariantID vid) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), [&](int qx) { - NEW_DIFFUSION3DPA_3; + DIFFUSION3DPA_3; } // lambda (qx) ); // RAJA::expt::loop @@ -270,7 +241,7 @@ void DIFFUSION3DPA::runCudaVariant(VariantID vid) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), [&](int qx) { - NEW_DIFFUSION3DPA_4; + DIFFUSION3DPA_4; } // lambda (qx) ); // RAJA::expt::loop @@ -288,7 +259,7 @@ void DIFFUSION3DPA::runCudaVariant(VariantID vid) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), [&](int qx) { - NEW_DIFFUSION3DPA_5; + DIFFUSION3DPA_5; } // lambda (qx) ); // RAJA::expt::loop @@ -299,14 +270,14 @@ void DIFFUSION3DPA::runCudaVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 0), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 1), [&](int dz) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int d) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), [&](int q) { - NEW_DIFFUSION3DPA_6; + DIFFUSION3DPA_6; } // lambda (q) ); // RAJA::expt::loop @@ -314,7 +285,7 @@ void DIFFUSION3DPA::runCudaVariant(VariantID vid) { ); //RAJA::expt::loop } // lambda (dz) ); //RAJA::expt::loop - + ctx.teamSync(); RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), @@ -324,7 +295,7 @@ void DIFFUSION3DPA::runCudaVariant(VariantID vid) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dx) { - NEW_DIFFUSION3DPA_7; + DIFFUSION3DPA_7; } // lambda (dx) ); // RAJA::expt::loop @@ -342,7 +313,7 @@ void DIFFUSION3DPA::runCudaVariant(VariantID vid) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dx) { - NEW_DIFFUSION3DPA_8; + DIFFUSION3DPA_8; } // lambda (dx) ); // RAJA::expt::loop @@ -360,7 +331,7 @@ void DIFFUSION3DPA::runCudaVariant(VariantID vid) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dx) { - NEW_DIFFUSION3DPA_9; + DIFFUSION3DPA_9; } // lambda (dx) ); // RAJA::expt::loop @@ -372,10 +343,10 @@ void DIFFUSION3DPA::runCudaVariant(VariantID vid) { } // lambda (e) ); // RAJA::expt::loop - } // outer lambda (ctx) - ); // RAJA::expt::launch + } // outer lambda (ctx) + ); // RAJA::expt::launch - } // loop over kernel reps + } // loop over kernel reps stopTimer(); 
DIFFUSION3DPA_DATA_TEARDOWN_CUDA; @@ -385,7 +356,8 @@ void DIFFUSION3DPA::runCudaVariant(VariantID vid) { default: { - getCout() << "\n DIFFUSION3DPA : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n DIFFUSION3DPA : Unknown Cuda variant id = " << vid + << std::endl; break; } } diff --git a/src/apps/DIFFUSION3DPA-Hip.cpp b/src/apps/DIFFUSION3DPA-Hip.cpp index d3919fa11..ab6e2c734 100644 --- a/src/apps/DIFFUSION3DPA-Hip.cpp +++ b/src/apps/DIFFUSION3DPA-Hip.cpp @@ -22,126 +22,100 @@ namespace rajaperf { namespace apps { -#define DIFFUSION3DPA_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(Basis, m_B, DPA_Q1D *DPA_D1D); \ - allocAndInitHipDeviceData(dBasis, m_G, DPA_Q1D *DPA_D1D); \ - allocAndInitHipDeviceData(D, m_D, DPA_Q1D *DPA_Q1D *DPA_Q1D *SYM *m_NE); \ - allocAndInitHipDeviceData(X, m_X, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); \ +#define DIFFUSION3DPA_DATA_SETUP_HIP \ + allocAndInitHipDeviceData(Basis, m_B, DPA_Q1D *DPA_D1D); \ + allocAndInitHipDeviceData(dBasis, m_G, DPA_Q1D *DPA_D1D); \ + allocAndInitHipDeviceData(D, m_D, DPA_Q1D *DPA_Q1D *DPA_Q1D *SYM *m_NE); \ + allocAndInitHipDeviceData(X, m_X, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); \ allocAndInitHipDeviceData(Y, m_Y, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); -#define DIFFUSION3DPA_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_Y, Y, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); \ - deallocHipDeviceData(Basis); \ - deallocHipDeviceData(dBasis); \ - deallocHipDeviceData(D); \ - deallocHipDeviceData(X); \ +#define DIFFUSION3DPA_DATA_TEARDOWN_HIP \ + getHipDeviceData(m_Y, Y, DPA_D1D *DPA_D1D *DPA_D1D *m_NE); \ + deallocHipDeviceData(Basis); \ + deallocHipDeviceData(dBasis); \ + deallocHipDeviceData(D); \ + deallocHipDeviceData(X); \ deallocHipDeviceData(Y); -__global__ void Diffusion3DPA(Index_type NE, const Real_ptr Basis, const Real_ptr dBasis, - const Real_ptr D, const Real_ptr X, Real_ptr Y, bool symmetric) { +__global__ void Diffusion3DPA(Index_type NE, const Real_ptr Basis, + const Real_ptr dBasis, const Real_ptr D, + const Real_ptr X, Real_ptr Y, bool symmetric) { const int e = hipBlockIdx_x; DIFFUSION3DPA_0_GPU; - GPU_FOREACH_THREAD(dz,z,DPA_D1D) - { - GPU_FOREACH_THREAD(dy,y,DPA_D1D) - { - GPU_FOREACH_THREAD(dx,x,DPA_D1D) - { - NEW_DIFFUSION3DPA_1; - } + GPU_FOREACH_THREAD(dz, z, DPA_D1D) { + GPU_FOREACH_THREAD(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD(dx, x, DPA_D1D) { + DIFFUSION3DPA_1; } + } } - if (threadIdx.z == 0) - { - GPU_FOREACH_THREAD(dy,y,DPA_D1D) - { - GPU_FOREACH_THREAD(qx,x,DPA_Q1D) - { - NEW_DIFFUSION3DPA_2; - } + if (threadIdx.z == 0) { + GPU_FOREACH_THREAD(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { + DIFFUSION3DPA_2; } + } } __syncthreads(); - GPU_FOREACH_THREAD(dz,z,DPA_D1D) - { - GPU_FOREACH_THREAD(dy,y,DPA_D1D) - { - GPU_FOREACH_THREAD(qx,x,DPA_Q1D) - { - NEW_DIFFUSION3DPA_3; - } + GPU_FOREACH_THREAD(dz, z, DPA_D1D) { + GPU_FOREACH_THREAD(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { + DIFFUSION3DPA_3; } + } } __syncthreads(); - GPU_FOREACH_THREAD(dz,z,DPA_D1D) - { - GPU_FOREACH_THREAD(qy,y,DPA_Q1D) - { - GPU_FOREACH_THREAD(qx,x,DPA_Q1D) - { - NEW_DIFFUSION3DPA_4; + GPU_FOREACH_THREAD(dz, z, DPA_D1D) { + GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { + GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { + DIFFUSION3DPA_4; } } } __syncthreads(); - GPU_FOREACH_THREAD(qz,z,DPA_Q1D) - { - GPU_FOREACH_THREAD(qy,y,DPA_Q1D) - { - GPU_FOREACH_THREAD(qx,x,DPA_Q1D) - { - NEW_DIFFUSION3DPA_5; - } + GPU_FOREACH_THREAD(qz, z, DPA_Q1D) { + GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { + GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { + DIFFUSION3DPA_5; + } } } 
__syncthreads(); - if (threadIdx.z == 0) - { - GPU_FOREACH_THREAD(d,y,DPA_D1D) - { - GPU_FOREACH_THREAD(q,x,DPA_Q1D) - { - NEW_DIFFUSION3DPA_6; - } + if (threadIdx.z == 0) { + GPU_FOREACH_THREAD(d, y, DPA_D1D) { + GPU_FOREACH_THREAD(q, x, DPA_Q1D) { + DIFFUSION3DPA_6; } + } } __syncthreads(); - GPU_FOREACH_THREAD(qz,z,DPA_Q1D) - { - GPU_FOREACH_THREAD(qy,y,DPA_Q1D) - { - GPU_FOREACH_THREAD(dx,x,DPA_D1D) - { - NEW_DIFFUSION3DPA_7; - } + GPU_FOREACH_THREAD(qz, z, DPA_Q1D) { + GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { + GPU_FOREACH_THREAD(dx, x, DPA_D1D) { + DIFFUSION3DPA_7; } + } } __syncthreads(); - GPU_FOREACH_THREAD(qz,z,DPA_Q1D) - { - GPU_FOREACH_THREAD(dy,y,DPA_D1D) - { - GPU_FOREACH_THREAD(dx,x,DPA_D1D) - { - NEW_DIFFUSION3DPA_8; - } - } + GPU_FOREACH_THREAD(qz, z, DPA_Q1D) { + GPU_FOREACH_THREAD(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD(dx, x, DPA_D1D) { + DIFFUSION3DPA_8; } - __syncthreads(); - GPU_FOREACH_THREAD(dz,z,DPA_D1D) - { - GPU_FOREACH_THREAD(dy,y,DPA_D1D) - { - GPU_FOREACH_THREAD(dx,x,DPA_D1D) - { - NEW_DIFFUSION3DPA_9; - } - } + } + } + __syncthreads(); + GPU_FOREACH_THREAD(dz, z, DPA_D1D) { + GPU_FOREACH_THREAD(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD(dx, x, DPA_D1D) { + DIFFUSION3DPA_9; } + } + } } void DIFFUSION3DPA::runHipVariant(VariantID vid) { @@ -161,11 +135,10 @@ void DIFFUSION3DPA::runHipVariant(VariantID vid) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipLaunchKernelGGL((Diffusion3DPA), dim3(grid_size), dim3(block_size), 0, 0, - NE, Basis, dBasis, D, X, Y, symmetric); - - hipErrchk( hipGetLastError() ); + hipLaunchKernelGGL((Diffusion3DPA), dim3(grid_size), dim3(block_size), 0, + 0, NE, Basis, dBasis, D, X, Y, symmetric); + hipErrchk(hipGetLastError()); } stopTimer(); @@ -178,34 +151,30 @@ void DIFFUSION3DPA::runHipVariant(VariantID vid) { DIFFUSION3DPA_DATA_SETUP_HIP; - using launch_policy = RAJA::expt::LaunchPolicy - >; + using launch_policy = + RAJA::expt::LaunchPolicy>; - using outer_x = RAJA::expt::LoopPolicy; + using outer_x = + RAJA::expt::LoopPolicy; - using inner_x = RAJA::expt::LoopPolicy; + using inner_x = + RAJA::expt::LoopPolicy; - using inner_y = RAJA::expt::LoopPolicy; + using inner_y = + RAJA::expt::LoopPolicy; - using inner_y = RAJA::expt::LoopPolicy; + using inner_z = + RAJA::expt::LoopPolicy; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { RAJA::expt::launch( - RAJA::expt::DEVICE, - RAJA::expt::Grid(RAJA::expt::Teams(NE), - RAJA::expt::Threads(DPA_Q1D, DPA_Q1D, DPA_Q1D)), - [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + RAJA::expt::DEVICE, + RAJA::expt::Grid(RAJA::expt::Teams(NE), + RAJA::expt::Threads(DPA_Q1D, DPA_Q1D, DPA_Q1D)), + [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NE), [&](int e) { @@ -219,7 +188,7 @@ void DIFFUSION3DPA::runHipVariant(VariantID vid) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dx) { - NEW_DIFFUSION3DPA_1; + DIFFUSION3DPA_1; } // lambda (dx) ); // RAJA::expt::loop @@ -230,14 +199,14 @@ void DIFFUSION3DPA::runHipVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 0), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 1), [&](int dz) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dy) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), [&](int qx) { - NEW_DIFFUSION3DPA_2; + DIFFUSION3DPA_2; } // lambda (qx) ); // RAJA::expt::loop @@ -255,7 +224,7 @@ void DIFFUSION3DPA::runHipVariant(VariantID vid) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 
DPA_Q1D), [&](int qx) { - NEW_DIFFUSION3DPA_3; + DIFFUSION3DPA_3; } // lambda (qx) ); // RAJA::expt::loop @@ -273,7 +242,7 @@ void DIFFUSION3DPA::runHipVariant(VariantID vid) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), [&](int qx) { - NEW_DIFFUSION3DPA_4; + DIFFUSION3DPA_4; } // lambda (qx) ); // RAJA::expt::loop @@ -291,7 +260,7 @@ void DIFFUSION3DPA::runHipVariant(VariantID vid) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), [&](int qx) { - NEW_DIFFUSION3DPA_5; + DIFFUSION3DPA_5; } // lambda (qx) ); // RAJA::expt::loop @@ -302,14 +271,14 @@ void DIFFUSION3DPA::runHipVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 0), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 1), [&](int dz) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int d) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), [&](int q) { - NEW_DIFFUSION3DPA_6; + DIFFUSION3DPA_6; } // lambda (q) ); // RAJA::expt::loop @@ -317,7 +286,7 @@ void DIFFUSION3DPA::runHipVariant(VariantID vid) { ); //RAJA::expt::loop } // lambda (dz) ); //RAJA::expt::loop - + ctx.teamSync(); RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), @@ -327,7 +296,7 @@ void DIFFUSION3DPA::runHipVariant(VariantID vid) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dx) { - NEW_DIFFUSION3DPA_7; + DIFFUSION3DPA_7; } // lambda (dx) ); // RAJA::expt::loop @@ -345,7 +314,7 @@ void DIFFUSION3DPA::runHipVariant(VariantID vid) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dx) { - NEW_DIFFUSION3DPA_8; + DIFFUSION3DPA_8; } // lambda (dx) ); // RAJA::expt::loop @@ -363,7 +332,7 @@ void DIFFUSION3DPA::runHipVariant(VariantID vid) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dx) { - NEW_DIFFUSION3DPA_9; + DIFFUSION3DPA_9; } // lambda (dx) ); // RAJA::expt::loop @@ -375,10 +344,10 @@ void DIFFUSION3DPA::runHipVariant(VariantID vid) { } // lambda (e) ); // RAJA::expt::loop - } // outer lambda (ctx) - ); // RAJA::expt::launch + } // outer lambda (ctx) + ); // RAJA::expt::launch - } // loop over kernel reps + } // loop over kernel reps stopTimer(); DIFFUSION3DPA_DATA_TEARDOWN_HIP; @@ -388,7 +357,8 @@ void DIFFUSION3DPA::runHipVariant(VariantID vid) { default: { - getCout() << "\n DIFFUSION3DPA : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n DIFFUSION3DPA : Unknown Hip variant id = " << vid + << std::endl; break; } } diff --git a/src/apps/DIFFUSION3DPA-OMP.cpp b/src/apps/DIFFUSION3DPA-OMP.cpp index b0398bf67..e4195e9f6 100644 --- a/src/apps/DIFFUSION3DPA-OMP.cpp +++ b/src/apps/DIFFUSION3DPA-OMP.cpp @@ -18,8 +18,6 @@ namespace rajaperf { namespace apps { -#define MFEM_SYNC_THREAD - void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -39,103 +37,73 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { DIFFUSION3DPA_0_CPU; - CPU_FOREACH(dz,z,DPA_D1D) - { - CPU_FOREACH(dy,y,DPA_D1D) - { - CPU_FOREACH(dx,x,DPA_D1D) - { - NEW_DIFFUSION3DPA_1; + CPU_FOREACH(dz, z, DPA_D1D) { + CPU_FOREACH(dy, y, DPA_D1D) { + CPU_FOREACH(dx, x, DPA_D1D) { + DIFFUSION3DPA_1; } } } - //if (MFEM_THREAD_ID(z) == 0) - //{ - CPU_FOREACH(dy,y,DPA_D1D) - { - CPU_FOREACH(qx,x,DPA_Q1D) - { - NEW_DIFFUSION3DPA_2; - } - } - //} - MFEM_SYNC_THREAD; - CPU_FOREACH(dz,z,DPA_D1D) - { - CPU_FOREACH(dy,y,DPA_D1D) - { - CPU_FOREACH(qx,x,DPA_Q1D) - { - NEW_DIFFUSION3DPA_3; - } - } - } - MFEM_SYNC_THREAD; - CPU_FOREACH(dz,z,DPA_D1D) - { - CPU_FOREACH(qy,y,DPA_Q1D) - { - CPU_FOREACH(qx,x,DPA_Q1D) - { - 
NEW_DIFFUSION3DPA_4; + + CPU_FOREACH(dy, y, DPA_D1D) { + CPU_FOREACH(qx, x, DPA_Q1D) { + DIFFUSION3DPA_2; + } + } + + CPU_FOREACH(dz, z, DPA_D1D) { + CPU_FOREACH(dy, y, DPA_D1D) { + CPU_FOREACH(qx, x, DPA_Q1D) { + DIFFUSION3DPA_3; } - } - } - MFEM_SYNC_THREAD; - CPU_FOREACH(qz,z,DPA_Q1D) - { - CPU_FOREACH(qy,y,DPA_Q1D) - { - CPU_FOREACH(qx,x,DPA_Q1D) - { - NEW_DIFFUSION3DPA_5; + } + } + + CPU_FOREACH(dz, z, DPA_D1D) { + CPU_FOREACH(qy, y, DPA_Q1D) { + CPU_FOREACH(qx, x, DPA_Q1D) { + DIFFUSION3DPA_4; } - } - } - MFEM_SYNC_THREAD; - //if (MFEM_THREAD_ID(z) == 0) - //{ - CPU_FOREACH(d,y,DPA_D1D) - { - CPU_FOREACH(q,x,DPA_Q1D) - { - NEW_DIFFUSION3DPA_6; + } + } + + CPU_FOREACH(qz, z, DPA_Q1D) { + CPU_FOREACH(qy, y, DPA_Q1D) { + CPU_FOREACH(qx, x, DPA_Q1D) { + DIFFUSION3DPA_5; } - } - //} - MFEM_SYNC_THREAD; - CPU_FOREACH(qz,z,DPA_Q1D) - { - CPU_FOREACH(qy,y,DPA_Q1D) - { - CPU_FOREACH(dx,x,DPA_D1D) - { - NEW_DIFFUSION3DPA_7; + } + } + + CPU_FOREACH(d, y, DPA_D1D) { + CPU_FOREACH(q, x, DPA_Q1D) { + DIFFUSION3DPA_6; + } + } + + CPU_FOREACH(qz, z, DPA_Q1D) { + CPU_FOREACH(qy, y, DPA_Q1D) { + CPU_FOREACH(dx, x, DPA_D1D) { + DIFFUSION3DPA_7; } - } - } - MFEM_SYNC_THREAD; - CPU_FOREACH(qz,z,DPA_Q1D) - { - CPU_FOREACH(dy,y,DPA_D1D) - { - CPU_FOREACH(dx,x,DPA_D1D) - { - NEW_DIFFUSION3DPA_8; + } + } + + CPU_FOREACH(qz, z, DPA_Q1D) { + CPU_FOREACH(dy, y, DPA_D1D) { + CPU_FOREACH(dx, x, DPA_D1D) { + DIFFUSION3DPA_8; } - } - } - MFEM_SYNC_THREAD; - CPU_FOREACH(dz,z,DPA_D1D) - { - CPU_FOREACH(dy,y,DPA_D1D) - { - CPU_FOREACH(dx,x,DPA_D1D) - { - NEW_DIFFUSION3DPA_9; + } + } + + CPU_FOREACH(dz, z, DPA_D1D) { + CPU_FOREACH(dy, y, DPA_D1D) { + CPU_FOREACH(dx, x, DPA_D1D) { + DIFFUSION3DPA_9; } - } - } + } + } } // element loop } @@ -146,58 +114,63 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { case RAJA_OpenMP: { - //Currently Teams requires two policies if compiled with a device + // Currently Teams requires two policies if compiled with a device using launch_policy = RAJA::expt::LaunchPolicy; using outer_x = RAJA::expt::LoopPolicy; using inner_x = RAJA::expt::LoopPolicy; + >; using inner_y = RAJA::expt::LoopPolicy; + >; using inner_z = RAJA::expt::LoopPolicy; + >; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - //Grid is empty as the host does not need a compute grid to be specified + // Grid is empty as the host does not need a compute grid to be specified RAJA::expt::launch( - RAJA::expt::HOST, RAJA::expt::Grid(), - [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + RAJA::expt::HOST, RAJA::expt::Grid(), + [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NE), [&](int e) { DIFFUSION3DPA_0_CPU; - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dz) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dy) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dx) { - NEW_DIFFUSION3DPA_1; + DIFFUSION3DPA_1; } // lambda (dx) ); // RAJA::expt::loop @@ -208,14 +181,14 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 0), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 1), [&](int dz) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dy) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), [&](int qx) { - NEW_DIFFUSION3DPA_2; + DIFFUSION3DPA_2; } // lambda (qx) ); // RAJA::expt::loop @@ -233,7 +206,7 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { 
RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), [&](int qx) { - NEW_DIFFUSION3DPA_3; + DIFFUSION3DPA_3; } // lambda (qx) ); // RAJA::expt::loop @@ -251,7 +224,7 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), [&](int qx) { - NEW_DIFFUSION3DPA_4; + DIFFUSION3DPA_4; } // lambda (qx) ); // RAJA::expt::loop @@ -269,7 +242,7 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), [&](int qx) { - NEW_DIFFUSION3DPA_5; + DIFFUSION3DPA_5; } // lambda (qx) ); // RAJA::expt::loop @@ -280,14 +253,14 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 0), + RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 1), [&](int dz) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int d) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), [&](int q) { - NEW_DIFFUSION3DPA_6; + DIFFUSION3DPA_6; } // lambda (q) ); // RAJA::expt::loop @@ -295,7 +268,7 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { ); //RAJA::expt::loop } // lambda (dz) ); //RAJA::expt::loop - + ctx.teamSync(); RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), @@ -305,7 +278,7 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dx) { - NEW_DIFFUSION3DPA_7; + DIFFUSION3DPA_7; } // lambda (dx) ); // RAJA::expt::loop @@ -323,7 +296,7 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dx) { - NEW_DIFFUSION3DPA_8; + DIFFUSION3DPA_8; } // lambda (dx) ); // RAJA::expt::loop @@ -341,7 +314,7 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dx) { - NEW_DIFFUSION3DPA_9; + DIFFUSION3DPA_9; } // lambda (dx) ); // RAJA::expt::loop @@ -353,8 +326,8 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { } // lambda (e) ); // RAJA::expt::loop - } // outer lambda (ctx) - ); // RAJA::expt::launch + } // outer lambda (ctx) + ); // RAJA::expt::launch } // loop over kernel reps stopTimer(); diff --git a/src/apps/DIFFUSION3DPA-OMPTarget.cpp b/src/apps/DIFFUSION3DPA-OMPTarget.cpp index 862699345..8d3368002 100644 --- a/src/apps/DIFFUSION3DPA-OMPTarget.cpp +++ b/src/apps/DIFFUSION3DPA-OMPTarget.cpp @@ -19,7 +19,6 @@ namespace rajaperf { namespace apps { - void DIFFUSION3DPA::runOpenMPTargetVariant(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -27,7 +26,8 @@ void DIFFUSION3DPA::runOpenMPTargetVariant(VariantID vid) { default: { - getCout() << "\n DIFFUSION3DPA : Unknown OpenMPTarget variant id = " << vid << std::endl; + getCout() << "\n DIFFUSION3DPA : Unknown OpenMPTarget variant id = " << vid + << std::endl; break; } } diff --git a/src/apps/DIFFUSION3DPA-Seq.cpp b/src/apps/DIFFUSION3DPA-Seq.cpp index 2f5ecfb10..21a7678ca 100644 --- a/src/apps/DIFFUSION3DPA-Seq.cpp +++ b/src/apps/DIFFUSION3DPA-Seq.cpp @@ -18,8 +18,6 @@ namespace rajaperf { namespace apps { -#define MFEM_SYNC_THREAD - void DIFFUSION3DPA::runSeqVariant(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -36,103 +34,73 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { DIFFUSION3DPA_0_CPU; - CPU_FOREACH(dz,z,DPA_D1D) - { - CPU_FOREACH(dy,y,DPA_D1D) - { - CPU_FOREACH(dx,x,DPA_D1D) - { - NEW_DIFFUSION3DPA_1; + CPU_FOREACH(dz, z, DPA_D1D) { + CPU_FOREACH(dy, y, DPA_D1D) { + CPU_FOREACH(dx, x, DPA_D1D) { + DIFFUSION3DPA_1; } } } - //if (MFEM_THREAD_ID(z) == 0) - //{ - 
CPU_FOREACH(dy,y,DPA_D1D) - { - CPU_FOREACH(qx,x,DPA_Q1D) - { - NEW_DIFFUSION3DPA_2; - } - } - //} - MFEM_SYNC_THREAD; - CPU_FOREACH(dz,z,DPA_D1D) - { - CPU_FOREACH(dy,y,DPA_D1D) - { - CPU_FOREACH(qx,x,DPA_Q1D) - { - NEW_DIFFUSION3DPA_3; - } - } - } - MFEM_SYNC_THREAD; - CPU_FOREACH(dz,z,DPA_D1D) - { - CPU_FOREACH(qy,y,DPA_Q1D) - { - CPU_FOREACH(qx,x,DPA_Q1D) - { - NEW_DIFFUSION3DPA_4; + + CPU_FOREACH(dy, y, DPA_D1D) { + CPU_FOREACH(qx, x, DPA_Q1D) { + DIFFUSION3DPA_2; + } + } + + CPU_FOREACH(dz, z, DPA_D1D) { + CPU_FOREACH(dy, y, DPA_D1D) { + CPU_FOREACH(qx, x, DPA_Q1D) { + DIFFUSION3DPA_3; } - } - } - MFEM_SYNC_THREAD; - CPU_FOREACH(qz,z,DPA_Q1D) - { - CPU_FOREACH(qy,y,DPA_Q1D) - { - CPU_FOREACH(qx,x,DPA_Q1D) - { - NEW_DIFFUSION3DPA_5; + } + } + + CPU_FOREACH(dz, z, DPA_D1D) { + CPU_FOREACH(qy, y, DPA_Q1D) { + CPU_FOREACH(qx, x, DPA_Q1D) { + DIFFUSION3DPA_4; } - } - } - MFEM_SYNC_THREAD; - //if (MFEM_THREAD_ID(z) == 0) - //{ - CPU_FOREACH(d,y,DPA_D1D) - { - CPU_FOREACH(q,x,DPA_Q1D) - { - NEW_DIFFUSION3DPA_6; + } + } + + CPU_FOREACH(qz, z, DPA_Q1D) { + CPU_FOREACH(qy, y, DPA_Q1D) { + CPU_FOREACH(qx, x, DPA_Q1D) { + DIFFUSION3DPA_5; } - } - //} - MFEM_SYNC_THREAD; - CPU_FOREACH(qz,z,DPA_Q1D) - { - CPU_FOREACH(qy,y,DPA_Q1D) - { - CPU_FOREACH(dx,x,DPA_D1D) - { - NEW_DIFFUSION3DPA_7; + } + } + + CPU_FOREACH(d, y, DPA_D1D) { + CPU_FOREACH(q, x, DPA_Q1D) { + DIFFUSION3DPA_6; + } + } + + CPU_FOREACH(qz, z, DPA_Q1D) { + CPU_FOREACH(qy, y, DPA_Q1D) { + CPU_FOREACH(dx, x, DPA_D1D) { + DIFFUSION3DPA_7; } - } - } - MFEM_SYNC_THREAD; - CPU_FOREACH(qz,z,DPA_Q1D) - { - CPU_FOREACH(dy,y,DPA_D1D) - { - CPU_FOREACH(dx,x,DPA_D1D) - { - NEW_DIFFUSION3DPA_8; + } + } + + CPU_FOREACH(qz, z, DPA_Q1D) { + CPU_FOREACH(dy, y, DPA_D1D) { + CPU_FOREACH(dx, x, DPA_D1D) { + DIFFUSION3DPA_8; } - } - } - MFEM_SYNC_THREAD; - CPU_FOREACH(dz,z,DPA_D1D) - { - CPU_FOREACH(dy,y,DPA_D1D) - { - CPU_FOREACH(dx,x,DPA_D1D) - { - NEW_DIFFUSION3DPA_9; + } + } + + CPU_FOREACH(dz, z, DPA_D1D) { + CPU_FOREACH(dy, y, DPA_D1D) { + CPU_FOREACH(dx, x, DPA_D1D) { + DIFFUSION3DPA_9; } - } - } + } + } } // element loop } @@ -183,10 +151,10 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - //Grid is empty as the host does not need a compute grid to be specified + // Grid is empty as the host does not need a compute grid to be specified RAJA::expt::launch( - RAJA::expt::HOST, RAJA::expt::Grid(), - [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + RAJA::expt::HOST, RAJA::expt::Grid(), + [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NE), [&](int e) { @@ -200,7 +168,7 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dx) { - NEW_DIFFUSION3DPA_1; + DIFFUSION3DPA_1; } // lambda (dx) ); // RAJA::expt::loop @@ -218,7 +186,7 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), [&](int qx) { - NEW_DIFFUSION3DPA_2; + DIFFUSION3DPA_2; } // lambda (qx) ); // RAJA::expt::loop @@ -236,7 +204,7 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), [&](int qx) { - NEW_DIFFUSION3DPA_3; + DIFFUSION3DPA_3; } // lambda (qx) ); // RAJA::expt::loop @@ -254,7 +222,7 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), [&](int qx) { - NEW_DIFFUSION3DPA_4; + DIFFUSION3DPA_4; } // lambda (qx) ); // 
RAJA::expt::loop @@ -272,7 +240,7 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), [&](int qx) { - NEW_DIFFUSION3DPA_5; + DIFFUSION3DPA_5; } // lambda (qx) ); // RAJA::expt::loop @@ -290,7 +258,7 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), [&](int q) { - NEW_DIFFUSION3DPA_6; + DIFFUSION3DPA_6; } // lambda (q) ); // RAJA::expt::loop @@ -298,7 +266,7 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { ); //RAJA::expt::loop } // lambda (dz) ); //RAJA::expt::loop - + ctx.teamSync(); RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), @@ -308,7 +276,7 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dx) { - NEW_DIFFUSION3DPA_7; + DIFFUSION3DPA_7; } // lambda (dx) ); // RAJA::expt::loop @@ -326,7 +294,7 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dx) { - NEW_DIFFUSION3DPA_8; + DIFFUSION3DPA_8; } // lambda (dx) ); // RAJA::expt::loop @@ -344,7 +312,7 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dx) { - NEW_DIFFUSION3DPA_9; + DIFFUSION3DPA_9; } // lambda (dx) ); // RAJA::expt::loop @@ -356,9 +324,9 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { } // lambda (e) ); // RAJA::expt::loop - } // outer lambda (ctx) - ); // RAJA::expt::launch - } // loop over kernel reps + } // outer lambda (ctx) + ); // RAJA::expt::launch + } // loop over kernel reps stopTimer(); return; diff --git a/src/apps/DIFFUSION3DPA.hpp b/src/apps/DIFFUSION3DPA.hpp index 85b592f5f..5c927f2f9 100644 --- a/src/apps/DIFFUSION3DPA.hpp +++ b/src/apps/DIFFUSION3DPA.hpp @@ -41,113 +41,81 @@ /// double (*QDD1)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+1); /// double (*QDD2)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2); /// -/// for(int dy=0; dy Date: Wed, 15 Dec 2021 15:46:56 -0800 Subject: [PATCH 175/392] remove old definitions of kernels --- src/apps/DIFFUSION3DPA.hpp | 203 ------------------------------------- 1 file changed, 203 deletions(-) diff --git a/src/apps/DIFFUSION3DPA.hpp b/src/apps/DIFFUSION3DPA.hpp index 5c927f2f9..f251ee16e 100644 --- a/src/apps/DIFFUSION3DPA.hpp +++ b/src/apps/DIFFUSION3DPA.hpp @@ -322,24 +322,9 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) double (*QDD2)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2); #define DIFFUSION3DPA_1 \ - RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < DPA_D1D; ++dz) \ - { \ - s_X[dz][dy][dx] = dpaX_(dx,dy,dz,e); \ - } - -#define NEW_DIFFUSION3DPA_1 \ s_X[dz][dy][dx] = dpaX_(dx,dy,dz,e); #define DIFFUSION3DPA_2 \ - const int i = qi(qx,dy,DPA_Q1D); \ - const int j = dj(qx,dy,DPA_D1D); \ - const int k = qk(qx,dy,DPA_Q1D); \ - const int l = dl(qx,dy,DPA_D1D); \ - B[i][j] = b(qx,dy); \ - G[k][l] = g(qx,dy) * sign(qx,dy); - -#define NEW_DIFFUSION3DPA_2 \ const int i = qi(qx,dy,DPA_Q1D); \ const int j = dj(qx,dy,DPA_D1D); \ const int k = qk(qx,dy,DPA_Q1D); \ @@ -348,33 +333,6 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) G[k][l] = g(qx,dy) * sign(qx,dy); \ #define DIFFUSION3DPA_3 \ - double u[DPA_D1D], v[DPA_D1D]; \ - RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < DPA_D1D; dz++) { u[dz] = v[dz] = 0.0; } \ - RAJA_UNROLL(MD1) \ - for (int dx = 0; dx < DPA_D1D; ++dx) \ - { \ - const int i = qi(qx,dx,DPA_Q1D); \ - const int j = dj(qx,dx,DPA_D1D); \ - const int k = qk(qx,dx,DPA_Q1D); \ - const int l = dl(qx,dx,DPA_D1D); 
\ - const double s = sign(qx,dx); \ - RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < DPA_D1D; ++dz) \ - { \ - const double coords = s_X[dz][dy][dx]; \ - u[dz] += coords * B[i][j]; \ - v[dz] += coords * G[k][l] * s; \ - } \ - } \ - RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < DPA_D1D; ++dz) \ - { \ - DDQ0[dz][dy][qx] = u[dz]; \ - DDQ1[dz][dy][qx] = v[dz]; \ - } - -#define NEW_DIFFUSION3DPA_3 \ double u = 0.0, v = 0.0; \ RAJA_UNROLL(MD1) \ for (int dx = 0; dx < DPA_D1D; ++dx) \ @@ -391,36 +349,7 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) DDQ0[dz][dy][qx] = u; \ DDQ1[dz][dy][qx] = v; - #define DIFFUSION3DPA_4 \ - double u[DPA_D1D], v[DPA_D1D], w[DPA_D1D]; \ - RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < DPA_D1D; dz++) { u[dz] = v[dz] = w[dz] = 0.0; } \ - RAJA_UNROLL(MD1) \ - for (int dy = 0; dy < DPA_D1D; ++dy) \ - { \ - const int i = qi(qy,dy,DPA_Q1D); \ - const int j = dj(qy,dy,DPA_D1D); \ - const int k = qk(qy,dy,DPA_Q1D); \ - const int l = dl(qy,dy,DPA_D1D); \ - const double s = sign(qy,dy); \ - RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < DPA_D1D; dz++) \ - { \ - u[dz] += DDQ1[dz][dy][qx] * B[i][j]; \ - v[dz] += DDQ0[dz][dy][qx] * G[k][l] * s; \ - w[dz] += DDQ0[dz][dy][qx] * B[i][j]; \ - } \ - } \ - RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < DPA_D1D; dz++) \ - { \ - DQQ0[dz][qy][qx] = u[dz]; \ - DQQ1[dz][qy][qx] = v[dz]; \ - DQQ2[dz][qy][qx] = w[dz]; \ - } - -#define NEW_DIFFUSION3DPA_4 \ double u = 0.0, v = 0.0, w = 0.0; \ RAJA_UNROLL(MD1) \ for (int dy = 0; dy < DPA_D1D; ++dy) \ @@ -439,46 +368,6 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) DQQ2[dz][qy][qx] = w; #define DIFFUSION3DPA_5 \ - double u[DPA_Q1D], v[DPA_Q1D], w[DPA_Q1D]; \ - RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < DPA_Q1D; qz++) { u[qz] = v[qz] = w[qz] = 0.0; } \ - RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < DPA_D1D; ++dz) \ - { \ - RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < DPA_Q1D; qz++) \ - { \ - const int i = qi(qz,dz,DPA_Q1D); \ - const int j = dj(qz,dz,DPA_D1D); \ - const int k = qk(qz,dz,DPA_Q1D); \ - const int l = dl(qz,dz,DPA_D1D); \ - const double s = sign(qz,dz); \ - u[qz] += DQQ0[dz][qy][qx] * B[i][j]; \ - v[qz] += DQQ1[dz][qy][qx] * B[i][j]; \ - w[qz] += DQQ2[dz][qy][qx] * G[k][l] * s; \ - } \ - } \ - RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < DPA_Q1D; qz++) \ - { \ - const double O11 = d(qx,qy,qz,0,e); \ - const double O12 = d(qx,qy,qz,1,e); \ - const double O13 = d(qx,qy,qz,2,e); \ - const double O21 = symmetric ? O12 : d(qx,qy,qz,3,e); \ - const double O22 = symmetric ? d(qx,qy,qz,3,e) : d(qx,qy,qz,4,e); \ - const double O23 = symmetric ? d(qx,qy,qz,4,e) : d(qx,qy,qz,5,e); \ - const double O31 = symmetric ? O13 : d(qx,qy,qz,6,e); \ - const double O32 = symmetric ? O23 : d(qx,qy,qz,7,e); \ - const double O33 = symmetric ? 
d(qx,qy,qz,5,e) : d(qx,qy,qz,8,e); \ - const double gX = u[qz]; \ - const double gY = v[qz]; \ - const double gZ = w[qz]; \ - QQQ0[qz][qy][qx] = (O11*gX) + (O12*gY) + (O13*gZ); \ - QQQ1[qz][qy][qx] = (O21*gX) + (O22*gY) + (O23*gZ); \ - QQQ2[qz][qy][qx] = (O31*gX) + (O32*gY) + (O33*gZ); \ - } - -#define NEW_DIFFUSION3DPA_5 \ double u = 0.0, v = 0.0, w = 0.0; \ RAJA_UNROLL(MD1) \ for (int dz = 0; dz < DPA_D1D; ++dz) \ @@ -508,17 +397,7 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) QQQ1[qz][qy][qx] = (O21*gX) + (O22*gY) + (O23*gZ); \ QQQ2[qz][qy][qx] = (O31*gX) + (O32*gY) + (O33*gZ); - #define DIFFUSION3DPA_6 \ - const int i = qi(q,d,DPA_Q1D); \ - const int j = dj(q,d,DPA_D1D); \ - const int k = qk(q,d,DPA_Q1D); \ - const int l = dl(q,d,DPA_D1D); \ - Bt[j][i] = b(q,d); \ - Gt[l][k] = g(q,d) * sign(q,d); - - -#define NEW_DIFFUSION3DPA_6 \ const int i = qi(q,d,DPA_Q1D); \ const int j = dj(q,d,DPA_D1D); \ const int k = qk(q,d,DPA_Q1D); \ @@ -527,34 +406,6 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) Gt[l][k] = g(q,d) * sign(q,d); #define DIFFUSION3DPA_7 \ - double u[DPA_Q1D], v[DPA_Q1D], w[DPA_Q1D]; \ - RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < DPA_Q1D; ++qz) { u[qz] = v[qz] = w[qz] = 0.0; } \ - RAJA_UNROLL(MQ1) \ - for (int qx = 0; qx < DPA_Q1D; ++qx) \ - { \ - const int i = qi(qx,dx,DPA_Q1D); \ - const int j = dj(qx,dx,DPA_D1D); \ - const int k = qk(qx,dx,DPA_Q1D); \ - const int l = dl(qx,dx,DPA_D1D); \ - const double s = sign(qx,dx); \ - RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < DPA_Q1D; ++qz) \ - { \ - u[qz] += QQQ0[qz][qy][qx] * Gt[l][k] * s; \ - v[qz] += QQQ1[qz][qy][qx] * Bt[j][i]; \ - w[qz] += QQQ2[qz][qy][qx] * Bt[j][i]; \ - } \ - } \ - RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < DPA_Q1D; ++qz) \ - { \ - QQD0[qz][qy][dx] = u[qz]; \ - QQD1[qz][qy][dx] = v[qz]; \ - QQD2[qz][qy][dx] = w[qz]; \ - } - -#define NEW_DIFFUSION3DPA_7 \ double u = 0.0, v = 0.0, w = 0.0; \ RAJA_UNROLL(MQ1) \ for (int qx = 0; qx < DPA_Q1D; ++qx) \ @@ -573,34 +424,6 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) QQD2[qz][qy][dx] = w; #define DIFFUSION3DPA_8 \ - double u[DPA_Q1D], v[DPA_Q1D], w[DPA_Q1D]; \ - RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < DPA_Q1D; ++qz) { u[qz] = v[qz] = w[qz] = 0.0; } \ - RAJA_UNROLL(MQ1) \ - for (int qy = 0; qy < DPA_Q1D; ++qy) \ - { \ - const int i = qi(qy,dy,DPA_Q1D); \ - const int j = dj(qy,dy,DPA_D1D); \ - const int k = qk(qy,dy,DPA_Q1D); \ - const int l = dl(qy,dy,DPA_D1D); \ - const double s = sign(qy,dy); \ - RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < DPA_Q1D; ++qz) \ - { \ - u[qz] += QQD0[qz][qy][dx] * Bt[j][i]; \ - v[qz] += QQD1[qz][qy][dx] * Gt[l][k] * s; \ - w[qz] += QQD2[qz][qy][dx] * Bt[j][i]; \ - } \ - } \ - RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < DPA_Q1D; ++qz) \ - { \ - QDD0[qz][dy][dx] = u[qz]; \ - QDD1[qz][dy][dx] = v[qz]; \ - QDD2[qz][dy][dx] = w[qz]; \ - } - -#define NEW_DIFFUSION3DPA_8 \ double u = 0.0, v = 0.0, w = 0.0; \ RAJA_UNROLL(DPA_Q1D) \ for (int qy = 0; qy < DPA_Q1D; ++qy) \ @@ -619,32 +442,6 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) QDD2[qz][dy][dx] = w; #define DIFFUSION3DPA_9 \ - double u[DPA_D1D], v[DPA_D1D], w[DPA_D1D]; \ - RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < DPA_D1D; ++dz) { u[dz] = v[dz] = w[dz] = 0.0; } \ - RAJA_UNROLL(MQ1) \ - for (int qz = 0; qz < DPA_Q1D; ++qz) \ - { \ - RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < DPA_D1D; ++dz) \ - { \ - const int i = qi(qz,dz,DPA_Q1D); \ - const int j = dj(qz,dz,DPA_D1D); \ - const int k = 
qk(qz,dz,DPA_Q1D); \ - const int l = dl(qz,dz,DPA_D1D); \ - const double s = sign(qz,dz); \ - u[dz] += QDD0[qz][dy][dx] * Bt[j][i]; \ - v[dz] += QDD1[qz][dy][dx] * Bt[j][i]; \ - w[dz] += QDD2[qz][dy][dx] * Gt[l][k] * s;\ - } \ - } \ - RAJA_UNROLL(MD1) \ - for (int dz = 0; dz < DPA_D1D; ++dz) \ - { \ - dpaY_(dx,dy,dz,e) += (u[dz] + v[dz] + w[dz]); \ - } - -#define NEW_DIFFUSION3DPA_9 \ double u = 0.0, v = 0.0, w = 0.0; \ RAJA_UNROLL(MQ1) \ for (int qz = 0; qz < DPA_Q1D; ++qz) \ From ab7e39fc7b42b27aba679f3ab8efe1ba3c3d7e80 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 28 Jan 2022 17:00:31 -0800 Subject: [PATCH 176/392] Add DAXPY_ATOMIC kernel --- src/CMakeLists.txt | 3 + src/basic/CMakeLists.txt | 6 ++ src/basic/DAXPY_ATOMIC-Cuda.cpp | 118 ++++++++++++++++++++++++++ src/basic/DAXPY_ATOMIC-Hip.cpp | 121 +++++++++++++++++++++++++++ src/basic/DAXPY_ATOMIC-OMP.cpp | 99 ++++++++++++++++++++++ src/basic/DAXPY_ATOMIC-OMPTarget.cpp | 94 +++++++++++++++++++++ src/basic/DAXPY_ATOMIC-Seq.cpp | 93 ++++++++++++++++++++ src/basic/DAXPY_ATOMIC.cpp | 80 ++++++++++++++++++ src/basic/DAXPY_ATOMIC.hpp | 68 +++++++++++++++ src/common/RAJAPerfSuite.cpp | 6 ++ src/common/RAJAPerfSuite.hpp | 1 + 11 files changed, 689 insertions(+) create mode 100644 src/basic/DAXPY_ATOMIC-Cuda.cpp create mode 100644 src/basic/DAXPY_ATOMIC-Hip.cpp create mode 100644 src/basic/DAXPY_ATOMIC-OMP.cpp create mode 100644 src/basic/DAXPY_ATOMIC-OMPTarget.cpp create mode 100644 src/basic/DAXPY_ATOMIC-Seq.cpp create mode 100644 src/basic/DAXPY_ATOMIC.cpp create mode 100644 src/basic/DAXPY_ATOMIC.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7be063470..74b945bbb 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -73,6 +73,9 @@ blt_add_executable( basic/DAXPY.cpp basic/DAXPY-Seq.cpp basic/DAXPY-OMPTarget.cpp + basic/DAXPY_ATOMIC.cpp + basic/DAXPY_ATOMIC-Seq.cpp + basic/DAXPY_ATOMIC-OMPTarget.cpp basic/IF_QUAD.cpp basic/IF_QUAD-Seq.cpp basic/IF_QUAD-OMPTarget.cpp diff --git a/src/basic/CMakeLists.txt b/src/basic/CMakeLists.txt index 250529814..8aa475242 100644 --- a/src/basic/CMakeLists.txt +++ b/src/basic/CMakeLists.txt @@ -14,6 +14,12 @@ blt_add_library( DAXPY-Cuda.cpp DAXPY-OMP.cpp DAXPY-OMPTarget.cpp + DAXPY_ATOMIC.cpp + DAXPY_ATOMIC-Seq.cpp + DAXPY_ATOMIC-Hip.cpp + DAXPY_ATOMIC-Cuda.cpp + DAXPY_ATOMIC-OMP.cpp + DAXPY_ATOMIC-OMPTarget.cpp IF_QUAD.cpp IF_QUAD-Seq.cpp IF_QUAD-Hip.cpp diff --git a/src/basic/DAXPY_ATOMIC-Cuda.cpp b/src/basic/DAXPY_ATOMIC-Cuda.cpp new file mode 100644 index 000000000..11f230177 --- /dev/null +++ b/src/basic/DAXPY_ATOMIC-Cuda.cpp @@ -0,0 +1,118 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DAXPY_ATOMIC.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#define DAXPY_ATOMIC_DATA_SETUP_CUDA \ + allocAndInitCudaDeviceData(x, m_x, iend); \ + allocAndInitCudaDeviceData(y, m_y, iend); + +#define DAXPY_ATOMIC_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_y, y, iend); \ + deallocCudaDeviceData(x); \ + deallocCudaDeviceData(y); + +__global__ void daxpy_atomic(Real_ptr y, Real_ptr x, + Real_type a, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + DAXPY_ATOMIC_RAJA_BODY(RAJA::cuda_atomic); + } +} + +void DAXPY_ATOMIC::runCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + DAXPY_ATOMIC_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + DAXPY_ATOMIC_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + daxpy_atomic<<>>( y, x, a, + iend ); + cudaErrchk( cudaGetLastError() ); + + } + stopTimer(); + + DAXPY_ATOMIC_DATA_TEARDOWN_CUDA; + + } else if ( vid == Lambda_CUDA ) { + + DAXPY_ATOMIC_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + lambda_cuda_forall<<>>( + ibegin, iend, [=] __device__ (Index_type i) { + DAXPY_ATOMIC_RAJA_BODY(RAJA::cuda_atomic); + }); + cudaErrchk( cudaGetLastError() ); + + } + stopTimer(); + + DAXPY_ATOMIC_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + DAXPY_ATOMIC_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + DAXPY_ATOMIC_RAJA_BODY(RAJA::cuda_atomic); + }); + + } + stopTimer(); + + DAXPY_ATOMIC_DATA_TEARDOWN_CUDA; + + } else { + getCout() << "\n DAXPY_ATOMIC : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/basic/DAXPY_ATOMIC-Hip.cpp b/src/basic/DAXPY_ATOMIC-Hip.cpp new file mode 100644 index 000000000..5035c51d9 --- /dev/null +++ b/src/basic/DAXPY_ATOMIC-Hip.cpp @@ -0,0 +1,121 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DAXPY_ATOMIC.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define thread block size for HIP execution + // + const size_t block_size = 256; + + +#define DAXPY_ATOMIC_DATA_SETUP_HIP \ + allocAndInitHipDeviceData(x, m_x, iend); \ + allocAndInitHipDeviceData(y, m_y, iend); + +#define DAXPY_ATOMIC_DATA_TEARDOWN_HIP \ + getHipDeviceData(m_y, y, iend); \ + deallocHipDeviceData(x); \ + deallocHipDeviceData(y); + +__global__ void daxpy_atomic(Real_ptr y, Real_ptr x, + Real_type a, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + DAXPY_ATOMIC_RAJA_BODY(RAJA::hip_atomic); + } +} + + +void DAXPY_ATOMIC::runHipVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + DAXPY_ATOMIC_DATA_SETUP; + + if ( vid == Base_HIP ) { + + DAXPY_ATOMIC_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + hipLaunchKernelGGL((daxpy_atomic),dim3(grid_size), dim3(block_size), 0, 0, y, x, a, + iend ); + hipErrchk( hipGetLastError() ); + + } + stopTimer(); + + DAXPY_ATOMIC_DATA_TEARDOWN_HIP; + + } else if ( vid == Lambda_HIP ) { + + DAXPY_ATOMIC_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + auto daxpy_atomic_lambda = [=] __device__ (Index_type i) { + DAXPY_ATOMIC_RAJA_BODY(RAJA::hip_atomic); + }; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + hipLaunchKernelGGL(lambda_hip_forall, + grid_size, block_size, 0, 0, ibegin, iend, daxpy_atomic_lambda); + hipErrchk( hipGetLastError() ); + + } + stopTimer(); + + DAXPY_ATOMIC_DATA_TEARDOWN_HIP; + + } else if ( vid == RAJA_HIP ) { + + DAXPY_ATOMIC_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::hip_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + DAXPY_ATOMIC_RAJA_BODY(RAJA::hip_atomic); + }); + + } + stopTimer(); + + DAXPY_ATOMIC_DATA_TEARDOWN_HIP; + + } else { + getCout() << "\n DAXPY_ATOMIC : Unknown Hip variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/basic/DAXPY_ATOMIC-OMP.cpp b/src/basic/DAXPY_ATOMIC-OMP.cpp new file mode 100644 index 000000000..d12bd99b9 --- /dev/null +++ b/src/basic/DAXPY_ATOMIC-OMP.cpp @@ -0,0 +1,99 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DAXPY_ATOMIC.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + +void DAXPY_ATOMIC::runOpenMPVariant(VariantID vid) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + DAXPY_ATOMIC_DATA_SETUP; + + switch ( vid ) { + + case Base_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for + for (Index_type i = ibegin; i < iend; ++i ) { + #pragma omp atomic + y[i] += a * x[i] ; + } + + } + stopTimer(); + + break; + } + + case Lambda_OpenMP : { + + auto daxpy_atomic_lam = [=](Index_type i) { + #pragma omp atomic + y[i] += a * x[i] ; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for + for (Index_type i = ibegin; i < iend; ++i ) { + daxpy_atomic_lam(i); + } + + } + stopTimer(); + + break; + } + + case RAJA_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + DAXPY_ATOMIC_RAJA_BODY(RAJA::omp_atomic); + }); + + } + stopTimer(); + + break; + } + + default : { + getCout() << "\n DAXPY_ATOMIC : Unknown variant id = " << vid << std::endl; + } + + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/DAXPY_ATOMIC-OMPTarget.cpp b/src/basic/DAXPY_ATOMIC-OMPTarget.cpp new file mode 100644 index 000000000..e4beb5920 --- /dev/null +++ b/src/basic/DAXPY_ATOMIC-OMPTarget.cpp @@ -0,0 +1,94 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DAXPY_ATOMIC.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + +#define DAXPY_ATOMIC_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + allocAndInitOpenMPDeviceData(x, m_x, iend, did, hid); \ + allocAndInitOpenMPDeviceData(y, m_y, iend, did, hid); + +#define DAXPY_ATOMIC_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_y, y, iend, hid, did); \ + deallocOpenMPDeviceData(x, did); \ + deallocOpenMPDeviceData(y, did); + + +void DAXPY_ATOMIC::runOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + DAXPY_ATOMIC_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + DAXPY_ATOMIC_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(x, y) device( did ) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + #pragma omp atomic + y[i] += a * x[i] ; + } + + } + stopTimer(); + + DAXPY_ATOMIC_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + DAXPY_ATOMIC_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + DAXPY_ATOMIC_RAJA_BODY(RAJA::omp_atomic); + }); + + } + stopTimer(); + + DAXPY_ATOMIC_DATA_TEARDOWN_OMP_TARGET; + + } else { + getCout() << "\n DAXPY_ATOMIC : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic/DAXPY_ATOMIC-Seq.cpp b/src/basic/DAXPY_ATOMIC-Seq.cpp new file mode 100644 index 000000000..01f56305a --- /dev/null +++ b/src/basic/DAXPY_ATOMIC-Seq.cpp @@ -0,0 +1,93 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DAXPY_ATOMIC.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + +void DAXPY_ATOMIC::runSeqVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + DAXPY_ATOMIC_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + DAXPY_ATOMIC_BODY; + } + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + auto daxpy_atomic_lam = [=](Index_type i) { + DAXPY_ATOMIC_BODY; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + daxpy_atomic_lam(i); + } + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + DAXPY_ATOMIC_RAJA_BODY(RAJA::seq_atomic); + }); + + } + stopTimer(); + + break; + } +#endif + + default : { + getCout() << "\n DAXPY_ATOMIC : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/DAXPY_ATOMIC.cpp b/src/basic/DAXPY_ATOMIC.cpp new file mode 100644 index 000000000..e5545ca8b --- /dev/null +++ b/src/basic/DAXPY_ATOMIC.cpp @@ -0,0 +1,80 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DAXPY_ATOMIC.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +namespace rajaperf +{ +namespace basic +{ + + +DAXPY_ATOMIC::DAXPY_ATOMIC(const RunParams& params) + : KernelBase(rajaperf::Basic_DAXPY_ATOMIC, params) +{ + setDefaultProblemSize(1000000); + setDefaultReps(500); + + setActualProblemSize( getTargetProblemSize() ); + + setItsPerRep( getActualProblemSize() ); + setKernelsPerRep(1); + setBytesPerRep( (1*sizeof(Real_type) + 2*sizeof(Real_type)) * getActualProblemSize() ); + setFLOPsPerRep(2 * getActualProblemSize()); + + setUsesFeature(Forall); + + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); + + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); + setVariantDefined( RAJA_OpenMP ); + + setVariantDefined( Base_OpenMPTarget ); + setVariantDefined( RAJA_OpenMPTarget ); + + setVariantDefined( Base_CUDA ); + setVariantDefined( Lambda_CUDA ); + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( Base_HIP ); + setVariantDefined( Lambda_HIP ); + setVariantDefined( RAJA_HIP ); +} + +DAXPY_ATOMIC::~DAXPY_ATOMIC() +{ +} + +void DAXPY_ATOMIC::setUp(VariantID vid) +{ + allocAndInitDataConst(m_y, getActualProblemSize(), 0.0, vid); + allocAndInitData(m_x, getActualProblemSize(), vid); + initData(m_a); +} + +void DAXPY_ATOMIC::updateChecksum(VariantID vid) +{ + checksum[vid] += calcChecksum(m_y, getActualProblemSize()); +} + +void DAXPY_ATOMIC::tearDown(VariantID vid) +{ + (void) vid; + deallocData(m_x); + deallocData(m_y); +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/DAXPY_ATOMIC.hpp b/src/basic/DAXPY_ATOMIC.hpp new file mode 100644 index 000000000..b557812e4 --- /dev/null +++ b/src/basic/DAXPY_ATOMIC.hpp @@ -0,0 +1,68 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// DAXPY_ATOMIC kernel reference implementation: +/// +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// y[i] += a * x[i] ; +/// } +/// + +#ifndef RAJAPerf_Basic_DAXPY_ATOMIC_HPP +#define RAJAPerf_Basic_DAXPY_ATOMIC_HPP + +#define DAXPY_ATOMIC_DATA_SETUP \ + Real_ptr x = m_x; \ + Real_ptr y = m_y; \ + Real_type a = m_a; + +#define DAXPY_ATOMIC_BODY \ + y[i] += a * x[i] ; + +#define DAXPY_ATOMIC_RAJA_BODY(policy) \ + RAJA::atomicAdd(&y[i], a * x[i]); + + +#include "common/KernelBase.hpp" + +namespace rajaperf +{ +class RunParams; + +namespace basic +{ + +class DAXPY_ATOMIC : public KernelBase +{ +public: + + DAXPY_ATOMIC(const RunParams& params); + + ~DAXPY_ATOMIC(); + + void setUp(VariantID vid); + void updateChecksum(VariantID vid); + void tearDown(VariantID vid); + + void runSeqVariant(VariantID vid); + void runOpenMPVariant(VariantID vid); + void runCudaVariant(VariantID vid); + void runHipVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); + +private: + Real_ptr m_x; + Real_ptr m_y; + Real_type m_a; +}; + +} // end namespace basic +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 62e1ef49f..84a85e311 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -18,6 +18,7 @@ // Basic kernels... // #include "basic/DAXPY.hpp" +#include "basic/DAXPY_ATOMIC.hpp" #include "basic/IF_QUAD.hpp" #include "basic/INIT3.hpp" #include "basic/INIT_VIEW1D.hpp" @@ -145,6 +146,7 @@ static const std::string KernelNames [] = // Basic kernels... // std::string("Basic_DAXPY"), + std::string("Basic_DAXPY_ATOMIC"), std::string("Basic_IF_QUAD"), std::string("Basic_INIT3"), std::string("Basic_INIT_VIEW1D"), @@ -440,6 +442,10 @@ KernelBase* getKernelObject(KernelID kid, kernel = new basic::DAXPY(run_params); break; } + case Basic_DAXPY_ATOMIC : { + kernel = new basic::DAXPY_ATOMIC(run_params); + break; + } case Basic_IF_QUAD : { kernel = new basic::IF_QUAD(run_params); break; diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index b54ec0358..e9391ad3c 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -71,6 +71,7 @@ enum KernelID { // Basic kernels... 
// Basic_DAXPY = 0, + Basic_DAXPY_ATOMIC, Basic_IF_QUAD, Basic_INIT3, Basic_INIT_VIEW1D, From 0f7a152399a7acc69afa6a5fbc601a74c09f284f Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 1 Feb 2022 16:15:53 -0800 Subject: [PATCH 177/392] Update various function names --- src/algorithm/SCAN-Cuda.cpp | 2 +- src/algorithm/SCAN-Hip.cpp | 2 +- src/algorithm/SCAN-OMP.cpp | 2 +- src/algorithm/SCAN-Seq.cpp | 2 +- src/algorithm/SCAN.cpp | 18 +++++++++--------- src/basic/INDEXLIST-Cuda.cpp | 2 +- src/basic/INDEXLIST-OMP.cpp | 2 +- src/basic/INDEXLIST-Seq.cpp | 2 +- src/basic/INDEXLIST.cpp | 16 ++++++++-------- src/basic/INDEXLIST_3LOOP-Cuda.cpp | 4 ++-- src/basic/INDEXLIST_3LOOP-Hip.cpp | 4 ++-- src/basic/INDEXLIST_3LOOP-OMP.cpp | 4 ++-- src/basic/INDEXLIST_3LOOP-Seq.cpp | 4 ++-- src/basic/INDEXLIST_3LOOP.cpp | 22 +++++++++++----------- 14 files changed, 43 insertions(+), 43 deletions(-) diff --git a/src/algorithm/SCAN-Cuda.cpp b/src/algorithm/SCAN-Cuda.cpp index 594a3bb40..a28f9dc26 100644 --- a/src/algorithm/SCAN-Cuda.cpp +++ b/src/algorithm/SCAN-Cuda.cpp @@ -44,7 +44,7 @@ void SCAN::runCudaVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); SCAN_DATA_SETUP; diff --git a/src/algorithm/SCAN-Hip.cpp b/src/algorithm/SCAN-Hip.cpp index 042ae723a..c8313b68a 100644 --- a/src/algorithm/SCAN-Hip.cpp +++ b/src/algorithm/SCAN-Hip.cpp @@ -49,7 +49,7 @@ void SCAN::runHipVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); SCAN_DATA_SETUP; diff --git a/src/algorithm/SCAN-OMP.cpp b/src/algorithm/SCAN-OMP.cpp index a8a934ae7..04f2dfff3 100644 --- a/src/algorithm/SCAN-OMP.cpp +++ b/src/algorithm/SCAN-OMP.cpp @@ -24,7 +24,7 @@ void SCAN::runOpenMPVariant(VariantID vid) const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); SCAN_DATA_SETUP; diff --git a/src/algorithm/SCAN-Seq.cpp b/src/algorithm/SCAN-Seq.cpp index 13657970f..38c546454 100644 --- a/src/algorithm/SCAN-Seq.cpp +++ b/src/algorithm/SCAN-Seq.cpp @@ -22,7 +22,7 @@ void SCAN::runSeqVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); SCAN_DATA_SETUP; diff --git a/src/algorithm/SCAN.cpp b/src/algorithm/SCAN.cpp index aa4695fb0..066996f4f 100644 --- a/src/algorithm/SCAN.cpp +++ b/src/algorithm/SCAN.cpp @@ -21,15 +21,15 @@ namespace algorithm SCAN::SCAN(const RunParams& params) : KernelBase(rajaperf::Algorithm_SCAN, params) { - setDefaultSize(1000000); - setDefaultReps(100); + setDefaultProblemSize(1000000); + setDefaultReps(100); - setProblemSize( getRunSize() ); + setActualProblemSize( getTargetProblemSize() ); - setItsPerRep( getProblemSize() ); + setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getRunSize() ); - setFLOPsPerRep(1 * getRunSize()); + setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() ); + setFLOPsPerRep(1 * getActualProblemSize()); setUsesFeature(Scan); @@ -56,13 +56,13 @@ SCAN::~SCAN() void SCAN::setUp(VariantID vid) { - allocAndInitDataRandValue(m_x, getRunSize(), vid); - allocAndInitDataConst(m_y, getRunSize(), 0.0, vid); + 
allocAndInitDataRandValue(m_x, getActualProblemSize(), vid); + allocAndInitDataConst(m_y, getActualProblemSize(), 0.0, vid); } void SCAN::updateChecksum(VariantID vid) { - checksum[vid] += calcChecksum(m_y, getRunSize()); + checksum[vid] += calcChecksum(m_y, getActualProblemSize()); } void SCAN::tearDown(VariantID vid) diff --git a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp index 8dae5f44e..8241226d4 100644 --- a/src/basic/INDEXLIST-Cuda.cpp +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -181,7 +181,7 @@ void INDEXLIST::runCudaVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); INDEXLIST_DATA_SETUP; diff --git a/src/basic/INDEXLIST-OMP.cpp b/src/basic/INDEXLIST-OMP.cpp index c7b200e12..d29f549ef 100644 --- a/src/basic/INDEXLIST-OMP.cpp +++ b/src/basic/INDEXLIST-OMP.cpp @@ -23,7 +23,7 @@ void INDEXLIST::runOpenMPVariant(VariantID vid) const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); INDEXLIST_DATA_SETUP; diff --git a/src/basic/INDEXLIST-Seq.cpp b/src/basic/INDEXLIST-Seq.cpp index 90033d285..e895ab73e 100644 --- a/src/basic/INDEXLIST-Seq.cpp +++ b/src/basic/INDEXLIST-Seq.cpp @@ -22,7 +22,7 @@ void INDEXLIST::runSeqVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); INDEXLIST_DATA_SETUP; diff --git a/src/basic/INDEXLIST.cpp b/src/basic/INDEXLIST.cpp index ac04deb64..5313c1e0d 100644 --- a/src/basic/INDEXLIST.cpp +++ b/src/basic/INDEXLIST.cpp @@ -21,16 +21,16 @@ namespace basic INDEXLIST::INDEXLIST(const RunParams& params) : KernelBase(rajaperf::Basic_INDEXLIST, params) { - setDefaultSize(1000000); + setDefaultProblemSize(1000000); setDefaultReps(100); - setProblemSize( getRunSize() ); + setActualProblemSize( getTargetProblemSize() ); - setItsPerRep( getProblemSize() ); + setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); setBytesPerRep( (1*sizeof(Index_type) + 1*sizeof(Index_type)) + - (1*sizeof(Int_type) + 0*sizeof(Int_type)) * getRunSize() / 2 + // about 50% output - (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getRunSize() ); + (1*sizeof(Int_type) + 0*sizeof(Int_type)) * getActualProblemSize() / 2 + // about 50% output + (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() ); setFLOPsPerRep(0); setUsesFeature(Forall); @@ -54,14 +54,14 @@ INDEXLIST::~INDEXLIST() void INDEXLIST::setUp(VariantID vid) { - allocAndInitDataRandSign(m_x, getRunSize(), vid); - allocAndInitData(m_list, getRunSize(), vid); + allocAndInitDataRandSign(m_x, getActualProblemSize(), vid); + allocAndInitData(m_list, getActualProblemSize(), vid); m_len = -1; } void INDEXLIST::updateChecksum(VariantID vid) { - checksum[vid] += calcChecksum(m_list, getRunSize()); + checksum[vid] += calcChecksum(m_list, getActualProblemSize()); checksum[vid] += Checksum_type(m_len); } diff --git a/src/basic/INDEXLIST_3LOOP-Cuda.cpp b/src/basic/INDEXLIST_3LOOP-Cuda.cpp index c769eec4c..8c9f1120e 100644 --- a/src/basic/INDEXLIST_3LOOP-Cuda.cpp +++ b/src/basic/INDEXLIST_3LOOP-Cuda.cpp @@ -29,7 +29,7 @@ namespace basic #define INDEXLIST_3LOOP_DATA_SETUP_CUDA \ Index_type* counts; \ - allocCudaDeviceData(counts, getRunSize()+1); \ + allocCudaDeviceData(counts, getActualProblemSize()+1); \ allocAndInitCudaDeviceData(x, m_x, iend); \ 
allocAndInitCudaDeviceData(list, m_list, iend); @@ -70,7 +70,7 @@ void INDEXLIST_3LOOP::runCudaVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); INDEXLIST_3LOOP_DATA_SETUP; diff --git a/src/basic/INDEXLIST_3LOOP-Hip.cpp b/src/basic/INDEXLIST_3LOOP-Hip.cpp index 101089844..d44d36238 100644 --- a/src/basic/INDEXLIST_3LOOP-Hip.cpp +++ b/src/basic/INDEXLIST_3LOOP-Hip.cpp @@ -29,7 +29,7 @@ namespace basic #define INDEXLIST_3LOOP_DATA_SETUP_HIP \ Index_type* counts; \ - allocHipDeviceData(counts, getRunSize()+1); \ + allocHipDeviceData(counts, getActualProblemSize()+1); \ allocAndInitHipDeviceData(x, m_x, iend); \ allocAndInitHipDeviceData(list, m_list, iend); @@ -70,7 +70,7 @@ void INDEXLIST_3LOOP::runHipVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); INDEXLIST_3LOOP_DATA_SETUP; diff --git a/src/basic/INDEXLIST_3LOOP-OMP.cpp b/src/basic/INDEXLIST_3LOOP-OMP.cpp index 32a581b14..7598893b4 100644 --- a/src/basic/INDEXLIST_3LOOP-OMP.cpp +++ b/src/basic/INDEXLIST_3LOOP-OMP.cpp @@ -18,7 +18,7 @@ namespace basic { #define INDEXLIST_3LOOP_DATA_SETUP_OMP \ - Index_type* counts = new Index_type[getRunSize()+1]; + Index_type* counts = new Index_type[getActualProblemSize()+1]; #define INDEXLIST_3LOOP_DATA_TEARDOWN_OMP \ delete[] counts; counts = nullptr; @@ -30,7 +30,7 @@ void INDEXLIST_3LOOP::runOpenMPVariant(VariantID vid) const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); INDEXLIST_3LOOP_DATA_SETUP; diff --git a/src/basic/INDEXLIST_3LOOP-Seq.cpp b/src/basic/INDEXLIST_3LOOP-Seq.cpp index ffe87ba57..cfdc9dac7 100644 --- a/src/basic/INDEXLIST_3LOOP-Seq.cpp +++ b/src/basic/INDEXLIST_3LOOP-Seq.cpp @@ -18,7 +18,7 @@ namespace basic { #define INDEXLIST_3LOOP_DATA_SETUP_Seq \ - Index_type* counts = new Index_type[getRunSize()+1]; + Index_type* counts = new Index_type[getActualProblemSize()+1]; #define INDEXLIST_3LOOP_DATA_TEARDOWN_Seq \ delete[] counts; counts = nullptr; @@ -29,7 +29,7 @@ void INDEXLIST_3LOOP::runSeqVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); INDEXLIST_3LOOP_DATA_SETUP; diff --git a/src/basic/INDEXLIST_3LOOP.cpp b/src/basic/INDEXLIST_3LOOP.cpp index a89314754..fecd11cf8 100644 --- a/src/basic/INDEXLIST_3LOOP.cpp +++ b/src/basic/INDEXLIST_3LOOP.cpp @@ -21,21 +21,21 @@ namespace basic INDEXLIST_3LOOP::INDEXLIST_3LOOP(const RunParams& params) : KernelBase(rajaperf::Basic_INDEXLIST_3LOOP, params) { - setDefaultSize(1000000); + setDefaultProblemSize(1000000); setDefaultReps(100); - setProblemSize( getRunSize() ); + setActualProblemSize( getTargetProblemSize() ); - setItsPerRep( 3 * getProblemSize() + 1 ); + setItsPerRep( 3 * getActualProblemSize() + 1 ); setKernelsPerRep(3); - setBytesPerRep( (1*sizeof(Int_type) + 0*sizeof(Int_type)) * getRunSize() + - (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getRunSize() + + setBytesPerRep( (1*sizeof(Int_type) + 0*sizeof(Int_type)) * getActualProblemSize() + + (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() + (1*sizeof(Index_type) + 1*sizeof(Index_type)) + - (1*sizeof(Int_type) + 1*sizeof(Int_type)) * (getRunSize()+1) + + 
(1*sizeof(Int_type) + 1*sizeof(Int_type)) * (getActualProblemSize()+1) + - (0*sizeof(Int_type) + 1*sizeof(Int_type)) * (getRunSize()+1) + - (1*sizeof(Int_type) + 0*sizeof(Int_type)) * getRunSize() / 2 ); // about 50% output + (0*sizeof(Int_type) + 1*sizeof(Int_type)) * (getActualProblemSize()+1) + + (1*sizeof(Int_type) + 0*sizeof(Int_type)) * getActualProblemSize() / 2 ); // about 50% output setFLOPsPerRep(0); setUsesFeature(Forall); @@ -64,14 +64,14 @@ INDEXLIST_3LOOP::~INDEXLIST_3LOOP() void INDEXLIST_3LOOP::setUp(VariantID vid) { - allocAndInitDataRandSign(m_x, getRunSize(), vid); - allocAndInitData(m_list, getRunSize(), vid); + allocAndInitDataRandSign(m_x, getActualProblemSize(), vid); + allocAndInitData(m_list, getActualProblemSize(), vid); m_len = -1; } void INDEXLIST_3LOOP::updateChecksum(VariantID vid) { - checksum[vid] += calcChecksum(m_list, getRunSize()); + checksum[vid] += calcChecksum(m_list, getActualProblemSize()); checksum[vid] += Checksum_type(m_len); } From 1ce6a3def606e51f2efb9204b9a97fa1b5a101df Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 2 Feb 2022 09:19:43 -0800 Subject: [PATCH 178/392] Add config option to enable omp scans On current compilers omp scans are very slow or segfault --- CMakeLists.txt | 1 + src/algorithm/SCAN-OMP.cpp | 2 +- src/algorithm/SCAN.cpp | 2 +- src/basic/INDEXLIST-OMP.cpp | 2 +- src/basic/INDEXLIST.cpp | 2 +- src/basic/INDEXLIST_3LOOP-OMP.cpp | 2 +- src/basic/INDEXLIST_3LOOP.cpp | 2 +- src/rajaperf_config.hpp.in | 1 + 8 files changed, 8 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8d3f1f69c..db860ba20 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,6 +35,7 @@ include(blt/SetupBLT.cmake) # cmake_dependent_option(RAJA_PERFSUITE_ENABLE_MPI "Build with MPI" On "ENABLE_MPI" Off) +cmake_dependent_option(RAJA_PERFSUITE_ENABLE_OPENMP_SCAN "Build OpenMP scan variants" Off "ENABLE_OPENMP" Off) # # Define RAJA settings... 
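+# (usage sketch for the option added above, assuming a standard CMake/BLT
+#  configure line: pass -DENABLE_OPENMP=On -DRAJA_PERFSUITE_ENABLE_OPENMP_SCAN=On
+#  to build the OpenMP scan variants; they stay off by default because current
+#  compilers are very slow or segfault on OpenMP scans)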
diff --git a/src/algorithm/SCAN-OMP.cpp b/src/algorithm/SCAN-OMP.cpp index 04f2dfff3..38fb09d89 100644 --- a/src/algorithm/SCAN-OMP.cpp +++ b/src/algorithm/SCAN-OMP.cpp @@ -30,7 +30,7 @@ void SCAN::runOpenMPVariant(VariantID vid) switch ( vid ) { -#if _OPENMP >= 201811 +#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP_SCAN) case Base_OpenMP : { startTimer(); diff --git a/src/algorithm/SCAN.cpp b/src/algorithm/SCAN.cpp index 066996f4f..145c4b302 100644 --- a/src/algorithm/SCAN.cpp +++ b/src/algorithm/SCAN.cpp @@ -37,7 +37,7 @@ SCAN::SCAN(const RunParams& params) setVariantDefined( Lambda_Seq ); setVariantDefined( RAJA_Seq ); -#if defined(_OPENMP) && _OPENMP >= 201811 +#if defined(_OPENMP) && _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP_SCAN) setVariantDefined( Base_OpenMP ); setVariantDefined( Lambda_OpenMP ); #endif diff --git a/src/basic/INDEXLIST-OMP.cpp b/src/basic/INDEXLIST-OMP.cpp index d29f549ef..2f5b41461 100644 --- a/src/basic/INDEXLIST-OMP.cpp +++ b/src/basic/INDEXLIST-OMP.cpp @@ -29,7 +29,7 @@ void INDEXLIST::runOpenMPVariant(VariantID vid) switch ( vid ) { -#if _OPENMP >= 201811 +#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP_SCAN) case Base_OpenMP : { startTimer(); diff --git a/src/basic/INDEXLIST.cpp b/src/basic/INDEXLIST.cpp index 5313c1e0d..5e5ba32cc 100644 --- a/src/basic/INDEXLIST.cpp +++ b/src/basic/INDEXLIST.cpp @@ -40,7 +40,7 @@ INDEXLIST::INDEXLIST(const RunParams& params) setVariantDefined( Lambda_Seq ); setVariantDefined( RAJA_Seq ); -#if defined(_OPENMP) && _OPENMP >= 201811 +#if defined(_OPENMP) && _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP_SCAN) setVariantDefined( Base_OpenMP ); setVariantDefined( Lambda_OpenMP ); #endif diff --git a/src/basic/INDEXLIST_3LOOP-OMP.cpp b/src/basic/INDEXLIST_3LOOP-OMP.cpp index 7598893b4..58762aa2e 100644 --- a/src/basic/INDEXLIST_3LOOP-OMP.cpp +++ b/src/basic/INDEXLIST_3LOOP-OMP.cpp @@ -36,7 +36,7 @@ void INDEXLIST_3LOOP::runOpenMPVariant(VariantID vid) switch ( vid ) { -#if _OPENMP >= 201811 +#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP_SCAN) case Base_OpenMP : { INDEXLIST_3LOOP_DATA_SETUP_OMP; diff --git a/src/basic/INDEXLIST_3LOOP.cpp b/src/basic/INDEXLIST_3LOOP.cpp index fecd11cf8..3b5216392 100644 --- a/src/basic/INDEXLIST_3LOOP.cpp +++ b/src/basic/INDEXLIST_3LOOP.cpp @@ -45,7 +45,7 @@ INDEXLIST_3LOOP::INDEXLIST_3LOOP(const RunParams& params) setVariantDefined( Lambda_Seq ); setVariantDefined( RAJA_Seq ); -#if defined(_OPENMP) && _OPENMP >= 201811 +#if defined(_OPENMP) && _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP_SCAN) setVariantDefined( Base_OpenMP ); setVariantDefined( Lambda_OpenMP ); #endif diff --git a/src/rajaperf_config.hpp.in b/src/rajaperf_config.hpp.in index 808993af4..76120c81a 100644 --- a/src/rajaperf_config.hpp.in +++ b/src/rajaperf_config.hpp.in @@ -25,6 +25,7 @@ #include #cmakedefine RAJA_PERFSUITE_ENABLE_MPI +#cmakedefine RAJA_PERFSUITE_ENABLE_OPENMP_SCAN namespace rajaperf { From 64e2c10eb344c830a54711d0967741366b7e5ebc Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 2 Feb 2022 16:57:21 -0800 Subject: [PATCH 179/392] add INDEXLIST_3LOOP warmup --- src/common/Executor.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index 6980ba082..87d9c8438 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -19,6 +19,7 @@ // Warmup kernels to run first to help reduce startup overheads in timings #include "basic/DAXPY.hpp" #include "basic/REDUCE3_INT.hpp" 
+#include "basic/INDEXLIST_3LOOP.hpp" #include "algorithm/SORT.hpp" #include @@ -756,6 +757,7 @@ void Executor::runSuite() warmup_kernels.push_back(new basic::DAXPY(run_params)); warmup_kernels.push_back(new basic::REDUCE3_INT(run_params)); + warmup_kernels.push_back(new basic::INDEXLIST_3LOOP(run_params)); warmup_kernels.push_back(new algorithm::SORT(run_params)); for (size_t ik = 0; ik < warmup_kernels.size(); ++ik) { From 1a72c0fa5e484ea093eba4b08a2fe7f8748c588f Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 3 Feb 2022 10:56:23 -0800 Subject: [PATCH 180/392] Implement looking back at multiple blocks in INDEXLIST --- src/basic/INDEXLIST-Cuda.cpp | 142 ++++++++++++++++++++++++++--------- 1 file changed, 108 insertions(+), 34 deletions(-) diff --git a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp index 8241226d4..3f5dee195 100644 --- a/src/basic/INDEXLIST-Cuda.cpp +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -25,6 +25,7 @@ namespace basic // Define thread block size for CUDA execution // const size_t block_size = 256; + const size_t warp_size = 32; #define INDEXLIST_DATA_SETUP_CUDA \ @@ -41,6 +42,29 @@ struct pair Index_type first, second; }; + +// perform a warp scan on inc and return the inclusive result at each thread +__device__ Index_type warp_scan_inclusive(const Index_type inc) +{ + const int warp_index = (threadIdx.x % warp_size); + + Index_type val = inc; + + // NOTE: only works for powers of 2 + for ( int i = 1; i < warp_size; i *= 2 ) { + const bool participate = warp_index & i; + const int prior_id = (warp_index & ~(i-1)) - 1; + const Index_type prior_val = __shfl_sync(0xffffffffu, val, prior_id); + if ( participate ) { + val = prior_val + val; + } + } + + return val; +} + +// perform a block scan on inc and return the result at each thread +// pair.first is the exclusive result and pair.second is the inclusive result __device__ pair block_scan(const Index_type inc) { extern __shared__ volatile Index_type s_thread_counts[ ]; @@ -49,9 +73,10 @@ __device__ pair block_scan(const Index_type inc) s_thread_counts[ threadIdx.x ] = val; __syncthreads(); + // NOTE: only works for powers of 2 for ( int i = 1; i < blockDim.x; i *= 2 ) { const bool participate = threadIdx.x & i; - const int prior_id = threadIdx.x & ~(i-1) - 1; + const int prior_id = (threadIdx.x & ~(i-1)) - 1; if ( participate ) { val = s_thread_counts[ prior_id ] + s_thread_counts[ threadIdx.x ]; s_thread_counts[ threadIdx.x ] = val; @@ -65,6 +90,8 @@ __device__ pair block_scan(const Index_type inc) return pair { prior_val, val }; } +// perform a grid scan on inc and return the result at each thread +// pair.first is the exclusive result and pair.second is the inclusive result __device__ pair grid_scan(const int block_id, const Index_type inc, Index_type* block_counts, @@ -72,62 +99,109 @@ __device__ pair grid_scan(const int block_id, unsigned* block_readys) { const bool first_block = (block_id == 0); - const bool last_block = (block_id+1 == gridDim.x); - const bool first_thread = (threadIdx.x == 0); - const bool last_thread = (threadIdx.x+1 == blockDim.x); + const bool last_block = (block_id == gridDim.x-1); + const bool last_thread = (threadIdx.x == blockDim.x-1); + const bool last_warp = (threadIdx.x >= blockDim.x - warp_size); + const int warp_index = (threadIdx.x % warp_size); + const int warp_index_mask = (1u << warp_index); + const int warp_index_mask_right = warp_index_mask | (warp_index_mask - 1); pair count = block_scan(inc); - if (last_thread) { - if (first_block) { + if (first_block) { + + if 
(!last_block && last_thread) { block_counts[block_id] = count.second; // write inclusive scan result for block grid_counts[block_id] = count.second; // write inclusive scan result for grid through block __threadfence(); // ensure block_counts, grid_counts ready (release) atomicExch(&block_readys[block_id], 2u); // write block_counts, grid_counts are ready - } else { + } + + } else { + + if (!last_block && last_thread) { block_counts[block_id] = count.second; // write inclusive scan result for block __threadfence(); // ensure block_counts ready (release) atomicExch(&block_readys[block_id], 1u); // write block_counts is ready } - } - if (!first_block) { - // extern __shared__ volatile int s_block_readys[ ]; // reusing shared memory + __shared__ volatile Index_type s_prev_grid_count; - // const int num_participating_threads = (block_id <= blockDim.x) ? block_id : blockDim.x; + // get prev_grid_count using last warp in block + // if (last_warp) { - // int prior_block_ready = 0; - // if (threadIdx.x < num_participating_threads) { - // prior_block_ready = block_readys[block_id-1 - threadIdx.x]; - // } - // s_block_readys[threadIdx.x] = prior_block_ready; - // __syncthreads(); + // Index_type prev_block_count = 0; - __shared__ volatile Index_type s_prev_block_count; + // const int prev_block_id = block_id-warp_size+warp_index; - if (first_thread) { - while (atomicCAS(&block_readys[block_id-1], 11u, 11u) != 2u); // check if block_counts is ready - __threadfence(); // ensure block_counts ready (acquire) - s_prev_block_count = grid_counts[block_id-1]; - } - __syncthreads(); + // unsigned prev_block_ready = (prev_block_id >= 0) ? 0u : 1u; + // unsigned prev_blocks_ready_ballot = 0u; + // unsigned prev_grids_ready_ballot = 0u; + + // // ensure previous block_counts are ready and at least one grid_count is ready + // do { + // if (prev_block_id >= 0 && prev_block_ready != 2u) { + // prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); + // } + + // prev_blocks_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready >= 1u); + // prev_grids_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready == 2u); + + // } while (prev_blocks_ready_ballot != 0xffffffffu || prev_grids_ready_ballot == 0u); + // __threadfence(); // ensure block_counts or grid_counts ready (acquire) + + // // read one grid_count from a block with id grid_count_ready_id + // // and read the block_counts from blocks with higher ids. 
+ // if (warp_index_mask > prev_grids_ready_ballot) { + // // get block_counts for prev_block_ids in (grid_count_ready_id, block_id) + // prev_block_count = block_counts[prev_block_id]; + // // if (last_block) printf("block %i, block_counts[%i], %li, %llu\n", block_id, prev_block_id, (long)prev_block_count, device_timer()); + // } else if (prev_grids_ready_ballot == (prev_grids_ready_ballot & warp_index_mask_right)) { + // // get grid_count for grid_count_ready_id + // prev_block_count = grid_counts[prev_block_id]; + // // if (last_block) printf("block %i, %u, grid_counts[%i], %li, %llu\n", block_id, warp_index_mask_right, prev_block_id, (long)prev_block_count, device_timer()); + // } + + // Index_type prev_grid_count = warp_scan_inclusive(prev_block_count); + + // if (last_thread) { - Index_type prev_block_count = s_prev_block_count; + // if (!last_block) { + // grid_counts[block_id] = prev_grid_count + count.second; // write inclusive scan result for grid through block + // __threadfence(); // ensure grid_counts ready (release) + // atomicExch(&block_readys[block_id], 2u); // write grid_counts is ready + // } - count.first = prev_block_count + count.first; - count.second = prev_block_count + count.second; + // s_prev_grid_count = prev_grid_count; + // // printf("block %i, %li, %u, %llu\n", block_id, (long)prev_grid_count, prev_grids_ready_ballot, device_timer()); + // } + // } + // get prev_grid_count using last thread in block if (last_thread) { - grid_counts[block_id] = count.second; // write inclusive scan result for grid through block - __threadfence(); // ensure block_counts ready (release) - atomicExch(&block_readys[block_id], 2u); // write block_counts is ready + while (atomicCAS(&block_readys[block_id-1], 11u, 11u) != 2u); // check if block_counts is ready + __threadfence(); // ensure block_counts ready (acquire) + Index_type prev_grid_count = grid_counts[block_id-1]; + + if (!last_block) { + grid_counts[block_id] = prev_grid_count + count.second; // write inclusive scan result for grid through block + __threadfence(); // ensure grid_counts ready (release) + atomicExch(&block_readys[block_id], 2u); // write grid_counts is ready + } + + s_prev_grid_count = prev_grid_count; } + __syncthreads(); - } + Index_type prev_grid_count = s_prev_grid_count; + + count.first = prev_grid_count + count.first; + count.second = prev_grid_count + count.second; - if (last_block) { - for (int i = threadIdx.x; i < gridDim.x; ++i) { - block_readys[i] = 0u; // last block resets readys to 0 (for next kernel to reuse) + if (last_block) { + for (int i = threadIdx.x; i < gridDim.x-1; i += blockDim.x) { + while (atomicCAS(&block_readys[i], 2u, 0u) != 2u); + } } } From 05760c842af8f73abfb90361a1044be77c667115 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 3 Feb 2022 10:59:30 -0800 Subject: [PATCH 181/392] Add device_timer to Cuda utils --- src/common/CudaDataUtils.hpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/common/CudaDataUtils.hpp b/src/common/CudaDataUtils.hpp index 0467d0f19..e12ba3b65 100644 --- a/src/common/CudaDataUtils.hpp +++ b/src/common/CudaDataUtils.hpp @@ -25,6 +25,19 @@ namespace rajaperf { +/*! + * \brief Device timer, returns a time in ns from an arbitrary starting point. + * Note that this time is consistent across the whole device. 
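+ * For example, one could read device_timer() just before and just after a
+ * region of device code and subtract the two values to estimate the elapsed
+ * time in nanoseconds for that region.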
+ */ +__device__ __forceinline__ unsigned long long device_timer() +{ + unsigned long long global_timer = 0; +#if __CUDA_ARCH__ >= 300 + asm volatile ("mov.u64 %0, %globaltimer;" : "=l"(global_timer)); +#endif + return global_timer; +} + /*! * \brief Simple forall cuda kernel that runs a lambda. */ From af5084c44b17289d97791151ec406965ab871b17 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 3 Feb 2022 11:17:09 -0800 Subject: [PATCH 182/392] Use scan that checks multiple previous blocks --- src/basic/INDEXLIST-Cuda.cpp | 102 +++++++++++++++-------------------- 1 file changed, 42 insertions(+), 60 deletions(-) diff --git a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp index 3f5dee195..9a8123685 100644 --- a/src/basic/INDEXLIST-Cuda.cpp +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -128,68 +128,50 @@ __device__ pair grid_scan(const int block_id, __shared__ volatile Index_type s_prev_grid_count; // get prev_grid_count using last warp in block - // if (last_warp) { - - // Index_type prev_block_count = 0; - - // const int prev_block_id = block_id-warp_size+warp_index; - - // unsigned prev_block_ready = (prev_block_id >= 0) ? 0u : 1u; - // unsigned prev_blocks_ready_ballot = 0u; - // unsigned prev_grids_ready_ballot = 0u; - - // // ensure previous block_counts are ready and at least one grid_count is ready - // do { - // if (prev_block_id >= 0 && prev_block_ready != 2u) { - // prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); - // } - - // prev_blocks_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready >= 1u); - // prev_grids_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready == 2u); - - // } while (prev_blocks_ready_ballot != 0xffffffffu || prev_grids_ready_ballot == 0u); - // __threadfence(); // ensure block_counts or grid_counts ready (acquire) - - // // read one grid_count from a block with id grid_count_ready_id - // // and read the block_counts from blocks with higher ids. 
- // if (warp_index_mask > prev_grids_ready_ballot) { - // // get block_counts for prev_block_ids in (grid_count_ready_id, block_id) - // prev_block_count = block_counts[prev_block_id]; - // // if (last_block) printf("block %i, block_counts[%i], %li, %llu\n", block_id, prev_block_id, (long)prev_block_count, device_timer()); - // } else if (prev_grids_ready_ballot == (prev_grids_ready_ballot & warp_index_mask_right)) { - // // get grid_count for grid_count_ready_id - // prev_block_count = grid_counts[prev_block_id]; - // // if (last_block) printf("block %i, %u, grid_counts[%i], %li, %llu\n", block_id, warp_index_mask_right, prev_block_id, (long)prev_block_count, device_timer()); - // } - - // Index_type prev_grid_count = warp_scan_inclusive(prev_block_count); - - // if (last_thread) { - - // if (!last_block) { - // grid_counts[block_id] = prev_grid_count + count.second; // write inclusive scan result for grid through block - // __threadfence(); // ensure grid_counts ready (release) - // atomicExch(&block_readys[block_id], 2u); // write grid_counts is ready - // } - - // s_prev_grid_count = prev_grid_count; - // // printf("block %i, %li, %u, %llu\n", block_id, (long)prev_grid_count, prev_grids_ready_ballot, device_timer()); - // } - // } - - // get prev_grid_count using last thread in block - if (last_thread) { - while (atomicCAS(&block_readys[block_id-1], 11u, 11u) != 2u); // check if block_counts is ready - __threadfence(); // ensure block_counts ready (acquire) - Index_type prev_grid_count = grid_counts[block_id-1]; - - if (!last_block) { - grid_counts[block_id] = prev_grid_count + count.second; // write inclusive scan result for grid through block - __threadfence(); // ensure grid_counts ready (release) - atomicExch(&block_readys[block_id], 2u); // write grid_counts is ready + if (last_warp) { + + Index_type prev_block_count = 0; + + const int prev_block_id = block_id-warp_size+warp_index; + + unsigned prev_block_ready = (prev_block_id >= 0) ? 0u : 1u; + unsigned prev_blocks_ready_ballot = 0u; + unsigned prev_grids_ready_ballot = 0u; + + // ensure previous block_counts are ready and at least one grid_count is ready + do { + if (prev_block_id >= 0 && prev_block_ready != 2u) { + prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); + } + + prev_blocks_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready >= 1u); + prev_grids_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready == 2u); + + } while (prev_blocks_ready_ballot != 0xffffffffu || prev_grids_ready_ballot == 0u); + __threadfence(); // ensure block_counts or grid_counts ready (acquire) + + // read one grid_count from a block with id grid_count_ready_id + // and read the block_counts from blocks with higher ids. 
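+      // (each lane of the last warp looks back at one prior block: lanes whose
+      // block comes after the most recently published grid total read that
+      // block's own count, the lane holding that grid total reads it, earlier
+      // lanes contribute zero, and the inclusive warp scan below leaves the
+      // sum over all prior blocks in the last lane)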
+ if (warp_index_mask > prev_grids_ready_ballot) { + // get block_counts for prev_block_ids in (grid_count_ready_id, block_id) + prev_block_count = block_counts[prev_block_id]; + } else if (prev_grids_ready_ballot == (prev_grids_ready_ballot & warp_index_mask_right)) { + // get grid_count for grid_count_ready_id + prev_block_count = grid_counts[prev_block_id]; } - s_prev_grid_count = prev_grid_count; + Index_type prev_grid_count = warp_scan_inclusive(prev_block_count); + + if (last_thread) { + + if (!last_block) { + grid_counts[block_id] = prev_grid_count + count.second; // write inclusive scan result for grid through block + __threadfence(); // ensure grid_counts ready (release) + atomicExch(&block_readys[block_id], 2u); // write grid_counts is ready + } + + s_prev_grid_count = prev_grid_count; + } } __syncthreads(); From 773fdb8a8c582fa01b7f51e015e4f778925c382a Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 3 Feb 2022 14:37:02 -0800 Subject: [PATCH 183/392] Use cub warp reduce instead of hand rolled warp scan --- src/basic/INDEXLIST-Cuda.cpp | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp index 9a8123685..98d3338d3 100644 --- a/src/basic/INDEXLIST-Cuda.cpp +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -14,6 +14,8 @@ #include "common/CudaDataUtils.hpp" +#include + #include namespace rajaperf @@ -43,26 +45,6 @@ struct pair }; -// perform a warp scan on inc and return the inclusive result at each thread -__device__ Index_type warp_scan_inclusive(const Index_type inc) -{ - const int warp_index = (threadIdx.x % warp_size); - - Index_type val = inc; - - // NOTE: only works for powers of 2 - for ( int i = 1; i < warp_size; i *= 2 ) { - const bool participate = warp_index & i; - const int prior_id = (warp_index & ~(i-1)) - 1; - const Index_type prior_val = __shfl_sync(0xffffffffu, val, prior_id); - if ( participate ) { - val = prior_val + val; - } - } - - return val; -} - // perform a block scan on inc and return the result at each thread // pair.first is the exclusive result and pair.second is the inclusive result __device__ pair block_scan(const Index_type inc) @@ -160,7 +142,11 @@ __device__ pair grid_scan(const int block_id, prev_block_count = grid_counts[prev_block_id]; } - Index_type prev_grid_count = warp_scan_inclusive(prev_block_count); + using WarpReduce = cub::WarpReduce; + __shared__ typename WarpReduce::TempStorage s_temp_warp_storage; + + Index_type prev_grid_count = WarpReduce(s_temp_warp_storage).Sum(prev_block_count); + prev_grid_count = __shfl_sync(0xffffffffu, prev_grid_count, 0, warp_size); // broadcast output to all threads in warp if (last_thread) { From af0e5b076d2482be42ab955cdd947a807f1f6cd2 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 3 Feb 2022 14:55:37 -0800 Subject: [PATCH 184/392] Use cub block scan instead of hand rolled version --- src/basic/INDEXLIST-Cuda.cpp | 46 +++++++++++------------------------- 1 file changed, 14 insertions(+), 32 deletions(-) diff --git a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp index 98d3338d3..7cc282320 100644 --- a/src/basic/INDEXLIST-Cuda.cpp +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -14,6 +14,7 @@ #include "common/CudaDataUtils.hpp" +#include #include #include @@ -44,34 +45,6 @@ struct pair Index_type first, second; }; - -// perform a block scan on inc and return the result at each thread -// pair.first is the exclusive result and pair.second is the inclusive result -__device__ pair block_scan(const 
Index_type inc) -{ - extern __shared__ volatile Index_type s_thread_counts[ ]; - - Index_type val = inc; - s_thread_counts[ threadIdx.x ] = val; - __syncthreads(); - - // NOTE: only works for powers of 2 - for ( int i = 1; i < blockDim.x; i *= 2 ) { - const bool participate = threadIdx.x & i; - const int prior_id = (threadIdx.x & ~(i-1)) - 1; - if ( participate ) { - val = s_thread_counts[ prior_id ] + s_thread_counts[ threadIdx.x ]; - s_thread_counts[ threadIdx.x ] = val; - } - __syncthreads(); - } - - Index_type prior_val = (threadIdx.x > 0) ? s_thread_counts[threadIdx.x-1] : 0; - __syncthreads(); - - return pair { prior_val, val }; -} - // perform a grid scan on inc and return the result at each thread // pair.first is the exclusive result and pair.second is the inclusive result __device__ pair grid_scan(const int block_id, @@ -88,7 +61,18 @@ __device__ pair grid_scan(const int block_id, const int warp_index_mask = (1u << warp_index); const int warp_index_mask_right = warp_index_mask | (warp_index_mask - 1); - pair count = block_scan(inc); + using BlockScan = cub::BlockScan; + using WarpReduce = cub::WarpReduce; + + union SharedStorage { + typename BlockScan::TempStorage block_scan_storage; + typename WarpReduce::TempStorage warp_reduce_storage; + }; + __shared__ SharedStorage s_temp_storage; + + pair count; + BlockScan(s_temp_storage.block_scan_storage).ExclusiveSum(inc, count.first); + count.second = count.first + inc; if (first_block) { @@ -142,10 +126,8 @@ __device__ pair grid_scan(const int block_id, prev_block_count = grid_counts[prev_block_id]; } - using WarpReduce = cub::WarpReduce; - __shared__ typename WarpReduce::TempStorage s_temp_warp_storage; - Index_type prev_grid_count = WarpReduce(s_temp_warp_storage).Sum(prev_block_count); + Index_type prev_grid_count = WarpReduce(s_temp_storage.warp_reduce_storage).Sum(prev_block_count); prev_grid_count = __shfl_sync(0xffffffffu, prev_grid_count, 0, warp_size); // broadcast output to all threads in warp if (last_thread) { From d2d5cc4c14307beaf2b78388f33e07f37b63f204 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 4 Feb 2022 10:54:02 -0800 Subject: [PATCH 185/392] Set checksum factor in SCAN kernel --- src/algorithm/SCAN.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/algorithm/SCAN.cpp b/src/algorithm/SCAN.cpp index 145c4b302..329f17548 100644 --- a/src/algorithm/SCAN.cpp +++ b/src/algorithm/SCAN.cpp @@ -31,6 +31,12 @@ SCAN::SCAN(const RunParams& params) setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() ); setFLOPsPerRep(1 * getActualProblemSize()); + Checksum_type actualProblemSize = getActualProblemSize(); + checksum_scale_factor = 1e-2 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() ) / + ( actualProblemSize * (actualProblemSize + 1) / 2 ); + setUsesFeature(Scan); setVariantDefined( Base_Seq ); @@ -62,7 +68,7 @@ void SCAN::setUp(VariantID vid) void SCAN::updateChecksum(VariantID vid) { - checksum[vid] += calcChecksum(m_y, getActualProblemSize()); + checksum[vid] += calcChecksum(m_y, getActualProblemSize(), checksum_scale_factor); } void SCAN::tearDown(VariantID vid) From 7c28f93c23f66c6620f62101b8b1bcf4fe4e0810 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 8 Feb 2022 08:02:46 -0800 Subject: [PATCH 186/392] Use non-atomic block ordering --- src/basic/INDEXLIST-Cuda.cpp | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp index 
7cc282320..228acbfd9 100644 --- a/src/basic/INDEXLIST-Cuda.cpp +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -158,28 +158,16 @@ __device__ pair grid_scan(const int block_id, return count; } -__device__ int get_block_id(unsigned* block_id_inc) -{ - __shared__ volatile unsigned s_block_id; - if (threadIdx.x == 0) { - s_block_id = atomicInc(block_id_inc, gridDim.x-1); - } - __syncthreads(); - unsigned block_id = s_block_id; - __syncthreads(); - return static_cast(block_id); -} - __global__ void indexlist(Real_ptr x, Int_ptr list, Index_type* block_counts, Index_type* grid_counts, unsigned* block_readys, - unsigned* block_id_inc, Index_type* len, Index_type iend) { - const int block_id = get_block_id(block_id_inc); + // blocks do run in order in cuda an hip (this can be replaced with an atomic) + const int block_id = blockIdx.x; Index_type i = block_id * blockDim.x + threadIdx.x; Index_type inc = 0; @@ -224,16 +212,13 @@ void INDEXLIST::runCudaVariant(VariantID vid) unsigned* block_readys; allocCudaDeviceData(block_readys, grid_size); cudaErrchk( cudaMemset(block_readys, 0, sizeof(unsigned)*grid_size) ); - unsigned* block_id_inc; - allocCudaDeviceData(block_id_inc, grid_size); - cudaErrchk( cudaMemset(block_id_inc, 0, sizeof(unsigned)) ); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { indexlist<<>>( x+ibegin, list+ibegin, - block_counts, grid_counts, block_readys, block_id_inc, + block_counts, grid_counts, block_readys, len, iend-ibegin ); cudaErrchk( cudaGetLastError() ); @@ -247,7 +232,6 @@ void INDEXLIST::runCudaVariant(VariantID vid) deallocCudaDeviceData(block_counts); deallocCudaDeviceData(grid_counts); deallocCudaDeviceData(block_readys); - deallocCudaDeviceData(block_id_inc); INDEXLIST_DATA_TEARDOWN_CUDA; From bd1a22be2db3bfeb73f1da2d5bc498f077ca7040 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 8 Feb 2022 08:06:58 -0800 Subject: [PATCH 187/392] template on block size and items per thread --- src/basic/INDEXLIST-Cuda.cpp | 97 +++++++++++++++++++++++------------- 1 file changed, 61 insertions(+), 36 deletions(-) diff --git a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp index 228acbfd9..0dadb1ee2 100644 --- a/src/basic/INDEXLIST-Cuda.cpp +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -16,6 +16,7 @@ #include #include +#include #include @@ -29,6 +30,7 @@ namespace basic // const size_t block_size = 256; const size_t warp_size = 32; + const size_t items_per_thread = 15; #define INDEXLIST_DATA_SETUP_CUDA \ @@ -40,54 +42,67 @@ namespace basic deallocCudaDeviceData(x); \ deallocCudaDeviceData(list); -struct pair -{ - Index_type first, second; -}; -// perform a grid scan on inc and return the result at each thread -// pair.first is the exclusive result and pair.second is the inclusive result -__device__ pair grid_scan(const int block_id, - const Index_type inc, +// perform a grid scan on val and returns the result at each thread +// in exclusive and inclusive, note that val is used as scratch space +template < size_t block_size, size_t items_per_thread > +__device__ void grid_scan(const int block_id, + Index_type (&val)[items_per_thread], + Index_type (&exclusive)[items_per_thread], + Index_type (&inclusive)[items_per_thread], Index_type* block_counts, Index_type* grid_counts, unsigned* block_readys) { const bool first_block = (block_id == 0); const bool last_block = (block_id == gridDim.x-1); - const bool last_thread = (threadIdx.x == blockDim.x-1); - const bool last_warp = (threadIdx.x >= blockDim.x - warp_size); + const bool last_thread = (threadIdx.x == 
block_size-1); + const bool last_warp = (threadIdx.x >= block_size - warp_size); const int warp_index = (threadIdx.x % warp_size); const int warp_index_mask = (1u << warp_index); const int warp_index_mask_right = warp_index_mask | (warp_index_mask - 1); - using BlockScan = cub::BlockScan; + using BlockScan = cub::BlockScan; //, cub::BLOCK_SCAN_WARP_SCANS>; + using BlockExchange = cub::BlockExchange; using WarpReduce = cub::WarpReduce; union SharedStorage { typename BlockScan::TempStorage block_scan_storage; + typename BlockExchange::TempStorage block_exchange_storage; typename WarpReduce::TempStorage warp_reduce_storage; }; __shared__ SharedStorage s_temp_storage; - pair count; - BlockScan(s_temp_storage.block_scan_storage).ExclusiveSum(inc, count.first); - count.second = count.first + inc; + BlockExchange(s_temp_storage.block_exchange_storage).StripedToBlocked(val); + __syncthreads(); + + + BlockScan(s_temp_storage.block_scan_storage).ExclusiveSum(val, exclusive); + __syncthreads(); + + for (int ti = 0; ti < items_per_thread; ++ti) { + inclusive[ti] = exclusive[ti] + val[ti]; + } + + BlockExchange(s_temp_storage.block_exchange_storage).BlockedToStriped(exclusive); + __syncthreads(); + BlockExchange(s_temp_storage.block_exchange_storage).BlockedToStriped(inclusive); + __syncthreads(); if (first_block) { if (!last_block && last_thread) { - block_counts[block_id] = count.second; // write inclusive scan result for block - grid_counts[block_id] = count.second; // write inclusive scan result for grid through block - __threadfence(); // ensure block_counts, grid_counts ready (release) + block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block + grid_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for grid through block + __threadfence(); // ensure block_counts, grid_counts ready (release) atomicExch(&block_readys[block_id], 2u); // write block_counts, grid_counts are ready } } else { if (!last_block && last_thread) { - block_counts[block_id] = count.second; // write inclusive scan result for block - __threadfence(); // ensure block_counts ready (release) + block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block + __threadfence(); // ensure block_counts ready (release) atomicExch(&block_readys[block_id], 1u); // write block_counts is ready } @@ -133,7 +148,7 @@ __device__ pair grid_scan(const int block_id, if (last_thread) { if (!last_block) { - grid_counts[block_id] = prev_grid_count + count.second; // write inclusive scan result for grid through block + grid_counts[block_id] = prev_grid_count + inclusive[items_per_thread-1]; // write inclusive scan result for grid through block __threadfence(); // ensure grid_counts ready (release) atomicExch(&block_readys[block_id], 2u); // write grid_counts is ready } @@ -145,19 +160,20 @@ __device__ pair grid_scan(const int block_id, __syncthreads(); Index_type prev_grid_count = s_prev_grid_count; - count.first = prev_grid_count + count.first; - count.second = prev_grid_count + count.second; + for (int ti = 0; ti < items_per_thread; ++ti) { + exclusive[ti] = prev_grid_count + exclusive[ti]; + inclusive[ti] = prev_grid_count + inclusive[ti]; + } if (last_block) { - for (int i = threadIdx.x; i < gridDim.x-1; i += blockDim.x) { + for (int i = threadIdx.x; i < gridDim.x-1; i += block_size) { while (atomicCAS(&block_readys[i], 2u, 0u) != 2u); } } } - - return count; } +template < size_t block_size, size_t items_per_thread > __global__ void 
indexlist(Real_ptr x, Int_ptr list, Index_type* block_counts, @@ -169,22 +185,29 @@ __global__ void indexlist(Real_ptr x, // blocks do run in order in cuda an hip (this can be replaced with an atomic) const int block_id = blockIdx.x; - Index_type i = block_id * blockDim.x + threadIdx.x; - Index_type inc = 0; - if (i < iend) { + Index_type val[items_per_thread] = { 0 }; + + for (Index_type ti = 0, i = block_id * block_size * items_per_thread + threadIdx.x; + ti < items_per_thread && i < iend; + ++ti, i += block_size) { if (INDEXLIST_CONDITIONAL) { - inc = 1; + val[ti] = 1; } } - pair count = grid_scan(block_id, inc, block_counts, grid_counts, block_readys); + Index_type exclusive[items_per_thread]; + Index_type inclusive[items_per_thread]; + grid_scan( + block_id, val, exclusive, inclusive, block_counts, grid_counts, block_readys); - if (i < iend) { - if (count.first != count.second) { - list[count.first] = i; + for (Index_type ti = 0, i = block_id * block_size * items_per_thread + threadIdx.x; + ti < items_per_thread && i < iend; + ++ti, i += block_size) { + if (exclusive[ti] != inclusive[ti]) { + list[exclusive[ti]] = i; } if (i == iend-1) { - *len = count.second; + *len = inclusive[ti]; } } } @@ -201,7 +224,8 @@ void INDEXLIST::runCudaVariant(VariantID vid) INDEXLIST_DATA_SETUP_CUDA; - const size_t grid_size = RAJA_DIVIDE_CEILING_INT((iend-ibegin), block_size); + const size_t grid_size = RAJA_DIVIDE_CEILING_INT((iend-ibegin), block_size*items_per_thread); + const size_t shmem_size = 0; Index_type* len; allocCudaPinnedData(len, 1); @@ -216,7 +240,8 @@ void INDEXLIST::runCudaVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - indexlist<<>>( + indexlist + <<>>( x+ibegin, list+ibegin, block_counts, grid_counts, block_readys, len, iend-ibegin ); From b5b93668494f87f62990a38fc37e4a2aa7762e46 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 8 Feb 2022 08:07:16 -0800 Subject: [PATCH 188/392] add warp reduce header --- src/basic/INDEXLIST-Cuda.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp index 0dadb1ee2..8d8dcf07b 100644 --- a/src/basic/INDEXLIST-Cuda.cpp +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -15,6 +15,7 @@ #include "common/CudaDataUtils.hpp" #include +#include #include #include From 64059c203c8d11dccd72e5371ab26540ec544908 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 8 Feb 2022 08:10:07 -0800 Subject: [PATCH 189/392] reorder includes --- src/basic/INDEXLIST-Cuda.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp index 8d8dcf07b..d847081d6 100644 --- a/src/basic/INDEXLIST-Cuda.cpp +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -15,9 +15,9 @@ #include "common/CudaDataUtils.hpp" #include +#include #include #include -#include #include From 284a767e92fdaa2fdbf15f4c85d84be09fd64f29 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 8 Feb 2022 08:16:01 -0800 Subject: [PATCH 190/392] Loop to get more previous block data Previously would get at most 1 warp's worth --- src/basic/INDEXLIST-Cuda.cpp | 62 ++++++++++++++++++++++++++++-------- 1 file changed, 49 insertions(+), 13 deletions(-) diff --git a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp index d847081d6..8dd21f97a 100644 --- a/src/basic/INDEXLIST-Cuda.cpp +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -112,38 +112,74 @@ __device__ void grid_scan(const int block_id, // get prev_grid_count using last warp in block if (last_warp) { - Index_type 
prev_block_count = 0; + Index_type prev_grid_count = 0; - const int prev_block_id = block_id-warp_size+warp_index; + // accumulate previous block counts into registers of warp - unsigned prev_block_ready = (prev_block_id >= 0) ? 0u : 1u; + int prev_block_base_id = block_id - warp_size; + + unsigned prev_block_ready = 0u; unsigned prev_blocks_ready_ballot = 0u; unsigned prev_grids_ready_ballot = 0u; - // ensure previous block_counts are ready and at least one grid_count is ready - do { - if (prev_block_id >= 0 && prev_block_ready != 2u) { + // accumulate full warp worths of block counts + // stop if run out of full warps of a grid count is ready + while (prev_block_base_id >= 0) { + + const int prev_block_id = prev_block_base_id + warp_index; + + // ensure previous block_counts are ready + do { + prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); + + prev_blocks_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready >= 1u); + + } while (prev_blocks_ready_ballot != 0xffffffffu); + + prev_grids_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready == 2u); + + if (prev_grids_ready_ballot != 0u) { + break; + } + + __threadfence(); // ensure block_counts or grid_counts ready (acquire) + + // accumulate block_counts for prev_block_id + prev_grid_count += block_counts[prev_block_id]; + + prev_block_ready = 0u; + + prev_block_base_id -= warp_size; + } + + const int prev_block_id = prev_block_base_id + warp_index; + + // ensure previous block_counts are ready + // this checks that block counts is ready for all blocks above + // the highest grid count that is ready + while (~prev_blocks_ready_ballot >= prev_grids_ready_ballot) { + + if (prev_block_id >= 0) { prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); } prev_blocks_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready >= 1u); prev_grids_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready == 2u); - - } while (prev_blocks_ready_ballot != 0xffffffffu || prev_grids_ready_ballot == 0u); + } __threadfence(); // ensure block_counts or grid_counts ready (acquire) // read one grid_count from a block with id grid_count_ready_id // and read the block_counts from blocks with higher ids. 
if (warp_index_mask > prev_grids_ready_ballot) { - // get block_counts for prev_block_ids in (grid_count_ready_id, block_id) - prev_block_count = block_counts[prev_block_id]; + // accumulate block_counts for prev_block_id + prev_grid_count += block_counts[prev_block_id]; } else if (prev_grids_ready_ballot == (prev_grids_ready_ballot & warp_index_mask_right)) { - // get grid_count for grid_count_ready_id - prev_block_count = grid_counts[prev_block_id]; + // accumulate grid_count for grid_count_ready_id + prev_grid_count += grid_counts[prev_block_id]; } - Index_type prev_grid_count = WarpReduce(s_temp_storage.warp_reduce_storage).Sum(prev_block_count); + prev_grid_count = WarpReduce(s_temp_storage.warp_reduce_storage).Sum(prev_grid_count); prev_grid_count = __shfl_sync(0xffffffffu, prev_grid_count, 0, warp_size); // broadcast output to all threads in warp if (last_thread) { From 716e2eb44bc5dc85c7e27b42dbe191a6336b3c45 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 8 Feb 2022 08:16:25 -0800 Subject: [PATCH 191/392] Update get block_id comment --- src/basic/INDEXLIST-Cuda.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp index 8dd21f97a..c3e2fc770 100644 --- a/src/basic/INDEXLIST-Cuda.cpp +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -219,7 +219,9 @@ __global__ void indexlist(Real_ptr x, Index_type* len, Index_type iend) { - // blocks do run in order in cuda an hip (this can be replaced with an atomic) + // blocks do start running in order in cuda an hip, so a block with a higher + // index can wait on a block with a lower index without deadlocking + // (replace with an atomicInc if this changes) const int block_id = blockIdx.x; Index_type val[items_per_thread] = { 0 }; From f7dc6ad2a90e9874b2de37d54a668d4fa1cfca7d Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 8 Feb 2022 16:18:00 -0800 Subject: [PATCH 192/392] Rework to avoid local memory usage --- src/basic/INDEXLIST-Cuda.cpp | 44 ++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp index c3e2fc770..31c84be2e 100644 --- a/src/basic/INDEXLIST-Cuda.cpp +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -224,29 +224,35 @@ __global__ void indexlist(Real_ptr x, // (replace with an atomicInc if this changes) const int block_id = blockIdx.x; - Index_type val[items_per_thread] = { 0 }; - - for (Index_type ti = 0, i = block_id * block_size * items_per_thread + threadIdx.x; - ti < items_per_thread && i < iend; - ++ti, i += block_size) { - if (INDEXLIST_CONDITIONAL) { - val[ti] = 1; + Index_type vals[items_per_thread]; + + for (Index_type ti = 0; ti < items_per_thread; ++ti) { + Index_type i = block_id * block_size * items_per_thread + ti * block_size + threadIdx.x; + Index_type val = 0; + if (i < iend) { + if (INDEXLIST_CONDITIONAL) { + val = 1; + } } + vals[ti] = val; } - Index_type exclusive[items_per_thread]; - Index_type inclusive[items_per_thread]; + Index_type exclusives[items_per_thread]; + Index_type inclusives[items_per_thread]; grid_scan( - block_id, val, exclusive, inclusive, block_counts, grid_counts, block_readys); - - for (Index_type ti = 0, i = block_id * block_size * items_per_thread + threadIdx.x; - ti < items_per_thread && i < iend; - ++ti, i += block_size) { - if (exclusive[ti] != inclusive[ti]) { - list[exclusive[ti]] = i; - } - if (i == iend-1) { - *len = inclusive[ti]; + block_id, vals, exclusives, inclusives, block_counts, grid_counts, 
block_readys); + + for (Index_type ti = 0; ti < items_per_thread; ++ti) { + Index_type i = block_id * block_size * items_per_thread + ti * block_size + threadIdx.x; + Index_type exclusive = exclusives[ti]; + Index_type inclusive = inclusives[ti]; + if (i < iend) { + if (exclusive != inclusive) { + list[exclusive] = i; + } + if (i == iend-1) { + *len = inclusive; + } } } } From 13e00f21a64be9d8be18abc7523c3e8ee4a019b3 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 8 Feb 2022 16:26:32 -0800 Subject: [PATCH 193/392] Add launch bounds --- src/basic/INDEXLIST-Cuda.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp index 31c84be2e..51d0243f5 100644 --- a/src/basic/INDEXLIST-Cuda.cpp +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -211,6 +211,7 @@ __device__ void grid_scan(const int block_id, } template < size_t block_size, size_t items_per_thread > +__launch_bounds__(block_size) __global__ void indexlist(Real_ptr x, Int_ptr list, Index_type* block_counts, From cceb46d52b7226714572f17abc43f8da3e3c2db4 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 11 Feb 2022 14:03:46 -0800 Subject: [PATCH 194/392] minor fixes in cuda indexlist --- src/basic/INDEXLIST-Cuda.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp index 51d0243f5..c083e2747 100644 --- a/src/basic/INDEXLIST-Cuda.cpp +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -60,8 +60,8 @@ __device__ void grid_scan(const int block_id, const bool last_thread = (threadIdx.x == block_size-1); const bool last_warp = (threadIdx.x >= block_size - warp_size); const int warp_index = (threadIdx.x % warp_size); - const int warp_index_mask = (1u << warp_index); - const int warp_index_mask_right = warp_index_mask | (warp_index_mask - 1); + const unsigned warp_index_mask = (1u << warp_index); + const unsigned warp_index_mask_right = warp_index_mask | (warp_index_mask - 1u); using BlockScan = cub::BlockScan; //, cub::BLOCK_SCAN_WARP_SCANS>; using BlockExchange = cub::BlockExchange; @@ -220,7 +220,7 @@ __global__ void indexlist(Real_ptr x, Index_type* len, Index_type iend) { - // blocks do start running in order in cuda an hip, so a block with a higher + // blocks do start running in order in cuda and hip, so a block with a higher // index can wait on a block with a lower index without deadlocking // (replace with an atomicInc if this changes) const int block_id = blockIdx.x; From 29effe99dadbcec1d9ab68e22da9ecf090480a27 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 11 Feb 2022 14:05:05 -0800 Subject: [PATCH 195/392] Add hip indexlist base impl Port of base cuda impl using rocmprim scan tuning params. 
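The port keeps the structure of the CUDA grid scan. The cub BlockScan, BlockExchange
and WarpReduce classes are replaced by their rocprim equivalents, which take their
temporary storage as a call argument instead of through the constructor. The wavefront
width is 64, so warp_size is 64, the ballot masks are unsigned long long, and __ballot
and __shfl are used in place of __ballot_sync and __shfl_sync. items_per_thread is 8
rather than 15, following the rocprim scan tunings. For example, the block scan step
changes from the cub form

    BlockScan(s_temp_storage.block_scan_storage).ExclusiveSum(val, exclusive);

to the rocprim form

    BlockScan().exclusive_scan(val, exclusive, Index_type{0},
                               s_temp_storage.block_scan_storage);

as shown in the diff below.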
--- src/basic/INDEXLIST-Hip.cpp | 285 +++++++++++++++++++++++++++++++++++- src/basic/INDEXLIST.cpp | 2 + 2 files changed, 286 insertions(+), 1 deletion(-) diff --git a/src/basic/INDEXLIST-Hip.cpp b/src/basic/INDEXLIST-Hip.cpp index c63b115fd..07a7f571f 100644 --- a/src/basic/INDEXLIST-Hip.cpp +++ b/src/basic/INDEXLIST-Hip.cpp @@ -14,6 +14,11 @@ #include "common/HipDataUtils.hpp" +#include +#include +#include +#include + #include namespace rajaperf @@ -21,9 +26,287 @@ namespace rajaperf namespace basic { + // + // Define thread block size for HIP execution + // + const size_t block_size = 256; + const size_t warp_size = 64; + const size_t items_per_thread = 8; + + +#define INDEXLIST_DATA_SETUP_HIP \ + allocAndInitHipDeviceData(x, m_x, iend); \ + allocAndInitHipDeviceData(list, m_list, iend); + +#define INDEXLIST_DATA_TEARDOWN_HIP \ + getHipDeviceData(m_list, list, iend); \ + deallocHipDeviceData(x); \ + deallocHipDeviceData(list); + + +// perform a grid scan on val and returns the result at each thread +// in exclusive and inclusive, note that val is used as scratch space +template < size_t block_size, size_t items_per_thread > +__device__ void grid_scan(const int block_id, + Index_type (&val)[items_per_thread], + Index_type (&exclusive)[items_per_thread], + Index_type (&inclusive)[items_per_thread], + Index_type* block_counts, + Index_type* grid_counts, + unsigned* block_readys) +{ + const bool first_block = (block_id == 0); + const bool last_block = (block_id == gridDim.x-1); + const bool last_thread = (threadIdx.x == block_size-1); + const bool last_warp = (threadIdx.x >= block_size - warp_size); + const int warp_index = (threadIdx.x % warp_size); + const unsigned long long warp_index_mask = (1ull << warp_index); + const unsigned long long warp_index_mask_right = warp_index_mask | (warp_index_mask - 1ull); + + using BlockScan = rocprim::block_scan; //, rocprim::block_scan_algorithm::reduce_then_scan>; + using BlockExchange = rocprim::block_exchange; + using WarpReduce = rocprim::warp_reduce; + + union SharedStorage { + typename BlockScan::storage_type block_scan_storage; + typename BlockExchange::storage_type block_exchange_storage; + typename WarpReduce::storage_type warp_reduce_storage; + }; + __shared__ SharedStorage s_temp_storage; + + + BlockExchange().striped_to_blocked(val, val, s_temp_storage.block_exchange_storage); + __syncthreads(); + + + BlockScan().exclusive_scan(val, exclusive, Index_type{0}, s_temp_storage.block_scan_storage); + __syncthreads(); + + for (int ti = 0; ti < items_per_thread; ++ti) { + inclusive[ti] = exclusive[ti] + val[ti]; + } + + BlockExchange().blocked_to_striped(exclusive, exclusive, s_temp_storage.block_exchange_storage); + __syncthreads(); + BlockExchange().blocked_to_striped(inclusive, inclusive, s_temp_storage.block_exchange_storage); + __syncthreads(); + if (first_block) { + + if (!last_block && last_thread) { + block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block + grid_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for grid through block + __threadfence(); // ensure block_counts, grid_counts ready (release) + atomicExch(&block_readys[block_id], 2u); // write block_counts, grid_counts are ready + } + + } else { + + if (!last_block && last_thread) { + block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block + __threadfence(); // ensure block_counts ready (release) + atomicExch(&block_readys[block_id], 1u); // write block_counts is 
ready + } + + __shared__ volatile Index_type s_prev_grid_count; + + // get prev_grid_count using last warp in block + if (last_warp) { + + Index_type prev_grid_count = 0; + + // accumulate previous block counts into registers of warp + + int prev_block_base_id = block_id - warp_size; + + unsigned prev_block_ready = 0u; + unsigned long long prev_blocks_ready_ballot = 0ull; + unsigned long long prev_grids_ready_ballot = 0ull; + + // accumulate full warp worths of block counts + // stop if run out of full warps of a grid count is ready + while (prev_block_base_id >= 0) { + + const int prev_block_id = prev_block_base_id + warp_index; + + // ensure previous block_counts are ready + do { + prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); + + prev_blocks_ready_ballot = __ballot(prev_block_ready >= 1u); + + } while (prev_blocks_ready_ballot != 0xffffffffffffffffull); + + prev_grids_ready_ballot = __ballot(prev_block_ready == 2u); + + if (prev_grids_ready_ballot != 0ull) { + break; + } + + __threadfence(); // ensure block_counts or grid_counts ready (acquire) + + // accumulate block_counts for prev_block_id + prev_grid_count += block_counts[prev_block_id]; + + prev_block_ready = 0u; + + prev_block_base_id -= warp_size; + } + + const int prev_block_id = prev_block_base_id + warp_index; + + // ensure previous block_counts are ready + // this checks that block counts is ready for all blocks above + // the highest grid count that is ready + while (~prev_blocks_ready_ballot >= prev_grids_ready_ballot) { + + if (prev_block_id >= 0) { + prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); + } + + prev_blocks_ready_ballot = __ballot(prev_block_ready >= 1u); + prev_grids_ready_ballot = __ballot(prev_block_ready == 2u); + } + __threadfence(); // ensure block_counts or grid_counts ready (acquire) + + // read one grid_count from a block with id grid_count_ready_id + // and read the block_counts from blocks with higher ids. 
+ if (warp_index_mask > prev_grids_ready_ballot) { + // accumulate block_counts for prev_block_id + prev_grid_count += block_counts[prev_block_id]; + } else if (prev_grids_ready_ballot == (prev_grids_ready_ballot & warp_index_mask_right)) { + // accumulate grid_count for grid_count_ready_id + prev_grid_count += grid_counts[prev_block_id]; + } + + + WarpReduce().reduce(prev_grid_count, prev_grid_count, s_temp_storage.warp_reduce_storage); + prev_grid_count = __shfl(prev_grid_count, 0, warp_size); // broadcast output to all threads in warp + + if (last_thread) { + + if (!last_block) { + grid_counts[block_id] = prev_grid_count + inclusive[items_per_thread-1]; // write inclusive scan result for grid through block + __threadfence(); // ensure grid_counts ready (release) + atomicExch(&block_readys[block_id], 2u); // write grid_counts is ready + } + + s_prev_grid_count = prev_grid_count; + } + } + + __syncthreads(); + Index_type prev_grid_count = s_prev_grid_count; + + for (int ti = 0; ti < items_per_thread; ++ti) { + exclusive[ti] = prev_grid_count + exclusive[ti]; + inclusive[ti] = prev_grid_count + inclusive[ti]; + } + + if (last_block) { + for (int i = threadIdx.x; i < gridDim.x-1; i += block_size) { + while (atomicCAS(&block_readys[i], 2u, 0u) != 2u); + } + } + } +} + +template < size_t block_size, size_t items_per_thread > +__launch_bounds__(block_size) +__global__ void indexlist(Real_ptr x, + Int_ptr list, + Index_type* block_counts, + Index_type* grid_counts, + unsigned* block_readys, + Index_type* len, + Index_type iend) +{ + // blocks do start running in order in cuda and hip, so a block with a higher + // index can wait on a block with a lower index without deadlocking + // (replace with an atomicInc if this changes) + const int block_id = blockIdx.x; + + Index_type vals[items_per_thread]; + + for (Index_type ti = 0; ti < items_per_thread; ++ti) { + Index_type i = block_id * block_size * items_per_thread + ti * block_size + threadIdx.x; + Index_type val = 0; + if (i < iend) { + if (INDEXLIST_CONDITIONAL) { + val = 1; + } + } + vals[ti] = val; + } + + Index_type exclusives[items_per_thread]; + Index_type inclusives[items_per_thread]; + grid_scan( + block_id, vals, exclusives, inclusives, block_counts, grid_counts, block_readys); + + for (Index_type ti = 0; ti < items_per_thread; ++ti) { + Index_type i = block_id * block_size * items_per_thread + ti * block_size + threadIdx.x; + Index_type exclusive = exclusives[ti]; + Index_type inclusive = inclusives[ti]; + if (i < iend) { + if (exclusive != inclusive) { + list[exclusive] = i; + } + if (i == iend-1) { + *len = inclusive; + } + } + } +} + void INDEXLIST::runHipVariant(VariantID vid) { - { + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + INDEXLIST_DATA_SETUP; + + if ( vid == Base_HIP ) { + + INDEXLIST_DATA_SETUP_HIP; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT((iend-ibegin), block_size*items_per_thread); + const size_t shmem_size = 0; + + Index_type* len; + allocHipPinnedData(len, 1); + Index_type* block_counts; + allocHipDeviceData(block_counts, grid_size); + Index_type* grid_counts; + allocHipDeviceData(grid_counts, grid_size); + unsigned* block_readys; + allocHipDeviceData(block_readys, grid_size); + hipErrchk( hipMemset(block_readys, 0, sizeof(unsigned)*grid_size) ); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + indexlist + <<>>( + x+ibegin, list+ibegin, + block_counts, grid_counts, block_readys, + len, 
iend-ibegin ); + hipErrchk( hipGetLastError() ); + + hipErrchk( hipDeviceSynchronize() ); + m_len = *len; + + } + stopTimer(); + + deallocHipPinnedData(len); + deallocHipDeviceData(block_counts); + deallocHipDeviceData(grid_counts); + deallocHipDeviceData(block_readys); + + INDEXLIST_DATA_TEARDOWN_HIP; + + } else { std::cout << "\n INDEXLIST : Unknown variant id = " << vid << std::endl; } } diff --git a/src/basic/INDEXLIST.cpp b/src/basic/INDEXLIST.cpp index 5e5ba32cc..b21de6ae7 100644 --- a/src/basic/INDEXLIST.cpp +++ b/src/basic/INDEXLIST.cpp @@ -46,6 +46,8 @@ INDEXLIST::INDEXLIST(const RunParams& params) #endif setVariantDefined( Base_CUDA ); + + setVariantDefined( Base_HIP ); } INDEXLIST::~INDEXLIST() From 1350e933b31a47be80ad432f01d45bc5614159e5 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 11 Feb 2022 14:45:10 -0800 Subject: [PATCH 196/392] Reduce static shared memory use in indexlist Combine s_prev_grid_count with other static shared memory --- src/basic/INDEXLIST-Cuda.cpp | 7 +++---- src/basic/INDEXLIST-Hip.cpp | 7 +++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp index c083e2747..50b243e44 100644 --- a/src/basic/INDEXLIST-Cuda.cpp +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -71,6 +71,7 @@ __device__ void grid_scan(const int block_id, typename BlockScan::TempStorage block_scan_storage; typename BlockExchange::TempStorage block_exchange_storage; typename WarpReduce::TempStorage warp_reduce_storage; + volatile Index_type prev_grid_count; }; __shared__ SharedStorage s_temp_storage; @@ -107,8 +108,6 @@ __device__ void grid_scan(const int block_id, atomicExch(&block_readys[block_id], 1u); // write block_counts is ready } - __shared__ volatile Index_type s_prev_grid_count; - // get prev_grid_count using last warp in block if (last_warp) { @@ -190,12 +189,12 @@ __device__ void grid_scan(const int block_id, atomicExch(&block_readys[block_id], 2u); // write grid_counts is ready } - s_prev_grid_count = prev_grid_count; + s_temp_storage.prev_grid_count = prev_grid_count; } } __syncthreads(); - Index_type prev_grid_count = s_prev_grid_count; + Index_type prev_grid_count = s_temp_storage.prev_grid_count; for (int ti = 0; ti < items_per_thread; ++ti) { exclusive[ti] = prev_grid_count + exclusive[ti]; diff --git a/src/basic/INDEXLIST-Hip.cpp b/src/basic/INDEXLIST-Hip.cpp index 07a7f571f..b76901608 100644 --- a/src/basic/INDEXLIST-Hip.cpp +++ b/src/basic/INDEXLIST-Hip.cpp @@ -71,6 +71,7 @@ __device__ void grid_scan(const int block_id, typename BlockScan::storage_type block_scan_storage; typename BlockExchange::storage_type block_exchange_storage; typename WarpReduce::storage_type warp_reduce_storage; + volatile Index_type prev_grid_count; }; __shared__ SharedStorage s_temp_storage; @@ -107,8 +108,6 @@ __device__ void grid_scan(const int block_id, atomicExch(&block_readys[block_id], 1u); // write block_counts is ready } - __shared__ volatile Index_type s_prev_grid_count; - // get prev_grid_count using last warp in block if (last_warp) { @@ -190,12 +189,12 @@ __device__ void grid_scan(const int block_id, atomicExch(&block_readys[block_id], 2u); // write grid_counts is ready } - s_prev_grid_count = prev_grid_count; + s_temp_storage.prev_grid_count = prev_grid_count; } } __syncthreads(); - Index_type prev_grid_count = s_prev_grid_count; + Index_type prev_grid_count = s_temp_storage.prev_grid_count; for (int ti = 0; ti < items_per_thread; ++ti) { exclusive[ti] = prev_grid_count + exclusive[ti]; From 
4b33bf1eae2ba0e0d9bb82dd49bf86bc165175a8 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Tue, 15 Feb 2022 15:37:06 -0600 Subject: [PATCH 197/392] adding function to allow compiler time looping over blocksizes in a camp int pack --- src/common/GPUUtils.hpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp index 560801ace..0fa67f118 100644 --- a/src/common/GPUUtils.hpp +++ b/src/common/GPUUtils.hpp @@ -243,6 +243,23 @@ struct ExactSqrt static constexpr bool valid() { return sqrt(I)*sqrt(I) == I; } }; +// helper function to get the Nth value from a camp int pack: +template +static constexpr size_t GetN(camp::int_seq sizes){ +return std::integral_constant(std::array { I... }) >(); +} +//compile time loop over an integer sequence +//this allows for creating a loop over a compile time +template +static void seq_for(camp::int_seq, Func func) { + (static_cast(f(std::integral_constant{})), ...); +} +template +static void seq_for(Func func) { + for_seq(camp::int_seq{}, func); +} + // A camp::int_seq of size_t's that is rajaperf::configuration::gpu_block_sizes // if rajaperf::configuration::gpu_block_sizes is not empty // and a camp::int_seq of default_block_size otherwise From b9e21fa8355dfa35739a92ce7b6b2ca430cb9ca5 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Tue, 15 Feb 2022 16:14:07 -0600 Subject: [PATCH 198/392] fixing typo in seq_for naming --- src/common/GPUUtils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp index 0fa67f118..5d6c9e1f6 100644 --- a/src/common/GPUUtils.hpp +++ b/src/common/GPUUtils.hpp @@ -257,7 +257,7 @@ static void seq_for(camp::int_seq, Func func) { } template static void seq_for(Func func) { - for_seq(camp::int_seq{}, func); + seq_for(camp::int_seq{}, func); } // A camp::int_seq of size_t's that is rajaperf::configuration::gpu_block_sizes From 3efff3ff06033db9e8e6054d47b1199455166dd0 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Tue, 15 Feb 2022 16:20:13 -0600 Subject: [PATCH 199/392] replacing std::integral_constant with camp::integral_constant --- src/common/GPUUtils.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp index 5d6c9e1f6..41956c9a6 100644 --- a/src/common/GPUUtils.hpp +++ b/src/common/GPUUtils.hpp @@ -246,14 +246,14 @@ struct ExactSqrt // helper function to get the Nth value from a camp int pack: template static constexpr size_t GetN(camp::int_seq sizes){ -return std::integral_constant(std::array { I... }) >(); } //compile time loop over an integer sequence //this allows for creating a loop over a compile time template static void seq_for(camp::int_seq, Func func) { - (static_cast(f(std::integral_constant{})), ...); + (static_cast(f(camp::integral_constant{})), ...); } template static void seq_for(Func func) { From 361a187d1efd6b2772a3438bfec9852a345748d6 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Tue, 15 Feb 2022 17:48:25 -0600 Subject: [PATCH 200/392] replace camp::integral_constant with camp::sink to avoid pack fold expression (c++17) --- src/common/GPUUtils.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp index 41956c9a6..d6bacb890 100644 --- a/src/common/GPUUtils.hpp +++ b/src/common/GPUUtils.hpp @@ -250,10 +250,10 @@ return camp::integral_constant(std::array { I... 
}) >(); } //compile time loop over an integer sequence -//this allows for creating a loop over a compile time +//this allows for creating a loop over a compile time constant variable template static void seq_for(camp::int_seq, Func func) { - (static_cast(f(camp::integral_constant{})), ...); + static_cast(camp::sink((f(camp::integral_constant{}), 0)...)); } template static void seq_for(Func func) { From 7ef7057076d94202c8eb611d1313b93efe043ae6 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 16 Feb 2022 10:41:44 -0800 Subject: [PATCH 201/392] Rename openmp5 scan guard --- CMakeLists.txt | 2 +- src/algorithm/SCAN-OMP.cpp | 2 +- src/algorithm/SCAN.cpp | 2 +- src/basic/INDEXLIST-OMP.cpp | 2 +- src/basic/INDEXLIST.cpp | 2 +- src/basic/INDEXLIST_3LOOP-OMP.cpp | 2 +- src/basic/INDEXLIST_3LOOP.cpp | 2 +- src/rajaperf_config.hpp.in | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index db860ba20..eeec4bfb4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,7 +35,7 @@ include(blt/SetupBLT.cmake) # cmake_dependent_option(RAJA_PERFSUITE_ENABLE_MPI "Build with MPI" On "ENABLE_MPI" Off) -cmake_dependent_option(RAJA_PERFSUITE_ENABLE_OPENMP_SCAN "Build OpenMP scan variants" Off "ENABLE_OPENMP" Off) +cmake_dependent_option(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN "Build OpenMP scan variants" Off "ENABLE_OPENMP" Off) # # Define RAJA settings... diff --git a/src/algorithm/SCAN-OMP.cpp b/src/algorithm/SCAN-OMP.cpp index 38fb09d89..ffa2ace05 100644 --- a/src/algorithm/SCAN-OMP.cpp +++ b/src/algorithm/SCAN-OMP.cpp @@ -30,7 +30,7 @@ void SCAN::runOpenMPVariant(VariantID vid) switch ( vid ) { -#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP_SCAN) +#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) case Base_OpenMP : { startTimer(); diff --git a/src/algorithm/SCAN.cpp b/src/algorithm/SCAN.cpp index 329f17548..cc026329e 100644 --- a/src/algorithm/SCAN.cpp +++ b/src/algorithm/SCAN.cpp @@ -43,7 +43,7 @@ SCAN::SCAN(const RunParams& params) setVariantDefined( Lambda_Seq ); setVariantDefined( RAJA_Seq ); -#if defined(_OPENMP) && _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP_SCAN) +#if defined(_OPENMP) && _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) setVariantDefined( Base_OpenMP ); setVariantDefined( Lambda_OpenMP ); #endif diff --git a/src/basic/INDEXLIST-OMP.cpp b/src/basic/INDEXLIST-OMP.cpp index 2f5b41461..c7776704a 100644 --- a/src/basic/INDEXLIST-OMP.cpp +++ b/src/basic/INDEXLIST-OMP.cpp @@ -29,7 +29,7 @@ void INDEXLIST::runOpenMPVariant(VariantID vid) switch ( vid ) { -#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP_SCAN) +#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) case Base_OpenMP : { startTimer(); diff --git a/src/basic/INDEXLIST.cpp b/src/basic/INDEXLIST.cpp index b21de6ae7..55490d161 100644 --- a/src/basic/INDEXLIST.cpp +++ b/src/basic/INDEXLIST.cpp @@ -40,7 +40,7 @@ INDEXLIST::INDEXLIST(const RunParams& params) setVariantDefined( Lambda_Seq ); setVariantDefined( RAJA_Seq ); -#if defined(_OPENMP) && _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP_SCAN) +#if defined(_OPENMP) && _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) setVariantDefined( Base_OpenMP ); setVariantDefined( Lambda_OpenMP ); #endif diff --git a/src/basic/INDEXLIST_3LOOP-OMP.cpp b/src/basic/INDEXLIST_3LOOP-OMP.cpp index 58762aa2e..8c97c5b14 100644 --- a/src/basic/INDEXLIST_3LOOP-OMP.cpp +++ b/src/basic/INDEXLIST_3LOOP-OMP.cpp @@ -36,7 +36,7 @@ void 
INDEXLIST_3LOOP::runOpenMPVariant(VariantID vid) switch ( vid ) { -#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP_SCAN) +#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) case Base_OpenMP : { INDEXLIST_3LOOP_DATA_SETUP_OMP; diff --git a/src/basic/INDEXLIST_3LOOP.cpp b/src/basic/INDEXLIST_3LOOP.cpp index 3b5216392..5d84465fc 100644 --- a/src/basic/INDEXLIST_3LOOP.cpp +++ b/src/basic/INDEXLIST_3LOOP.cpp @@ -45,7 +45,7 @@ INDEXLIST_3LOOP::INDEXLIST_3LOOP(const RunParams& params) setVariantDefined( Lambda_Seq ); setVariantDefined( RAJA_Seq ); -#if defined(_OPENMP) && _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP_SCAN) +#if defined(_OPENMP) && _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) setVariantDefined( Base_OpenMP ); setVariantDefined( Lambda_OpenMP ); #endif diff --git a/src/rajaperf_config.hpp.in b/src/rajaperf_config.hpp.in index 76120c81a..11434ef52 100644 --- a/src/rajaperf_config.hpp.in +++ b/src/rajaperf_config.hpp.in @@ -25,7 +25,7 @@ #include #cmakedefine RAJA_PERFSUITE_ENABLE_MPI -#cmakedefine RAJA_PERFSUITE_ENABLE_OPENMP_SCAN +#cmakedefine RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN namespace rajaperf { From 512aaa4d05ae6b8ee2f233b8f91485884b4b268d Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 18 Feb 2022 11:21:09 -0800 Subject: [PATCH 202/392] Add non-omp5 SCAN implementation --- src/algorithm/SCAN-OMP.cpp | 94 +++++++++++++++++++++++++++++++++++--- src/algorithm/SCAN.cpp | 2 - 2 files changed, 87 insertions(+), 9 deletions(-) diff --git a/src/algorithm/SCAN-OMP.cpp b/src/algorithm/SCAN-OMP.cpp index ffa2ace05..990afb8e4 100644 --- a/src/algorithm/SCAN-OMP.cpp +++ b/src/algorithm/SCAN-OMP.cpp @@ -11,13 +11,13 @@ #include "RAJA/RAJA.hpp" #include +#include namespace rajaperf { namespace algorithm { - void SCAN::runOpenMPVariant(VariantID vid) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -30,19 +30,55 @@ void SCAN::runOpenMPVariant(VariantID vid) switch ( vid ) { -#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) case Base_OpenMP : { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { SCAN_PROLOGUE; + +#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) #pragma omp parallel for reduction(inscan, +:scan_var) for (Index_type i = ibegin; i < iend; ++i ) { y[i] = scan_var; #pragma omp scan exclusive(scan_var) scan_var += x[i]; } +#else + const Index_type n = iend - ibegin; + const int p0 = static_cast(std::min(n, static_cast(omp_get_max_threads()))); + ::std::vector data(p0); + + #pragma omp parallel num_threads(p0) + { + const int p = omp_get_num_threads(); + const int pid = omp_get_thread_num(); + const Index_type step = n / p; + const Index_type local_begin = pid * step + ibegin; + const Index_type local_end = (pid == p-1) ? iend : (pid+1) * step + ibegin; + + Real_type local_scan_var = (pid == 0) ? 
scan_var : 0; + for (Index_type i = local_begin; i < local_end; ++i ) { + y[i] = local_scan_var; + local_scan_var += x[i]; + } + data[pid] = local_scan_var; + + #pragma omp barrier + + if (pid != 0) { + + Real_type prev_sum = 0; + for (int ip = 0; ip < pid; ++ip) { + prev_sum += data[ip]; + } + + for (Index_type i = local_begin; i < local_end; ++i ) { + y[i] += prev_sum; + } + } + } +#endif } stopTimer(); @@ -55,23 +91,65 @@ void SCAN::runOpenMPVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - SCAN_PROLOGUE; - auto scan_lam = [=](Index_type i, Real_type& scan_var) { - y[i] = scan_var; + auto scan_lam_input = [=](Index_type i) { return x[i]; }; + auto scan_lam_sum_output = [=](Index_type i, Real_type sum_var) { + y[i] += sum_var; + }; + auto scan_lam_output = [=](Index_type i, Real_type scan_var) { + y[i] = scan_var; + }; + + SCAN_PROLOGUE; + +#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) #pragma omp parallel for reduction(inscan, +:scan_var) for (Index_type i = ibegin; i < iend; ++i ) { + scan_lam_output(i, scan_var); #pragma omp scan exclusive(scan_var) - scan_var += scan_lam(i, scan_var); + scan_var += scan_lam_input(i); + } +#else + const Index_type n = iend - ibegin; + const int p0 = static_cast(std::min(n, static_cast(omp_get_max_threads()))); + ::std::vector data(p0); + + #pragma omp parallel num_threads(p0) + { + const int p = omp_get_num_threads(); + const int pid = omp_get_thread_num(); + const Index_type step = n / p; + const Index_type local_begin = pid * step + ibegin; + const Index_type local_end = (pid == p-1) ? iend : (pid+1) * step + ibegin; + + Real_type local_scan_var = (pid == 0) ? scan_var : 0; + for (Index_type i = local_begin; i < local_end; ++i ) { + scan_lam_output(i, local_scan_var); + local_scan_var += scan_lam_input(i); + } + data[pid] = local_scan_var; + + #pragma omp barrier + + if (pid != 0) { + Real_type prev_sum = 0; + for (int ip = 0; ip < pid; ++ip) { + prev_sum += data[ip]; + } + + for (Index_type i = local_begin; i < local_end; ++i ) { + scan_lam_sum_output(i, prev_sum); + } + } } +#endif } stopTimer(); break; } -#endif case RAJA_OpenMP : { @@ -92,6 +170,8 @@ void SCAN::runOpenMPVariant(VariantID vid) } +#else + RAJA_UNUSED_VAR(vid); #endif } diff --git a/src/algorithm/SCAN.cpp b/src/algorithm/SCAN.cpp index cc026329e..9a309b039 100644 --- a/src/algorithm/SCAN.cpp +++ b/src/algorithm/SCAN.cpp @@ -43,10 +43,8 @@ SCAN::SCAN(const RunParams& params) setVariantDefined( Lambda_Seq ); setVariantDefined( RAJA_Seq ); -#if defined(_OPENMP) && _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) setVariantDefined( Base_OpenMP ); setVariantDefined( Lambda_OpenMP ); -#endif setVariantDefined( RAJA_OpenMP ); setVariantDefined( Base_CUDA ); From 6d8364daa9c4476dcc687b8b2c6983432b129f14 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 18 Feb 2022 11:23:02 -0800 Subject: [PATCH 203/392] Only rank 0 creates files Previously all ranks attempted to create files which could cause a failure to make files error. 
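The synchronization in recursiveMkdir is a single matched MPI_Barrier: ranks other
than zero enter it immediately and block, while rank 0 only enters it after the
directory tree has been created, so the directories exist before any other rank
proceeds. A minimal sketch of the pattern added below (the MPI calls are only
compiled in when RAJA_PERFSUITE_ENABLE_MPI is defined):

    int rank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if (rank != 0) {
      MPI_Barrier(MPI_COMM_WORLD);  // wait for rank 0 to make the directories
    }

    // ... directory creation code; rank 0 runs it before the other ranks are released ...

    if (rank == 0) {
      MPI_Barrier(MPI_COMM_WORLD);  // signal that the directories now exist
    }

Because the barrier is guarded by the MPI define, the non-MPI build path is unchanged.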
--- src/common/OutputUtils.cpp | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/common/OutputUtils.cpp b/src/common/OutputUtils.cpp index 5aea14855..98b48ef38 100644 --- a/src/common/OutputUtils.cpp +++ b/src/common/OutputUtils.cpp @@ -9,6 +9,10 @@ #include "RAJAPerfSuite.hpp" #include "OutputUtils.hpp" +#ifdef RAJA_PERFSUITE_ENABLE_MPI +#include +#endif + #include #include #include @@ -27,6 +31,16 @@ namespace rajaperf */ std::string recursiveMkdir(const std::string& in_path) { + int rank = 0; +#ifdef RAJA_PERFSUITE_ENABLE_MPI + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + // Processes wait for rank 0 to make the directories before proceeding + if (rank != 0) { + MPI_Barrier(MPI_COMM_WORLD); + } +#endif + std::string dir; std::string path = in_path; @@ -125,6 +139,13 @@ std::string recursiveMkdir(const std::string& in_path) delete[] path_buf; +#ifdef RAJA_PERFSUITE_ENABLE_MPI + // Rank 0 lets the other processes know it made the directories + if (rank == 0) { + MPI_Barrier(MPI_COMM_WORLD); + } +#endif + return outpath; } From e3ee95589b628b2e70b136e2905463f1d4409ae7 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 18 Feb 2022 11:23:23 -0800 Subject: [PATCH 204/392] Add toss3 mvapich2 gcc script --- scripts/lc-builds/toss3_mvapich2_gcc.sh | 49 +++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100755 scripts/lc-builds/toss3_mvapich2_gcc.sh diff --git a/scripts/lc-builds/toss3_mvapich2_gcc.sh b/scripts/lc-builds/toss3_mvapich2_gcc.sh new file mode 100755 index 000000000..ffa299786 --- /dev/null +++ b/scripts/lc-builds/toss3_mvapich2_gcc.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJAPerf/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +if [ "$1" == "" ]; then + echo + echo "You must pass a compiler version number to script. For example," + echo " toss3_mvapich2_gcc.sh 2.3 10.2.1" + exit +fi + +MPI_VER=$1 +COMP_VER=$2 +shift 2 + +BUILD_SUFFIX=lc_toss3-mvapich2-${MPI_VER}-gcc-${COMP_VER} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss3/gcc_X.cmake + +echo +echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" +echo + +rm -rf build_${BUILD_SUFFIX} 2>/dev/null +mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} + +module load cmake/3.14.5 + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DMPI_CXX_COMPILER=/usr/tce/packages/mvapich2/mvapich2-${MPI_VER}-gcc-${COMP_VER}/bin/mpic++ \ + -DCMAKE_CXX_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_VER}/bin/g++ \ + -C ${RAJA_HOSTCONFIG} \ + -DENABLE_MPI=On \ + -DENABLE_OPENMP=On \ + -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ + "$@" \ + .. 
+ +echo +echo "***********************************************************************" +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" +echo "***********************************************************************" From 64db6b411f4498875c94e24ceef566904aae2863 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 18 Feb 2022 13:32:49 -0800 Subject: [PATCH 205/392] Remind user to run with mpi in build script --- scripts/lc-builds/blueos_spectrum_nvcc_clang.sh | 4 ++-- scripts/lc-builds/toss3_mvapich2_gcc.sh | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh b/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh index 33b173b31..18f33670c 100755 --- a/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh +++ b/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh @@ -61,8 +61,8 @@ echo "***********************************************************************" echo echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" echo -echo " Please note that you have to disable CUDA GPU hooks when you run" -echo " the RAJA Perf Suite; for example," +echo " Please note that you have to run with mpi and disable CUDA GPU hooks" +echo " when you run the RAJA Perf Suite; for example," echo echo " lrun -n4 ./bin/raja-perf.exe" echo diff --git a/scripts/lc-builds/toss3_mvapich2_gcc.sh b/scripts/lc-builds/toss3_mvapich2_gcc.sh index ffa299786..4ade9549c 100755 --- a/scripts/lc-builds/toss3_mvapich2_gcc.sh +++ b/scripts/lc-builds/toss3_mvapich2_gcc.sh @@ -45,5 +45,12 @@ cmake \ echo echo "***********************************************************************" +echo echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" +echo +echo " Please note that you have to run with mpi when you run" +echo " the RAJA Perf Suite; for example," +echo +echo " srun -n2 ./bin/raja-perf.exe" +echo echo "***********************************************************************" From ce8ddf8c052f5a9bd3b3527740fafe38a20211a9 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 18 Feb 2022 15:14:19 -0800 Subject: [PATCH 206/392] Move allocations in SCAN and rename --- src/algorithm/SCAN-OMP.cpp | 44 +++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/src/algorithm/SCAN-OMP.cpp b/src/algorithm/SCAN-OMP.cpp index 990afb8e4..8ddec7f36 100644 --- a/src/algorithm/SCAN-OMP.cpp +++ b/src/algorithm/SCAN-OMP.cpp @@ -32,6 +32,13 @@ void SCAN::runOpenMPVariant(VariantID vid) case Base_OpenMP : { +#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) +#else + const Index_type n = iend - ibegin; + const int p0 = static_cast(std::min(n, static_cast(omp_get_max_threads()))); + ::std::vector thread_sums(p0); +#endif + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -45,10 +52,6 @@ void SCAN::runOpenMPVariant(VariantID vid) scan_var += x[i]; } #else - const Index_type n = iend - ibegin; - const int p0 = static_cast(std::min(n, static_cast(omp_get_max_threads()))); - ::std::vector data(p0); - #pragma omp parallel num_threads(p0) { const int p = omp_get_num_threads(); @@ -62,7 +65,7 @@ void SCAN::runOpenMPVariant(VariantID vid) y[i] = local_scan_var; local_scan_var += x[i]; } - data[pid] = local_scan_var; + thread_sums[pid] = local_scan_var; #pragma omp barrier @@ -70,7 +73,7 @@ void SCAN::runOpenMPVariant(VariantID vid) Real_type prev_sum = 0; for (int ip = 0; ip < pid; ++ip) { - prev_sum += data[ip]; + 
prev_sum += thread_sums[ip]; } for (Index_type i = local_begin; i < local_end; ++i ) { @@ -88,9 +91,12 @@ void SCAN::runOpenMPVariant(VariantID vid) case Lambda_OpenMP : { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - +#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) + auto scan_lam = [=](Index_type i, Real_type scan_var) { + y[i] = scan_var; + return x[i]; + }; +#else auto scan_lam_input = [=](Index_type i) { return x[i]; }; @@ -101,20 +107,24 @@ void SCAN::runOpenMPVariant(VariantID vid) y[i] = scan_var; }; + const Index_type n = iend - ibegin; + const int p0 = static_cast(std::min(n, static_cast(omp_get_max_threads()))); + ::std::vector thread_sums(p0); +#endif + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + SCAN_PROLOGUE; #if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) #pragma omp parallel for reduction(inscan, +:scan_var) for (Index_type i = ibegin; i < iend; ++i ) { - scan_lam_output(i, scan_var); #pragma omp scan exclusive(scan_var) - scan_var += scan_lam_input(i); + scan_var += scan_lam(i, scan_var); } #else - const Index_type n = iend - ibegin; - const int p0 = static_cast(std::min(n, static_cast(omp_get_max_threads()))); - ::std::vector data(p0); - #pragma omp parallel num_threads(p0) { const int p = omp_get_num_threads(); @@ -128,14 +138,14 @@ void SCAN::runOpenMPVariant(VariantID vid) scan_lam_output(i, local_scan_var); local_scan_var += scan_lam_input(i); } - data[pid] = local_scan_var; + thread_sums[pid] = local_scan_var; #pragma omp barrier if (pid != 0) { Real_type prev_sum = 0; for (int ip = 0; ip < pid; ++ip) { - prev_sum += data[ip]; + prev_sum += thread_sums[ip]; } for (Index_type i = local_begin; i < local_end; ++i ) { From 6d2d32a54eb11b9c7423ded0416d408243e1b802 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 18 Feb 2022 15:14:49 -0800 Subject: [PATCH 207/392] Implement omp<5 scan in INDEXLIST --- src/basic/INDEXLIST-OMP.cpp | 112 ++++++++++++++++++++++++++++++++++-- src/basic/INDEXLIST.cpp | 2 - 2 files changed, 108 insertions(+), 6 deletions(-) diff --git a/src/basic/INDEXLIST-OMP.cpp b/src/basic/INDEXLIST-OMP.cpp index c7776704a..576002da7 100644 --- a/src/basic/INDEXLIST-OMP.cpp +++ b/src/basic/INDEXLIST-OMP.cpp @@ -29,14 +29,22 @@ void INDEXLIST::runOpenMPVariant(VariantID vid) switch ( vid ) { -#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) case Base_OpenMP : { +#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) +#else + const Index_type n = iend - ibegin; + ::std::vector tmp_scan(n); + const int p0 = static_cast(std::min(n, static_cast(omp_get_max_threads()))); + ::std::vector thread_sums(p0); +#endif + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { Index_type count = 0; +#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) #pragma omp parallel for reduction(inscan, +:count) for (Index_type i = ibegin; i < iend; ++i ) { Index_type inc = 0; @@ -47,6 +55,47 @@ void INDEXLIST::runOpenMPVariant(VariantID vid) #pragma omp scan exclusive(count) count += inc; } +#else + #pragma omp parallel num_threads(p0) + { + const int p = omp_get_num_threads(); + const int pid = omp_get_thread_num(); + const Index_type step = n / p; + const Index_type local_begin = pid * step + ibegin; + const Index_type local_end = (pid == p-1) ? 
iend : (pid+1) * step + ibegin; + + Index_type local_sum_var = 0; + for (Index_type i = local_begin; i < local_end; ++i ) { + + Index_type inc = 0; + if (INDEXLIST_CONDITIONAL) { + inc = 1; + } + tmp_scan[i] = inc; + local_sum_var += inc; + } + thread_sums[pid] = local_sum_var; + + #pragma omp barrier + + Index_type local_count_var = 0; + for (int ip = 0; ip < pid; ++ip) { + local_count_var += thread_sums[ip]; + } + + for (Index_type i = local_begin; i < local_end; ++i ) { + Index_type inc = tmp_scan[i]; + if (inc) { + list[local_count_var] = i ; + } + local_count_var += inc; + } + + if (pid == p-1) { + count = local_count_var; + } + } +#endif m_len = count; @@ -58,7 +107,8 @@ void INDEXLIST::runOpenMPVariant(VariantID vid) case Lambda_OpenMP : { - auto indexlist_base_lam = [=](Index_type i, Index_type& count) { +#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) + auto indexlist_lam = [=](Index_type i, Index_type count) { Index_type inc = 0; if (INDEXLIST_CONDITIONAL) { list[count] = i ; @@ -66,17 +116,72 @@ void INDEXLIST::runOpenMPVariant(VariantID vid) } return inc; }; +#else + auto indexlist_lam_input = [=](Index_type i) { + Index_type inc = 0; + if (INDEXLIST_CONDITIONAL) { + inc = 1; + } + return inc; + }; + auto indexlist_lam_output = [=](Index_type i, Index_type count) { + list[count] = i ; + }; + const Index_type n = iend - ibegin; + ::std::vector tmp_scan(n); + const int p0 = static_cast(std::min(n, static_cast(omp_get_max_threads()))); + ::std::vector thread_sums(p0); +#endif startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { Index_type count = 0; +#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) #pragma omp parallel for reduction(inscan, +:count) for (Index_type i = ibegin; i < iend; ++i ) { #pragma omp scan exclusive(count) - count += indexlist_base_lam(i, count); + count += indexlist_lam(i, count); + } +#else + #pragma omp parallel num_threads(p0) + { + const int p = omp_get_num_threads(); + const int pid = omp_get_thread_num(); + const Index_type step = n / p; + const Index_type local_begin = pid * step + ibegin; + const Index_type local_end = (pid == p-1) ? 
iend : (pid+1) * step + ibegin; + + Index_type local_sum_var = 0; + for (Index_type i = local_begin; i < local_end; ++i ) { + + Index_type inc = indexlist_lam_input(i); + tmp_scan[i] = inc; + local_sum_var += inc; + } + thread_sums[pid] = local_sum_var; + + #pragma omp barrier + + Index_type local_count_var = 0; + for (int ip = 0; ip < pid; ++ip) { + local_count_var += thread_sums[ip]; + } + + for (Index_type i = local_begin; i < local_end; ++i ) { + Index_type inc = tmp_scan[i]; + if (inc) { + indexlist_lam_output(i, local_count_var); + } + local_count_var += inc; + } + + if (pid == p-1) { + count = local_count_var; + } } +#endif m_len = count; @@ -85,7 +190,6 @@ void INDEXLIST::runOpenMPVariant(VariantID vid) break; } -#endif default : { ignore_unused(run_reps, ibegin, iend, x, list); diff --git a/src/basic/INDEXLIST.cpp b/src/basic/INDEXLIST.cpp index 55490d161..d1ce5cc77 100644 --- a/src/basic/INDEXLIST.cpp +++ b/src/basic/INDEXLIST.cpp @@ -40,10 +40,8 @@ INDEXLIST::INDEXLIST(const RunParams& params) setVariantDefined( Lambda_Seq ); setVariantDefined( RAJA_Seq ); -#if defined(_OPENMP) && _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) setVariantDefined( Base_OpenMP ); setVariantDefined( Lambda_OpenMP ); -#endif setVariantDefined( Base_CUDA ); From 189bbed98649e53bfa136d76ccfe28a070f807f8 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 18 Feb 2022 15:16:58 -0800 Subject: [PATCH 208/392] Fix comment in blueos mpi script --- scripts/lc-builds/blueos_spectrum_nvcc_clang.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh b/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh index 18f33670c..a61f48ece 100755 --- a/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh +++ b/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh @@ -61,8 +61,8 @@ echo "***********************************************************************" echo echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" echo -echo " Please note that you have to run with mpi and disable CUDA GPU hooks" -echo " when you run the RAJA Perf Suite; for example," +echo " Please note that you have to run with mpi when you run" +echo " the RAJA Perf Suite; for example," echo echo " lrun -n4 ./bin/raja-perf.exe" echo From b894f0e49994fc42b84562a27b7a3cd699faeaec Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 18 Feb 2022 15:45:15 -0800 Subject: [PATCH 209/392] Implement omp<5 scan in INDEXLIST_3LOOP --- src/basic/INDEXLIST_3LOOP-OMP.cpp | 84 +++++++++++++++++++++++++++++-- src/basic/INDEXLIST_3LOOP.cpp | 2 - 2 files changed, 80 insertions(+), 6 deletions(-) diff --git a/src/basic/INDEXLIST_3LOOP-OMP.cpp b/src/basic/INDEXLIST_3LOOP-OMP.cpp index 8c97c5b14..df8eec8b6 100644 --- a/src/basic/INDEXLIST_3LOOP-OMP.cpp +++ b/src/basic/INDEXLIST_3LOOP-OMP.cpp @@ -36,11 +36,17 @@ void INDEXLIST_3LOOP::runOpenMPVariant(VariantID vid) switch ( vid ) { -#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) case Base_OpenMP : { INDEXLIST_3LOOP_DATA_SETUP_OMP; +#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) +#else + const Index_type n = iend+1 - ibegin; + const int p0 = static_cast(std::min(n, static_cast(omp_get_max_threads()))); + ::std::vector thread_counts(p0); +#endif + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -49,8 +55,8 @@ void INDEXLIST_3LOOP::runOpenMPVariant(VariantID vid) counts[i] = (INDEXLIST_3LOOP_CONDITIONAL) ? 
1 : 0; } +#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) Index_type count = 0; - #pragma omp parallel for reduction(inscan, +:count) for (Index_type i = ibegin; i < iend+1; ++i ) { Index_type inc = counts[i]; @@ -58,6 +64,38 @@ void INDEXLIST_3LOOP::runOpenMPVariant(VariantID vid) #pragma omp scan exclusive(count) count += inc; } +#else + #pragma omp parallel num_threads(p0) + { + const int p = omp_get_num_threads(); + const int pid = omp_get_thread_num(); + const Index_type step = n / p; + const Index_type local_begin = pid * step + ibegin; + const Index_type local_end = (pid == p-1) ? iend+1 : (pid+1) * step + ibegin; + + Index_type local_count = 0; + for (Index_type i = local_begin; i < local_end; ++i ) { + Index_type inc = counts[i]; + counts[i] = local_count; + local_count += inc; + } + thread_counts[pid] = local_count; + + #pragma omp barrier + + if (pid != 0) { + + Index_type prev_count = 0; + for (int ip = 0; ip < pid; ++ip) { + prev_count += thread_counts[ip]; + } + + for (Index_type i = local_begin; i < local_end; ++i ) { + counts[i] += prev_count; + } + } + } +#endif #pragma omp parallel for for (Index_type i = ibegin; i < iend; ++i ) { @@ -86,6 +124,13 @@ void INDEXLIST_3LOOP::runOpenMPVariant(VariantID vid) INDEXLIST_3LOOP_MAKE_LIST; }; +#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) +#else + const Index_type n = iend+1 - ibegin; + const int p0 = static_cast(std::min(n, static_cast(omp_get_max_threads()))); + ::std::vector thread_counts(p0); +#endif + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -94,8 +139,8 @@ void INDEXLIST_3LOOP::runOpenMPVariant(VariantID vid) indexlist_conditional_lam(i); } +#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) Index_type count = 0; - #pragma omp parallel for reduction(inscan, +:count) for (Index_type i = ibegin; i < iend+1; ++i ) { Index_type inc = counts[i]; @@ -103,6 +148,38 @@ void INDEXLIST_3LOOP::runOpenMPVariant(VariantID vid) #pragma omp scan exclusive(count) count += inc; } +#else + #pragma omp parallel num_threads(p0) + { + const int p = omp_get_num_threads(); + const int pid = omp_get_thread_num(); + const Index_type step = n / p; + const Index_type local_begin = pid * step + ibegin; + const Index_type local_end = (pid == p-1) ? 
iend+1 : (pid+1) * step + ibegin; + + Index_type local_count = 0; + for (Index_type i = local_begin; i < local_end; ++i ) { + Index_type inc = counts[i]; + counts[i] = local_count; + local_count += inc; + } + thread_counts[pid] = local_count; + + #pragma omp barrier + + if (pid != 0) { + + Index_type prev_count = 0; + for (int ip = 0; ip < pid; ++ip) { + prev_count += thread_counts[ip]; + } + + for (Index_type i = local_begin; i < local_end; ++i ) { + counts[i] += prev_count; + } + } + } +#endif #pragma omp parallel for for (Index_type i = ibegin; i < iend; ++i ) { @@ -118,7 +195,6 @@ void INDEXLIST_3LOOP::runOpenMPVariant(VariantID vid) break; } -#endif case RAJA_OpenMP : { diff --git a/src/basic/INDEXLIST_3LOOP.cpp b/src/basic/INDEXLIST_3LOOP.cpp index 5d84465fc..fff81d7fe 100644 --- a/src/basic/INDEXLIST_3LOOP.cpp +++ b/src/basic/INDEXLIST_3LOOP.cpp @@ -45,10 +45,8 @@ INDEXLIST_3LOOP::INDEXLIST_3LOOP(const RunParams& params) setVariantDefined( Lambda_Seq ); setVariantDefined( RAJA_Seq ); -#if defined(_OPENMP) && _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) setVariantDefined( Base_OpenMP ); setVariantDefined( Lambda_OpenMP ); -#endif setVariantDefined( RAJA_OpenMP ); setVariantDefined( Base_CUDA ); From 1b8dbde7675eb6c5591bc50c422fb5bb688544d6 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 23 Feb 2022 10:35:54 -0800 Subject: [PATCH 210/392] require c++14 --- CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9d9e6d704..80352acf5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,8 +25,8 @@ set(ENABLE_TESTS Off CACHE BOOL "Enable BLT and RAJA tests") set(ENABLE_EXAMPLES Off CACHE BOOL "Enable RAJA examples") set(RAJA_ENABLE_EXERCISES Off CACHE BOOL "Enable RAJA exercises") -set(CMAKE_CXX_STANDARD 11) -set(BLT_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD 14) +set(BLT_CXX_STANDARD 14) include(blt/SetupBLT.cmake) @@ -106,7 +106,7 @@ set(RAJAPERF_BUILD_SYSTYPE $ENV{SYS_TYPE}) set(RAJAPERF_BUILD_HOST $ENV{HOSTNAME}) if (ENABLE_CUDA) - set(CMAKE_CUDA_STANDARD 11) + set(CMAKE_CUDA_STANDARD 14) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -restrict -arch ${CUDA_ARCH} --expt-extended-lambda --expt-relaxed-constexpr") set(RAJAPERF_COMPILER "${CUDA_NVCC_EXECUTABLE}") From ada5978d9c1714bb0268fa8879caff9c3a8079d5 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 23 Feb 2022 10:37:28 -0800 Subject: [PATCH 211/392] Add tuning support Add support for multiple "tunings" of each variant. 
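For illustration only (not part of this patch): the sketch below mirrors the tuning pattern the DAXPY changes introduce, in which one tuning name is registered per compile-time GPU block size and a runtime tuning index is dispatched back to the matching compile-time instantiation. The names BlockSizeList, makeTuningNames, runTuning, and runImpl are hypothetical stand-ins; the suite itself expresses the same idea with camp::int_seq, the seq_for helper added to GPUUtils.hpp, and KernelBase::addVariantTuningName.

#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

// Compile-time list of candidate GPU block sizes (stand-in for the
// configured gpu_block_sizes list).
template < std::size_t... Sizes >
struct BlockSizeList {};

using BlockSizes = BlockSizeList<128, 256, 512>;

// Register one tuning name per block size, analogous to
// setCudaTuningDefinitions() calling addVariantTuningName().
template < std::size_t... Sizes >
std::vector<std::string> makeTuningNames(BlockSizeList<Sizes...>)
{
  return { ("block_" + std::to_string(Sizes))... };
}

// Kernel body instantiated per block size, analogous to runCudaVariantImpl.
template < std::size_t block_size >
void runImpl()
{
  std::cout << "running with block_size = " << block_size << "\n";
}

// Map a runtime tuning index back to the matching compile-time block size,
// analogous to runCudaVariant(vid, tid) walking the block-size list.
template < std::size_t... Sizes >
void runTuning(std::size_t tid, BlockSizeList<Sizes...>)
{
  std::size_t t = 0;
  // Left-to-right pack expansion; a C++14 stand-in for the seq_for helper.
  using expander = int[];
  (void) expander{ 0, ( (t++ == tid ? runImpl<Sizes>() : void()), 0 )... };
}

int main()
{
  std::vector<std::string> tunings = makeTuningNames(BlockSizes{});
  for (std::size_t tid = 0; tid < tunings.size(); ++tid) {
    std::cout << tunings[tid] << ": ";
    runTuning(tid, BlockSizes{});
  }
  return 0;
}

Each registered name ("block_128", "block_256", ...) later indexes the per-tuning checksum and timing vectors that KernelBase keeps, which is what the Executor report changes below consume.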
--- src/basic/DAXPY-Cuda.cpp | 27 +- src/basic/DAXPY.cpp | 14 +- src/basic/DAXPY.hpp | 14 +- src/common/Executor.cpp | 527 ++++++++++++++++++++++++-------------- src/common/Executor.hpp | 8 +- src/common/GPUUtils.hpp | 30 +-- src/common/KernelBase.cpp | 157 +++++++++--- src/common/KernelBase.hpp | 118 ++++++--- src/common/RunParams.cpp | 44 +++- src/common/RunParams.hpp | 15 +- 10 files changed, 621 insertions(+), 333 deletions(-) diff --git a/src/basic/DAXPY-Cuda.cpp b/src/basic/DAXPY-Cuda.cpp index f0c2591a9..7e7705b0b 100644 --- a/src/basic/DAXPY-Cuda.cpp +++ b/src/basic/DAXPY-Cuda.cpp @@ -110,13 +110,28 @@ void DAXPY::runCudaVariantImpl(VariantID vid) } } -void DAXPY::runCudaVariant(VariantID vid) +void DAXPY::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n DAXPY : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void DAXPY::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace basic diff --git a/src/basic/DAXPY.cpp b/src/basic/DAXPY.cpp index b04c70d23..578b52f42 100644 --- a/src/basic/DAXPY.cpp +++ b/src/basic/DAXPY.cpp @@ -21,10 +21,6 @@ namespace basic DAXPY::DAXPY(const RunParams& params) : KernelBase(rajaperf::Basic_DAXPY, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - setDefaultProblemSize(1000000); setDefaultReps(500); @@ -68,9 +64,9 @@ void DAXPY::setUp(VariantID vid) initData(m_a); } -void DAXPY::updateChecksum(VariantID vid) +void DAXPY::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_y, getActualProblemSize()); + checksum[vid].at(tid) += calcChecksum(m_y, getActualProblemSize()); } void DAXPY::tearDown(VariantID vid) @@ -80,11 +76,5 @@ void DAXPY::tearDown(VariantID vid) deallocData(m_y); } -bool DAXPY::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/DAXPY.hpp b/src/basic/DAXPY.hpp index 1e6861fbe..4ee25ab6b 100644 --- a/src/basic/DAXPY.hpp +++ b/src/basic/DAXPY.hpp @@ -44,16 +44,16 @@ class DAXPY : public KernelBase ~DAXPY(); void setUp(VariantID vid); - void updateChecksum(VariantID vid); + void updateChecksum(VariantID vid, size_t tid); void tearDown(VariantID vid); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); - bool isGPUBlockSizeSupported() const; + void setCudaTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index d3f3bfea6..447af3511 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -41,7 +42,8 @@ using namespace std; Executor::Executor(int argc, char** argv) : run_params(argc, argv), - reference_vid(NumVariants) + reference_vid(NumVariants), + reference_tid(std::numeric_limits::max()) { } @@ -547,6 +549,27 @@ void Executor::setupSuite() variant_ids.push_back( *vid ); } + // + // Make a single ordering of tuning names for each variant across kernels. + // + for (VariantID vid : variant_ids) { + std::unordered_map tuning_names_order_map; + for (const KernelBase* kernel : kernels) { + for (std::string const& tuning_name : + kernel->getVariantTuningNames(vid)) { + if (tuning_names_order_map.find(tuning_name) == + tuning_names_order_map.end()) { + tuning_names_order_map.emplace( + tuning_name, tuning_names_order_map.size()); + } + } + } + tuning_names[vid].resize(tuning_names_order_map.size()); + for (auto const& tuning_name_idx_pair : tuning_names_order_map) { + tuning_names[vid][tuning_name_idx_pair.second] = tuning_name_idx_pair.first; + } + } + // // If we've gotten to this point, we have good input to run. 
// @@ -615,11 +638,6 @@ void Executor::reportRunSummary(ostream& str) const } else if (run_params.getSizeMeaning() == RunParams::SizeMeaning::Direct) { str << "\t Kernel size = " << run_params.getSize() << endl; } - if (run_params.getGPUBlockSize() > 0) { - str << "\t Kernel GPU block_size = " << run_params.getGPUBlockSize() << endl; - } else { - str << "\t Kernel GPU block_size = " << "default" << endl; - } str << "\t Kernel rep factor = " << run_params.getRepFactor() << endl; str << "\t Output files will be named " << ofiles << endl; @@ -629,6 +647,11 @@ void Executor::reportRunSummary(ostream& str) const << "\n--------\n"; for (size_t iv = 0; iv < variant_ids.size(); ++iv) { str << getVariantName(variant_ids[iv]) << endl; + + str << "\n\tTunings\n"; + for (std::string const& tuning_name : tuning_names[variant_ids[iv]]) { + str << "\t" << tuning_name << endl; + } } str << endl; @@ -665,7 +688,6 @@ void Executor::writeKernelInfoSummary(ostream& str, bool to_file) const Index_type itsrep_width = 0; Index_type bytesrep_width = 0; Index_type flopsrep_width = 0; - Index_type bsize_width = 0; Index_type dash_width = 0; for (size_t ik = 0; ik < kernels.size(); ++ik) { @@ -675,7 +697,6 @@ void Executor::writeKernelInfoSummary(ostream& str, bool to_file) const itsrep_width = max(reps_width, kernels[ik]->getItsPerRep()); bytesrep_width = max(bytesrep_width, kernels[ik]->getBytesPerRep()); flopsrep_width = max(bytesrep_width, kernels[ik]->getFLOPsPerRep()); - bsize_width = max(bsize_width, static_cast(kernels[ik]->getActualGPUBlockSize())); } const string sepchr(" , "); @@ -719,12 +740,6 @@ void Executor::writeKernelInfoSummary(ostream& str, bool to_file) const static_cast(frsize) ) + 3; dash_width += flopsrep_width + static_cast(sepchr.size()); - double bsize = log10( static_cast(bsize_width) ); - string bsize_head("GPU block size"); - bsize_width = max( static_cast(bsize_head.size()), - static_cast(bsize) ) + 3; - dash_width += bsize_width + static_cast(sepchr.size()); - str <getKernelsPerRep() << sepchr <getBytesPerRep() << sepchr <getFLOPsPerRep() - << sepchr <getActualGPUBlockSize() << endl; } @@ -808,16 +822,6 @@ template < typename Kernel > KernelBase* Executor::makeKernel() { Kernel* kernel = new Kernel(run_params); - // check gpu block size in run_params is supported by kernel - if (!kernel->isGPUBlockSizeSupported() && - run_params.getGPUBlockSize() != 0) { - // make Kernel with default gpu block size - delete kernel; kernel = nullptr; - size_t block_size = run_params.getGPUBlockSize(); - run_params.setGPUBlockSize(0); - kernel = new Kernel(run_params); - run_params.setGPUBlockSize(block_size); - } return kernel; } @@ -832,10 +836,16 @@ void Executor::runKernel(KernelBase* kern) } else { getCout() << " No "; } - getCout() << kern->getVariantName(vid) << " variant" << endl; + getCout() << getVariantName(vid) << " variant" << endl; } - if ( kern->hasVariantDefined(vid) ) { - kern->execute(vid); + + for (size_t tid = 0; tid < kern->getNumVariantTunings(vid); ++tid) { + + if ( run_params.showProgress() ) { + getCout() << " Running " + << kern->getVariantTuningName(vid, tid) << " tuning" << endl; + } + kern->execute(vid, tid); } } // loop over variants } @@ -925,9 +935,12 @@ void Executor::writeCSVReport(ostream& file, CSVRepMode mode, } kercol_width++; - vector varcol_width(variant_ids.size()); + vector> vartuncol_width(variant_ids.size()); for (size_t iv = 0; iv < variant_ids.size(); ++iv) { - varcol_width[iv] = max(prec+2, getVariantName(variant_ids[iv]).size()); + size_t var_width = 
max(prec+2, getVariantName(variant_ids[iv]).size()); + for (std::string const& tuning_name : tuning_names[variant_ids[iv]]) { + vartuncol_width[iv].emplace_back(max(var_width, tuning_name.size())); + } } // @@ -940,17 +953,33 @@ void Executor::writeCSVReport(ostream& file, CSVRepMode mode, // for (size_t iv = 0; iv < variant_ids.size(); ++iv) { - file << sepchr; + for (size_t it = 0; it < tuning_names[variant_ids[iv]].size(); ++it) { + file << sepchr; + } } file << endl; // - // Print column title line. + // Print column variant name line. + // + file <getName(); for (size_t iv = 0; iv < variant_ids.size(); ++iv) { VariantID vid = variant_ids[iv]; - file << sepchr <hasVariantDefined(reference_vid) || - !kern->hasVariantDefined(vid)) ) { - file << "Not run"; - } else if ( (mode == CSVRepMode::Timing) && - !kern->hasVariantDefined(vid) ) { - file << "Not run"; - } else { - file << setprecision(prec) << std::fixed - << getReportDataEntry(mode, combiner, kern, vid); + for (size_t it = 0; it < tuning_names[variant_ids[iv]].size(); ++it) { + std::string const& tuning_name = tuning_names[variant_ids[iv]][it]; + file << sepchr <hasVariantTuningDefined(reference_vid, tuning_name) || + !kern->hasVariantTuningDefined(vid, tuning_name)) ) { + file << "Not run"; + } else if ( (mode == CSVRepMode::Timing) && + !kern->hasVariantTuningDefined(vid, tuning_name) ) { + file << "Not run"; + } else { + file << setprecision(prec) << std::fixed + << getReportDataEntry(mode, combiner, kern, vid, + kern->getVariantTuningIndex(vid, tuning_name)); + } } } file << endl; @@ -1003,40 +1036,64 @@ void Executor::writeFOMReport(ostream& file, vector& fom_groups) size_t fom_col_width = prec+14; - size_t ncols = 0; + std::vector fom_group_ncols(fom_groups.size(), 0); for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { const FOMGroup& group = fom_groups[ifg]; - ncols += group.variants.size(); // num variants to compare - // to each PM baseline + + for (size_t gv = 0; gv < group.variants.size(); ++gv) { + VariantID vid = group.variants[gv]; + const string& variant_name = getVariantName(vid); + // num variants and tuning + // Includes the PM baseline and the variants and tunings to compared to it + fom_group_ncols[ifg] += tuning_names[vid].size(); + for (const string& tuning_name : tuning_names[vid]) { + fom_col_width = max(fom_col_width, variant_name.size()+1+tuning_name.size()); + } + } } - vector col_exec_count(ncols, 0); - vector col_min(ncols, numeric_limits::max()); - vector col_max(ncols, -numeric_limits::max()); - vector col_avg(ncols, 0.0); - vector col_stddev(ncols, 0.0); - vector< vector > pct_diff(kernels.size()); + vector< vector > col_exec_count(fom_groups.size()); + vector< vector > col_min(fom_groups.size()); + vector< vector > col_max(fom_groups.size()); + vector< vector > col_avg(fom_groups.size()); + vector< vector > col_stddev(fom_groups.size()); + for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { + col_exec_count[ifg].resize(fom_group_ncols[ifg], 0); + col_min[ifg].resize(fom_group_ncols[ifg], numeric_limits::max()); + col_max[ifg].resize(fom_group_ncols[ifg], -numeric_limits::max()); + col_avg[ifg].resize(fom_group_ncols[ifg], 0.0); + col_stddev[ifg].resize(fom_group_ncols[ifg], 0.0); + } + vector< vector< vector > > pct_diff(kernels.size()); for (size_t ik = 0; ik < kernels.size(); ++ik) { - pct_diff[ik] = vector(ncols, 0.0); + pct_diff[ik].resize(fom_groups.size()); + for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { + pct_diff[ik][ifg].resize(fom_group_ncols[ifg], 0.0); + } } // // Print 
title line. // file << "FOM Report : signed speedup(-)/slowdown(+) for each PM (base vs. RAJA) -> (T_RAJA - T_base) / T_base )"; - for (size_t iv = 0; iv < ncols*2; ++iv) { - file << sepchr; + for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { + for (size_t iv = 0; iv < fom_group_ncols[ifg]*2; ++iv) { + file << sepchr; + } } file << endl; file << "'OVER_TOL' in column to right if RAJA speedup is over tolerance"; - for (size_t iv = 0; iv < ncols*2; ++iv) { - file << sepchr; + for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { + for (size_t iv = 0; iv < fom_group_ncols[ifg]*2; ++iv) { + file << sepchr; + } } file << endl; string pass(", "); string fail(",OVER_TOL"); + string base(",base_ref"); // // Print column title line. @@ -1045,8 +1102,12 @@ void Executor::writeFOMReport(ostream& file, vector& fom_groups) for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { const FOMGroup& group = fom_groups[ifg]; for (size_t gv = 0; gv < group.variants.size(); ++gv) { - string name = getVariantName(group.variants[gv]); - file << sepchr <& fom_groups) file <getName(); - int col = 0; for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { const FOMGroup& group = fom_groups[ifg]; - VariantID base_vid = group.base; + constexpr double unknown_totTime = -1.0; + double base_totTime = unknown_totTime; + size_t col = 0; for (size_t gv = 0; gv < group.variants.size(); ++gv) { - VariantID comp_vid = group.variants[gv]; - - // - // If kernel variant was run, generate data for it and - // print (signed) percentage difference from baseline. - // - if ( kern->wasVariantRun(comp_vid) ) { - col_exec_count[col]++; - - pct_diff[ik][col] = - (kern->getTotTime(comp_vid) - kern->getTotTime(base_vid)) / - kern->getTotTime(base_vid); - - string pfstring(pass); - if (pct_diff[ik][col] > run_params.getPFTolerance()) { - pfstring = fail; - } + VariantID vid = group.variants[gv]; - file << sepchr << setw(fom_col_width) << setprecision(prec) - <getVariantTuningIndex(vid, tuning_name); // - // Gather data for column summaries (unsigned). + // If kernel variant was run, generate data for it and + // print (signed) percentage difference from baseline. // - col_min[col] = min( col_min[col], pct_diff[ik][col] ); - col_max[col] = max( col_max[col], pct_diff[ik][col] ); - col_avg[col] += pct_diff[ik][col]; + if ( kern->wasVariantTuningRun(vid, tid) ) { + col_exec_count[ifg][col]++; - } else { // variant was not run, print a big fat goose egg... + bool is_base = (base_totTime == unknown_totTime); + if (is_base) { + base_totTime = kern->getTotTime(vid, tid); + } - file << sepchr <getTotTime(vid, tid) - base_totTime) / base_totTime; - } + string pfstring(pass); + if (pct_diff[ik][ifg][col] > run_params.getPFTolerance()) { + pfstring = fail; + } + if (is_base) { + pfstring = base; + } + + file << sepchr << setw(fom_col_width) << setprecision(prec) + <& fom_groups) // // Column average... - for (size_t col = 0; col < ncols; ++col) { - if ( col_exec_count[col] > 0 ) { - col_avg[col] /= col_exec_count[col]; - } else { - col_avg[col] = 0.0; + for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { + for (size_t col = 0; col < fom_group_ncols[ifg]; ++col) { + if ( col_exec_count[ifg][col] > 0 ) { + col_avg[ifg][col] /= col_exec_count[ifg][col]; + } else { + col_avg[ifg][col] = 0.0; + } } } - // Column standard deviaation... + // Column standard deviation... 
for (size_t ik = 0; ik < kernels.size(); ++ik) { KernelBase* kern = kernels[ik]; - int col = 0; for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { const FOMGroup& group = fom_groups[ifg]; + int col = 0; for (size_t gv = 0; gv < group.variants.size(); ++gv) { - VariantID comp_vid = group.variants[gv]; + VariantID vid = group.variants[gv]; - if ( kern->wasVariantRun(comp_vid) ) { - col_stddev[col] += ( pct_diff[ik][col] - col_avg[col] ) * - ( pct_diff[ik][col] - col_avg[col] ); - } + for (const string& tuning_name : tuning_names[vid]) { + + size_t tid = kern->getVariantTuningIndex(vid, tuning_name); + + if ( kern->wasVariantTuningRun(vid, tid) ) { + col_stddev[ifg][col] += ( pct_diff[ik][ifg][col] - col_avg[ifg][col] ) * + ( pct_diff[ik][ifg][col] - col_avg[ifg][col] ); + } - col++; + col++; + } } // loop over group variants @@ -1154,11 +1235,13 @@ void Executor::writeFOMReport(ostream& file, vector& fom_groups) } // loop over kernels - for (size_t col = 0; col < ncols; ++col) { - if ( col_exec_count[col] > 0 ) { - col_stddev[col] /= col_exec_count[col]; - } else { - col_stddev[col] = 0.0; + for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { + for (size_t col = 0; col < fom_group_ncols[ifg]; ++col) { + if ( col_exec_count[ifg][col] > 0 ) { + col_stddev[ifg][col] /= col_exec_count[ifg][col]; + } else { + col_stddev[ifg][col] = 0.0; + } } } @@ -1166,36 +1249,46 @@ void Executor::writeFOMReport(ostream& file, vector& fom_groups) // Print column summaries. // file <getName().size()); - } - for (size_t iv = 0; iv < variant_ids.size(); ++iv) { - namecol_width = max(namecol_width, - getVariantName(variant_ids[iv]).size()); + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + size_t var_width = getVariantName(variant_ids[iv]).size(); + for (std::string const& tuning_name : + kernels[ik]->getVariantTuningNames(variant_ids[iv])) { + namecol_width = max(namecol_width, var_width+1+tuning_name.size()); + } + } } namecol_width++; @@ -1285,22 +1381,30 @@ void Executor::writeChecksumReport(ostream& file) bool found_ref = false; while ( ivck < variant_ids.size() && !found_ref ) { VariantID vid = variant_ids[ivck]; - if ( kern->wasVariantRun(vid) ) { - cksum_ref = kern->getChecksum(vid); - found_ref = true; + size_t num_tunings = kern->getNumVariantTunings(vid); + for (size_t tid = 0; tid < num_tunings; ++tid) { + if ( kern->wasVariantTuningRun(vid, tid) ) { + cksum_ref = kern->getChecksum(vid, tid); + found_ref = true; + break; + } } ++ivck; } // get vector of checksums and diffs - std::vector checksums(variant_ids.size(), 0.0); - std::vector checksums_diff(variant_ids.size(), 0.0); + std::vector> checksums(variant_ids.size()); + std::vector> checksums_diff(variant_ids.size()); for (size_t iv = 0; iv < variant_ids.size(); ++iv) { VariantID vid = variant_ids[iv]; + size_t num_tunings = kernels[ik]->getNumVariantTunings(variant_ids[iv]); - if ( kern->wasVariantRun(vid) ) { - checksums[iv] = kern->getChecksum(vid); - checksums_diff[iv] = cksum_ref - kern->getChecksum(vid); + checksums[iv].resize(num_tunings, 0.0); + for (size_t tid = 0; tid < num_tunings; ++tid) { + if ( kern->wasVariantTuningRun(vid, tid) ) { + checksums[iv][tid] = kern->getChecksum(vid, tid); + checksums_diff[iv][tid] = cksum_ref - kern->getChecksum(vid, tid); + } } } @@ -1310,78 +1414,114 @@ void Executor::writeChecksumReport(ostream& file) } // get stats for checksums - std::vector checksums_sum(variant_ids.size(), 0.0); - MPI_Allreduce(checksums.data(), checksums_sum.data(), variant_ids.size(), - Checksum_MPI_type, MPI_SUM, 
MPI_COMM_WORLD); + std::vector> checksums_sum(variant_ids.size()); + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + size_t num_tunings = kernels[ik]->getNumVariantTunings(variant_ids[iv]); + checksums_sum[iv].resize(num_tunings, 0.0); + MPI_Allreduce(checksums[iv].data(), checksums_sum[iv].data(), num_tunings, + Checksum_MPI_type, MPI_SUM, MPI_COMM_WORLD); + } - std::vector checksums_avg(variant_ids.size(), 0.0); + std::vector> checksums_avg(variant_ids.size()); for (size_t iv = 0; iv < variant_ids.size(); ++iv) { - checksums_avg[iv] = checksums_sum[iv] / num_ranks; + size_t num_tunings = kernels[ik]->getNumVariantTunings(variant_ids[iv]); + checksums_avg[iv].resize(num_tunings, 0.0); + for (size_t tid = 0; tid < num_tunings; ++tid) { + checksums_avg[iv][tid] = checksums_sum[iv][tid] / num_ranks; + } } // get stats for checksums_abs_diff - std::vector checksums_abs_diff(variant_ids.size(), 0.0); + std::vector> checksums_abs_diff(variant_ids.size()); for (size_t iv = 0; iv < variant_ids.size(); ++iv) { - checksums_abs_diff[iv] = std::abs(checksums_diff[iv]); + size_t num_tunings = kernels[ik]->getNumVariantTunings(variant_ids[iv]); + checksums_abs_diff[iv].resize(num_tunings, 0.0); + for (size_t tid = 0; tid < num_tunings; ++tid) { + checksums_abs_diff[iv][tid] = std::abs(checksums_diff[iv][tid]); + } } - std::vector checksums_abs_diff_min(variant_ids.size(), 0.0); - std::vector checksums_abs_diff_max(variant_ids.size(), 0.0); - std::vector checksums_abs_diff_sum(variant_ids.size(), 0.0); - - MPI_Allreduce(checksums_abs_diff.data(), checksums_abs_diff_min.data(), variant_ids.size(), - Checksum_MPI_type, MPI_MIN, MPI_COMM_WORLD); - MPI_Allreduce(checksums_abs_diff.data(), checksums_abs_diff_max.data(), variant_ids.size(), - Checksum_MPI_type, MPI_MAX, MPI_COMM_WORLD); - MPI_Allreduce(checksums_abs_diff.data(), checksums_abs_diff_sum.data(), variant_ids.size(), - Checksum_MPI_type, MPI_SUM, MPI_COMM_WORLD); + std::vector> checksums_abs_diff_min(variant_ids.size()); + std::vector> checksums_abs_diff_max(variant_ids.size()); + std::vector> checksums_abs_diff_sum(variant_ids.size()); + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + size_t num_tunings = kernels[ik]->getNumVariantTunings(variant_ids[iv]); + checksums_abs_diff_min[iv].resize(num_tunings, 0.0); + checksums_abs_diff_max[iv].resize(num_tunings, 0.0); + checksums_abs_diff_sum[iv].resize(num_tunings, 0.0); + + MPI_Allreduce(checksums_abs_diff[iv].data(), checksums_abs_diff_min[iv].data(), num_tunings, + Checksum_MPI_type, MPI_MIN, MPI_COMM_WORLD); + MPI_Allreduce(checksums_abs_diff[iv].data(), checksums_abs_diff_max[iv].data(), num_tunings, + Checksum_MPI_type, MPI_MAX, MPI_COMM_WORLD); + MPI_Allreduce(checksums_abs_diff[iv].data(), checksums_abs_diff_sum[iv].data(), num_tunings, + Checksum_MPI_type, MPI_SUM, MPI_COMM_WORLD); + } - std::vector checksums_abs_diff_avg(variant_ids.size(), 0.0); + std::vector> checksums_abs_diff_avg(variant_ids.size()); for (size_t iv = 0; iv < variant_ids.size(); ++iv) { - checksums_abs_diff_avg[iv] = checksums_abs_diff_sum[iv] / num_ranks; + size_t num_tunings = kernels[ik]->getNumVariantTunings(variant_ids[iv]); + checksums_abs_diff_avg[iv].resize(num_tunings, 0.0); + for (size_t tid = 0; tid < num_tunings; ++tid) { + checksums_abs_diff_avg[iv][tid] = checksums_abs_diff_sum[iv][tid] / num_ranks; + } } - std::vector checksums_abs_diff_diff2avg2(variant_ids.size(), 0.0); + std::vector> checksums_abs_diff_diff2avg2(variant_ids.size()); for (size_t iv = 0; iv < variant_ids.size(); ++iv) { - 
checksums_abs_diff_diff2avg2[iv] = (checksums_abs_diff[iv] - checksums_abs_diff_avg[iv]) * - (checksums_abs_diff[iv] - checksums_abs_diff_avg[iv]) ; + size_t num_tunings = kernels[ik]->getNumVariantTunings(variant_ids[iv]); + checksums_abs_diff_diff2avg2[iv].resize(num_tunings, 0.0); + for (size_t tid = 0; tid < num_tunings; ++tid) { + checksums_abs_diff_diff2avg2[iv][tid] = (checksums_abs_diff[iv][tid] - checksums_abs_diff_avg[iv][tid]) * + (checksums_abs_diff[iv][tid] - checksums_abs_diff_avg[iv][tid]) ; + } } - std::vector checksums_abs_diff_stddev(variant_ids.size(), 0.0); - MPI_Allreduce(checksums_abs_diff_diff2avg2.data(), checksums_abs_diff_stddev.data(), variant_ids.size(), - Checksum_MPI_type, MPI_SUM, MPI_COMM_WORLD); + std::vector> checksums_abs_diff_stddev(variant_ids.size()); for (size_t iv = 0; iv < variant_ids.size(); ++iv) { - checksums_abs_diff_stddev[iv] = std::sqrt(checksums_abs_diff_stddev[iv] / num_ranks) ; + size_t num_tunings = kernels[ik]->getNumVariantTunings(variant_ids[iv]); + checksums_abs_diff_stddev[iv].resize(num_tunings, 0.0); + MPI_Allreduce(checksums_abs_diff_diff2avg2.data(), checksums_abs_diff_stddev.data(), num_tunings, + Checksum_MPI_type, MPI_SUM, MPI_COMM_WORLD); + for (size_t tid = 0; tid < num_tunings; ++tid) { + checksums_abs_diff_stddev[iv][tid] = std::sqrt(checksums_abs_diff_stddev[iv][tid] / num_ranks); + } } #endif for (size_t iv = 0; iv < variant_ids.size(); ++iv) { VariantID vid = variant_ids[iv]; + const string& variant_name = getVariantName(vid); + + size_t num_tunings = kernels[ik]->getNumVariantTunings(variant_ids[iv]); + for (size_t tid = 0; tid < num_tunings; ++tid) { + const string& tuning_name = kern->getVariantTuningName(vid, tid); - if ( kern->wasVariantRun(vid) ) { - file <wasVariantTuningRun(vid, tid) ) { + file <getTotTime(vid) / run_params.getNumPasses(); + retval = kern->getTotTime(vid, tid) / run_params.getNumPasses(); } break; case RunParams::CombinerOpt::Minimum : { - retval = kern->getMinTime(vid); + retval = kern->getMinTime(vid, tid); } break; case RunParams::CombinerOpt::Maximum : { - retval = kern->getMaxTime(vid); + retval = kern->getMaxTime(vid, tid); } break; default : { cout << "\n Unknown CSV combiner mode = " << combiner << endl; } @@ -1457,19 +1598,22 @@ long double Executor::getReportDataEntry(CSVRepMode mode, } case CSVRepMode::Speedup : { if ( haveReferenceVariant() ) { - if ( kern->hasVariantDefined(reference_vid) && - kern->hasVariantDefined(vid) ) { + if ( kern->hasVariantTuningDefined(reference_vid, reference_tid) && + kern->hasVariantTuningDefined(vid, tid) ) { switch ( combiner ) { case RunParams::CombinerOpt::Average : { - retval = kern->getTotTime(reference_vid) / kern->getTotTime(vid); + retval = kern->getTotTime(reference_vid, reference_tid) / + kern->getTotTime(vid, tid); } break; case RunParams::CombinerOpt::Minimum : { - retval = kern->getMinTime(reference_vid) / kern->getMinTime(vid); + retval = kern->getMinTime(reference_vid, reference_tid) / + kern->getMinTime(vid, tid); } break; case RunParams::CombinerOpt::Maximum : { - retval = kern->getMaxTime(reference_vid) / kern->getMaxTime(vid); + retval = kern->getMaxTime(reference_vid, reference_tid) / + kern->getMaxTime(vid, tid); } break; default : { cout << "\n Unknown CSV combiner mode = " << combiner << endl; } @@ -1478,10 +1622,11 @@ long double Executor::getReportDataEntry(CSVRepMode mode, retval = 0.0; } #if 0 // RDH DEBUG (leave this here, it's useful for debugging!) 
- getCout() << "Kernel(iv): " << kern->getName() << "(" << vid << ")" << endl; + getCout() << "Kernel(iv): " << kern->getName() << "(" << vid << ")" + << "(" << tid << ")"endl; getCout() << "\tref_time, tot_time, retval = " - << kern->getTotTime(reference_vid) << " , " - << kern->getTotTime(vid) << " , " + << kern->getTotTime(reference_vid, reference_tid) << " , " + << kern->getTotTime(vid, tid) << " , " << retval << endl; #endif } @@ -1503,7 +1648,7 @@ void Executor::getFOMGroups(vector& fom_groups) if ( vname.find("Base") != string::npos ) { FOMGroup group; - group.base = vid; + group.variants.push_back(vid); string::size_type pos = vname.find("_"); string pm(vname.substr(pos+1, string::npos)); diff --git a/src/common/Executor.hpp b/src/common/Executor.hpp index 521a81596..39308e443 100644 --- a/src/common/Executor.hpp +++ b/src/common/Executor.hpp @@ -26,7 +26,8 @@ class WarmupKernel; /*! ******************************************************************************* * - * \brief Class that assembles kernels and variants to run and executes them. + * \brief Class that assembles kernels, variants, and tunings to run and + * executes them. * ******************************************************************************* */ @@ -56,7 +57,6 @@ class Executor }; struct FOMGroup { - VariantID base; std::vector variants; }; @@ -75,7 +75,7 @@ class Executor RunParams::CombinerOpt combiner, size_t prec); std::string getReportTitle(CSVRepMode mode, RunParams::CombinerOpt combiner); long double getReportDataEntry(CSVRepMode mode, RunParams::CombinerOpt combiner, - KernelBase* kern, VariantID vid); + KernelBase* kern, VariantID vid, size_t tid); void writeChecksumReport(std::ostream& file); @@ -85,8 +85,10 @@ class Executor RunParams run_params; std::vector kernels; std::vector variant_ids; + std::vector tuning_names[NumVariants]; VariantID reference_vid; + size_t reference_tid; }; } // closing brace for rajaperf namespace diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp index d6bacb890..d4f011bd7 100644 --- a/src/common/GPUUtils.hpp +++ b/src/common/GPUUtils.hpp @@ -243,23 +243,6 @@ struct ExactSqrt static constexpr bool valid() { return sqrt(I)*sqrt(I) == I; } }; -// helper function to get the Nth value from a camp int pack: -template -static constexpr size_t GetN(camp::int_seq sizes){ -return camp::integral_constant(std::array { I... 
}) >(); -} -//compile time loop over an integer sequence -//this allows for creating a loop over a compile time constant variable -template -static void seq_for(camp::int_seq, Func func) { - static_cast(camp::sink((f(camp::integral_constant{}), 0)...)); -} -template -static void seq_for(Func func) { - seq_for(camp::int_seq{}, func); -} - // A camp::int_seq of size_t's that is rajaperf::configuration::gpu_block_sizes // if rajaperf::configuration::gpu_block_sizes is not empty // and a camp::int_seq of default_block_size otherwise @@ -275,6 +258,19 @@ using list_type = } // closing brace for gpu_block_size namespace +//compile time loop over an integer sequence +//this allows for creating a loop over a compile time constant variable +template +void seq_for(camp::int_seq, Func&& func) +{ + camp::sink((func(camp::integral_constant{}), 0)...); +} +template +void seq_for(Func&& func) +{ + seq_for(camp::make_int_seq_t{}, std::forward(func)); +} + } // closing brace for rajaperf namespace #endif // closing endif for header file include guard diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index 2e5ee7254..518c1d9b8 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -11,6 +11,7 @@ #include "RunParams.hpp" #include +#include namespace rajaperf { @@ -22,35 +23,22 @@ KernelBase::KernelBase(KernelID kid, const RunParams& params) : default_prob_size = -1; default_reps = -1; - default_gpu_block_size = 0; actual_prob_size = -1; - actual_gpu_block_size = 0; for (size_t fid = 0; fid < NumFeatures; ++fid) { uses_feature[fid] = false; } - for (size_t vid = 0; vid < NumVariants; ++vid) { - has_variant_defined[vid] = false; - } - its_per_rep = -1; kernels_per_rep = -1; bytes_per_rep = -1; FLOPs_per_rep = -1; running_variant = NumVariants; + running_tuning = std::numeric_limits::max(); checksum_scale_factor = 1.0; - - for (size_t vid = 0; vid < NumVariants; ++vid) { - checksum[vid] = 0.0; - num_exec[vid] = 0; - min_time[vid] = std::numeric_limits::max(); - max_time[vid] = -std::numeric_limits::max(); - tot_time[vid] = 0.0; - } } @@ -84,42 +72,114 @@ Index_type KernelBase::getRunReps() const void KernelBase::setVariantDefined(VariantID vid) { - has_variant_defined[vid] = isVariantAvailable(vid) && - ( isVariantGPU(vid) ? 
isGPUBlockSizeSupported() - : true ); + if (!isVariantAvailable(vid)) return; + + switch ( vid ) { + + case Base_Seq : + { + setSeqTuningDefinitions(vid); + break; + } + + case Lambda_Seq : + case RAJA_Seq : + { +#if defined(RUN_RAJA_SEQ) + setSeqTuningDefinitions(vid); +#endif + break; + } + + case Base_OpenMP : + case Lambda_OpenMP : + case RAJA_OpenMP : + { +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + setOpenMPTuningDefinitions(vid); +#endif + break; + } + + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + { +#if defined(RAJA_ENABLE_TARGET_OPENMP) + setOpenMPTargetTuningDefinitions(vid); +#endif + break; + } + + case Base_CUDA : + case Lambda_CUDA : + case RAJA_CUDA : + { +#if defined(RAJA_ENABLE_CUDA) + setCudaTuningDefinitions(vid); +#endif + break; + } + + case Base_HIP : + case Lambda_HIP : + case RAJA_HIP : + { +#if defined(RAJA_ENABLE_HIP) + setHipTuningDefinitions(vid); +#endif + break; + } + + default : { +#if 0 + getCout() << "\n " << getName() + << " : Unknown variant id = " << vid << std::endl; +#endif + } + } + + checksum[vid].resize(variant_tuning_names[vid].size(), 0.0); + num_exec[vid].resize(variant_tuning_names[vid].size(), 0); + min_time[vid].resize(variant_tuning_names[vid].size(), std::numeric_limits::max()); + max_time[vid].resize(variant_tuning_names[vid].size(), -std::numeric_limits::max()); + tot_time[vid].resize(variant_tuning_names[vid].size(), 0.0); } -void KernelBase::execute(VariantID vid) +void KernelBase::execute(VariantID vid, size_t tid) { running_variant = vid; + running_tuning = tid; resetTimer(); resetDataInitCount(); this->setUp(vid); - this->runKernel(vid); + this->runKernel(vid, tid); - this->updateChecksum(vid); + this->updateChecksum(vid, tid); this->tearDown(vid); running_variant = NumVariants; + running_tuning = std::numeric_limits::max(); } void KernelBase::recordExecTime() { - num_exec[running_variant]++; + num_exec[running_variant].at(running_tuning)++; RAJA::Timer::ElapsedType exec_time = timer.elapsed(); - min_time[running_variant] = std::min(min_time[running_variant], exec_time); - max_time[running_variant] = std::max(max_time[running_variant], exec_time); - tot_time[running_variant] += exec_time; + min_time[running_variant].at(running_tuning) = + std::min(min_time[running_variant].at(running_tuning), exec_time); + max_time[running_variant].at(running_tuning) = + std::max(max_time[running_variant].at(running_tuning), exec_time); + tot_time[running_variant].at(running_tuning) += exec_time; } -void KernelBase::runKernel(VariantID vid) +void KernelBase::runKernel(VariantID vid, size_t tid) { - if ( !has_variant_defined[vid] ) { + if ( !hasVariantDefined(vid) ) { return; } @@ -127,7 +187,7 @@ void KernelBase::runKernel(VariantID vid) case Base_Seq : { - runSeqVariant(vid); + runSeqVariant(vid, tid); break; } @@ -135,7 +195,7 @@ void KernelBase::runKernel(VariantID vid) case RAJA_Seq : { #if defined(RUN_RAJA_SEQ) - runSeqVariant(vid); + runSeqVariant(vid, tid); #endif break; } @@ -145,7 +205,7 @@ void KernelBase::runKernel(VariantID vid) case RAJA_OpenMP : { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - runOpenMPVariant(vid); + runOpenMPVariant(vid, tid); #endif break; } @@ -154,7 +214,7 @@ void KernelBase::runKernel(VariantID vid) case RAJA_OpenMPTarget : { #if defined(RAJA_ENABLE_TARGET_OPENMP) - runOpenMPTargetVariant(vid); + runOpenMPTargetVariant(vid, tid); #endif break; } @@ -164,7 +224,7 @@ void KernelBase::runKernel(VariantID vid) case RAJA_CUDA : { #if defined(RAJA_ENABLE_CUDA) - runCudaVariant(vid); + 
runCudaVariant(vid, tid); #endif break; } @@ -174,7 +234,7 @@ void KernelBase::runKernel(VariantID vid) case RAJA_HIP : { #if defined(RAJA_ENABLE_HIP) - runHipVariant(vid); + runHipVariant(vid, tid); #endif break; } @@ -201,10 +261,14 @@ void KernelBase::print(std::ostream& os) const os << "\t\t\t\t" << getFeatureName(static_cast(j)) << " : " << uses_feature[j] << std::endl; } - os << "\t\t\t has_variant_defined: " << std::endl; + os << "\t\t\t variant_tuning_names: " << std::endl; for (unsigned j = 0; j < NumVariants; ++j) { os << "\t\t\t\t" << getVariantName(static_cast(j)) - << " : " << has_variant_defined[j] << std::endl; + << " :" << std::endl; + for (size_t t = 0; t < variant_tuning_names[j].size(); ++t) { + os << "\t\t\t\t\t" << getTuningName(static_cast(j), t) + << std::endl; + } } os << "\t\t\t its_per_rep = " << its_per_rep << std::endl; os << "\t\t\t kernels_per_rep = " << kernels_per_rep << std::endl; @@ -213,27 +277,42 @@ void KernelBase::print(std::ostream& os) const os << "\t\t\t num_exec: " << std::endl; for (unsigned j = 0; j < NumVariants; ++j) { os << "\t\t\t\t" << getVariantName(static_cast(j)) - << " : " << num_exec[j] << std::endl; + << " :" << std::endl; + for (size_t t = 0; t < num_exec[j].size(); ++t) { + os << "\t\t\t\t\t" << num_exec[j][t] << std::endl; + } } os << "\t\t\t min_time: " << std::endl; for (unsigned j = 0; j < NumVariants; ++j) { os << "\t\t\t\t" << getVariantName(static_cast(j)) - << " : " << min_time[j] << std::endl; + << " :" << std::endl; + for (size_t t = 0; t < min_time[j].size(); ++t) { + os << "\t\t\t\t\t" << min_time[j][t] << std::endl; + } } os << "\t\t\t max_time: " << std::endl; for (unsigned j = 0; j < NumVariants; ++j) { os << "\t\t\t\t" << getVariantName(static_cast(j)) - << " : " << max_time[j] << std::endl; + << " :" << std::endl; + for (size_t t = 0; t < max_time[j].size(); ++t) { + os << "\t\t\t\t\t" << max_time[j][t] << std::endl; + } } os << "\t\t\t tot_time: " << std::endl; for (unsigned j = 0; j < NumVariants; ++j) { os << "\t\t\t\t" << getVariantName(static_cast(j)) - << " : " << tot_time[j] << std::endl; + << " :" << std::endl; + for (size_t t = 0; t < tot_time[j].size(); ++t) { + os << "\t\t\t\t\t" << tot_time[j][t] << std::endl; + } } os << "\t\t\t checksum: " << std::endl; for (unsigned j = 0; j < NumVariants; ++j) { os << "\t\t\t\t" << getVariantName(static_cast(j)) - << " : " << checksum[j] << std::endl; + << " :" << std::endl; + for (size_t t = 0; t < checksum[j].size(); ++t) { + os << "\t\t\t\t\t" << checksum[j][t] << std::endl; + } } os << std::endl; } diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index f8f311b34..6e4b63feb 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -27,6 +27,7 @@ #endif #include +#include #include #include @@ -56,8 +57,6 @@ class KernelBase void setDefaultProblemSize(Index_type size) { default_prob_size = size; } void setActualProblemSize(Index_type size) { actual_prob_size = size; } - void setDefaultGPUBlockSize(size_t size) { default_gpu_block_size = size; } - void setActualGPUBlockSize(size_t size) { actual_gpu_block_size = size; } void setDefaultReps(Index_type reps) { default_reps = reps; } void setItsPerRep(Index_type its) { its_per_rep = its; }; void setKernelsPerRep(Index_type nkerns) { kernels_per_rep = nkerns; }; @@ -66,6 +65,27 @@ class KernelBase void setUsesFeature(FeatureID fid) { uses_feature[fid] = true; } void setVariantDefined(VariantID vid); + void addVariantTuningName(VariantID vid, std::string name) + { 
variant_tuning_names[vid].emplace_back(std::move(name)); } + + virtual void setSeqTuningDefinitions(VariantID vid) + { addVariantTuningName(vid, "default"); } +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + virtual void setOpenMPTuningDefinitions(VariantID vid) + { addVariantTuningName(vid, "default"); } +#endif +#if defined(RAJA_ENABLE_CUDA) + virtual void setCudaTuningDefinitions(VariantID vid) + { addVariantTuningName(vid, "default"); } +#endif +#if defined(RAJA_ENABLE_HIP) + virtual void setHipTuningDefinitions(VariantID vid) + { addVariantTuningName(vid, "default"); } +#endif +#if defined(RAJA_ENABLE_TARGET_OPENMP) + virtual void setOpenMPTargetTuningDefinitions(VariantID vid) + { addVariantTuningName(vid, "default"); } +#endif // // Getter methods used to generate kernel execution summary @@ -74,8 +94,6 @@ class KernelBase Index_type getDefaultProblemSize() const { return default_prob_size; } Index_type getActualProblemSize() const { return actual_prob_size; } - size_t getDefaultGPUBlockSize() const { return default_gpu_block_size; } - size_t getActualGPUBlockSize() const { return actual_gpu_block_size; } Index_type getDefaultReps() const { return default_reps; } Index_type getItsPerRep() const { return its_per_rep; }; Index_type getKernelsPerRep() const { return kernels_per_rep; }; @@ -88,35 +106,56 @@ class KernelBase bool usesFeature(FeatureID fid) const { return uses_feature[fid]; }; bool hasVariantDefined(VariantID vid) const - { return has_variant_defined[vid]; } - - std::string getVariantName(VariantID vid) const - { - if (isVariantGPU(vid) && actual_gpu_block_size > 0) { - return rajaperf::getVariantName(vid) + std::string("_") + std::to_string(actual_gpu_block_size); - } else { - return rajaperf::getVariantName(vid); + { return !variant_tuning_names[vid].empty(); } + bool hasVariantTuningDefined(VariantID vid, size_t tid) const + { + if (hasVariantDefined(vid) && tid < getNumVariantTunings(vid)) { + return true; + } + return false; } - } - - virtual bool isGPUBlockSizeSupported() const - { - return default_gpu_block_size == actual_gpu_block_size; - } + bool hasVariantTuningDefined(VariantID vid, std::string const& tuning_name) const + { + if (hasVariantDefined(vid)) { + for (std::string const& a_tuning_name : getVariantTuningNames(vid)) { + if (tuning_name == a_tuning_name) { return true; } + } + } + return false; + } + size_t getVariantTuningIndex(VariantID vid, std::string const& tuning_name) const + { + size_t t = 0; + for (std::string const& a_tuning_name : getVariantTuningNames(vid)) { + if (tuning_name == a_tuning_name) { return t; } + } + return std::numeric_limits::max(); + } + size_t getNumVariantTunings(VariantID vid) const + { return variant_tuning_names[vid].size(); } + std::string const& getVariantTuningName(VariantID vid, size_t tid) const + { return variant_tuning_names[vid].at(tid); } + std::vector const& getVariantTuningNames(VariantID vid) const + { return variant_tuning_names[vid]; } // // Methods to get information about kernel execution for reports // containing kernel execution information // - bool wasVariantRun(VariantID vid) const - { return num_exec[vid] > 0; } + bool wasVariantTuningRun(VariantID vid, size_t tid) const + { + if (tid != std::numeric_limits::max()) { + return num_exec[vid].at(tid) > 0; + } + return false; + } - double getMinTime(VariantID vid) const { return min_time[vid]; } - double getMaxTime(VariantID vid) const { return max_time[vid]; } - double getTotTime(VariantID vid) { return tot_time[vid]; } - Checksum_type 
getChecksum(VariantID vid) const { return checksum[vid]; } + double getMinTime(VariantID vid, size_t tid) const { return min_time[vid].at(tid); } + double getMaxTime(VariantID vid, size_t tid) const { return max_time[vid].at(tid); } + double getTotTime(VariantID vid, size_t tid) { return tot_time[vid].at(tid); } + Checksum_type getChecksum(VariantID vid, size_t tid) const { return checksum[vid].at(tid); } - void execute(VariantID vid); + void execute(VariantID vid, size_t tid); void synchronize() { @@ -163,30 +202,30 @@ class KernelBase virtual void print(std::ostream& os) const; - virtual void runKernel(VariantID vid); + virtual void runKernel(VariantID vid, size_t tid); virtual void setUp(VariantID vid) = 0; - virtual void updateChecksum(VariantID vid) = 0; + virtual void updateChecksum(VariantID vid, size_t tid) = 0; virtual void tearDown(VariantID vid) = 0; - virtual void runSeqVariant(VariantID vid) = 0; + virtual void runSeqVariant(VariantID vid, size_t tid) = 0; #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - virtual void runOpenMPVariant(VariantID vid) = 0; + virtual void runOpenMPVariant(VariantID vid, size_t tid) = 0; #endif #if defined(RAJA_ENABLE_CUDA) - virtual void runCudaVariant(VariantID vid) = 0; + virtual void runCudaVariant(VariantID vid, size_t tid) = 0; #endif #if defined(RAJA_ENABLE_HIP) - virtual void runHipVariant(VariantID vid) = 0; + virtual void runHipVariant(VariantID vid, size_t tid) = 0; #endif #if defined(RAJA_ENABLE_TARGET_OPENMP) - virtual void runOpenMPTargetVariant(VariantID vid) = 0; + virtual void runOpenMPTargetVariant(VariantID vid, size_t tid) = 0; #endif protected: const RunParams& run_params; - Checksum_type checksum[NumVariants]; + std::vector checksum[NumVariants]; Checksum_type checksum_scale_factor; private: @@ -202,14 +241,12 @@ class KernelBase Index_type default_prob_size; Index_type default_reps; - size_t default_gpu_block_size; Index_type actual_prob_size; - size_t actual_gpu_block_size; bool uses_feature[NumFeatures]; - bool has_variant_defined[NumVariants]; + std::vector variant_tuning_names[NumVariants]; // // Properties of kernel dependent on how kernel is run @@ -220,14 +257,15 @@ class KernelBase Index_type FLOPs_per_rep; VariantID running_variant; + size_t running_tuning; - int num_exec[NumVariants]; + std::vector num_exec[NumVariants]; RAJA::Timer timer; - RAJA::Timer::ElapsedType min_time[NumVariants]; - RAJA::Timer::ElapsedType max_time[NumVariants]; - RAJA::Timer::ElapsedType tot_time[NumVariants]; + std::vector min_time[NumVariants]; + std::vector max_time[NumVariants]; + std::vector tot_time[NumVariants]; }; } // closing brace for rajaperf namespace diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 09878c378..2cdeafc06 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -34,7 +34,7 @@ RunParams::RunParams(int argc, char** argv) size_meaning(SizeMeaning::Unset), size(0.0), size_factor(0.0), - gpu_block_size(0), + gpu_block_sizes(), pf_tol(0.1), checkrun_reps(1), reference_variant(), @@ -98,7 +98,10 @@ void RunParams::print(std::ostream& str) const str << "\n size_meaning = " << SizeMeaningToStr(getSizeMeaning()); str << "\n size = " << size; str << "\n size_factor = " << size_factor; - str << "\n gpu_block_size = " << gpu_block_size; + str << "\n gpu_block_sizes = "; + for (size_t j = 0; j < gpu_block_sizes.size(); ++j) { + str << "\n\t" << gpu_block_sizes[j]; + } str << "\n pf_tol = " << pf_tol; str << "\n checkrun_reps = " << checkrun_reps; str << "\n reference_variant = " << 
reference_variant; @@ -315,18 +318,31 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) } else if ( opt == std::string("--gpu_block_size") ) { + bool got_someting = false; + bool done = false; i++; - if ( i < argc ) { - gpu_block_size = ::atoi( argv[i] ); - if ( gpu_block_size <= 0 ) { - std::cout << "\nBad input:" - << " must give --gpu_block_size a POSITIVE value (int)" - << std::endl; - input_state = BadInput; + while ( i < argc && !done ) { + opt = std::string(argv[i]); + if ( opt.at(0) == '-' ) { + i--; + done = true; + } else { + got_someting = true; + int gpu_block_size = ::atoi( opt ); + if ( gpu_block_size <= 0 ) { + std::cout << "\nBad input:" + << " must give --gpu_block_size POSITIVE values (int)" + << std::endl; + input_state = BadInput; + } else { + gpu_block_sizes.push_back(gpu_block_size); + } + ++i; } - } else { + } + if (!got_someting) { std::cout << "\nBad input:" - << " must give --gpu_block_size a value (int)" + << " must give --gpu_block_size one or more values (int)" << std::endl; input_state = BadInput; } @@ -574,12 +590,12 @@ void RunParams::printHelpMessage(std::ostream& str) const str << "\t\t Example...\n" << "\t\t --size 1000000 (runs kernels with size ~1,000,000)\n\n"; - str << "\t --gpu_block_size [no default]\n" - << "\t (block size to run for all GPU kernels)\n" + str << "\t --gpu_block_size [no default]\n" + << "\t (block sizes to run for all GPU kernels)\n" << "\t (GPU kernels not supporting gpu_block_size will be skipped)\n" << "\t (Support is determined by kernel implementation and cmake variable RAJA_PERFSUITE_GPU_BLOCKSIZES)\n"; str << "\t\t Example...\n" - << "\t\t --gpu_block_size 256 (runs kernels with gpu_block_size 256)\n\n"; + << "\t\t --gpu_block_size 128 256 512 (runs kernels with gpu_block_size 128, 256, and 512)\n\n"; str << "\t --pass-fail-tol, -pftol [default is 0.1; i.e., 10%]\n" << "\t (slowdown tolerance for RAJA vs. 
Base variants in FOM report)\n"; diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp index 6e5897164..4856e20ba 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -125,8 +125,16 @@ class RunParams { double getSizeFactor() const { return size_factor; } - size_t getGPUBlockSize() const { return gpu_block_size; } - void setGPUBlockSize(size_t block_size) { gpu_block_size = block_size; } + size_t numValidGPUBlockSize() const { return gpu_block_sizes.size(); } + bool validGPUBlockSize(size_t block_size) const + { + for (size_t valid_block_size : gpu_block_sizes) { + if (valid_block_size == block_size) { + return true; + } + } + return false; + } double getPFTolerance() const { return pf_tol; } @@ -224,8 +232,7 @@ class RunParams { SizeMeaning size_meaning; /*!< meaning of size value */ double size; /*!< kernel size to run (input option) */ double size_factor; /*!< default kernel size multipier (input option) */ - - size_t gpu_block_size; /*!< GPU block size to run (input option) */ + std::vector gpu_block_sizes; /*!< Block sizes for gpu tunings to run (input option) */ double pf_tol; /*!< pct RAJA variant run time can exceed base for each PM case to pass/fail acceptance */ From f20720e99571997d91af95217468a8975e33428e Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 23 Feb 2022 10:52:36 -0800 Subject: [PATCH 212/392] Add tid arg to methods --- src/algorithm/SORT-Cuda.cpp | 2 +- src/algorithm/SORT-Hip.cpp | 2 +- src/algorithm/SORT-OMP.cpp | 2 +- src/algorithm/SORT-Seq.cpp | 2 +- src/algorithm/SORT.cpp | 10 +++----- src/algorithm/SORT.hpp | 18 +++++++-------- src/algorithm/SORTPAIRS-Cuda.cpp | 2 +- src/algorithm/SORTPAIRS-Hip.cpp | 2 +- src/algorithm/SORTPAIRS-OMP.cpp | 2 +- src/algorithm/SORTPAIRS-Seq.cpp | 2 +- src/algorithm/SORTPAIRS.cpp | 10 +++----- src/algorithm/SORTPAIRS.hpp | 18 +++++++-------- src/apps/DEL_DOT_VEC_2D-Cuda.cpp | 2 +- src/apps/DEL_DOT_VEC_2D-Hip.cpp | 2 +- src/apps/DEL_DOT_VEC_2D-OMP.cpp | 2 +- src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp | 2 +- src/apps/DEL_DOT_VEC_2D-Seq.cpp | 2 +- src/apps/DEL_DOT_VEC_2D.cpp | 10 +++----- src/apps/DEL_DOT_VEC_2D.hpp | 23 ++++++++++--------- src/apps/DIFFUSION3DPA-Cuda.cpp | 2 +- src/apps/DIFFUSION3DPA-Hip.cpp | 2 +- src/apps/DIFFUSION3DPA-OMP.cpp | 2 +- src/apps/DIFFUSION3DPA-OMPTarget.cpp | 2 +- src/apps/DIFFUSION3DPA-Seq.cpp | 2 +- src/apps/DIFFUSION3DPA.cpp | 10 +++----- src/apps/DIFFUSION3DPA.hpp | 18 +++++++-------- src/apps/ENERGY-Cuda.cpp | 2 +- src/apps/ENERGY-Hip.cpp | 2 +- src/apps/ENERGY-OMP.cpp | 2 +- src/apps/ENERGY-OMPTarget.cpp | 2 +- src/apps/ENERGY-Seq.cpp | 2 +- src/apps/ENERGY.cpp | 10 +++----- src/apps/ENERGY.hpp | 23 ++++++++++--------- src/apps/FIR-Cuda.cpp | 2 +- src/apps/FIR-Hip.cpp | 2 +- src/apps/FIR-OMP.cpp | 2 +- src/apps/FIR-OMPTarget.cpp | 2 +- src/apps/FIR-Seq.cpp | 2 +- src/apps/FIR.cpp | 10 +++----- src/apps/FIR.hpp | 23 ++++++++++--------- src/apps/HALOEXCHANGE-Cuda.cpp | 2 +- src/apps/HALOEXCHANGE-Hip.cpp | 2 +- src/apps/HALOEXCHANGE-OMP.cpp | 2 +- src/apps/HALOEXCHANGE-OMPTarget.cpp | 2 +- src/apps/HALOEXCHANGE-Seq.cpp | 2 +- src/apps/HALOEXCHANGE.cpp | 10 +++----- src/apps/HALOEXCHANGE.hpp | 23 ++++++++++--------- src/apps/HALOEXCHANGE_FUSED-Cuda.cpp | 2 +- src/apps/HALOEXCHANGE_FUSED-Hip.cpp | 2 +- src/apps/HALOEXCHANGE_FUSED-OMP.cpp | 2 +- src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp | 2 +- src/apps/HALOEXCHANGE_FUSED-Seq.cpp | 2 +- src/apps/HALOEXCHANGE_FUSED.cpp | 10 +++----- src/apps/HALOEXCHANGE_FUSED.hpp | 23 ++++++++++--------- src/apps/LTIMES-Cuda.cpp | 2 +- 
src/apps/LTIMES-Hip.cpp | 2 +- src/apps/LTIMES-OMP.cpp | 2 +- src/apps/LTIMES-OMPTarget.cpp | 2 +- src/apps/LTIMES-Seq.cpp | 2 +- src/apps/LTIMES.cpp | 10 +++----- src/apps/LTIMES.hpp | 23 ++++++++++--------- src/apps/LTIMES_NOVIEW-Cuda.cpp | 2 +- src/apps/LTIMES_NOVIEW-Hip.cpp | 2 +- src/apps/LTIMES_NOVIEW-OMP.cpp | 2 +- src/apps/LTIMES_NOVIEW-OMPTarget.cpp | 2 +- src/apps/LTIMES_NOVIEW-Seq.cpp | 2 +- src/apps/LTIMES_NOVIEW.cpp | 10 +++----- src/apps/LTIMES_NOVIEW.hpp | 23 ++++++++++--------- src/apps/MASS3DPA-Cuda.cpp | 2 +- src/apps/MASS3DPA-Hip.cpp | 2 +- src/apps/MASS3DPA-OMP.cpp | 2 +- src/apps/MASS3DPA-OMPTarget.cpp | 2 +- src/apps/MASS3DPA-Seq.cpp | 2 +- src/apps/MASS3DPA.cpp | 10 +++----- src/apps/MASS3DPA.hpp | 18 +++++++-------- src/apps/PRESSURE-Cuda.cpp | 2 +- src/apps/PRESSURE-Hip.cpp | 2 +- src/apps/PRESSURE-OMP.cpp | 2 +- src/apps/PRESSURE-OMPTarget.cpp | 2 +- src/apps/PRESSURE-Seq.cpp | 2 +- src/apps/PRESSURE.cpp | 10 +++----- src/apps/PRESSURE.hpp | 23 ++++++++++--------- src/apps/VOL3D-Cuda.cpp | 2 +- src/apps/VOL3D-Hip.cpp | 2 +- src/apps/VOL3D-OMP.cpp | 2 +- src/apps/VOL3D-OMPTarget.cpp | 2 +- src/apps/VOL3D-Seq.cpp | 2 +- src/apps/VOL3D.cpp | 10 +++----- src/apps/VOL3D.hpp | 23 ++++++++++--------- src/apps/WIP-COUPLE.cpp | 6 ++--- src/apps/WIP-COUPLE.hpp | 18 +++++++-------- src/basic/DAXPY-Hip.cpp | 2 +- src/basic/DAXPY-OMP.cpp | 2 +- src/basic/DAXPY-OMPTarget.cpp | 2 +- src/basic/DAXPY-Seq.cpp | 2 +- src/basic/DAXPY.cpp | 4 ++-- src/basic/DAXPY.hpp | 5 ++-- src/basic/IF_QUAD-Cuda.cpp | 2 +- src/basic/IF_QUAD-Hip.cpp | 2 +- src/basic/IF_QUAD-OMP.cpp | 2 +- src/basic/IF_QUAD-OMPTarget.cpp | 2 +- src/basic/IF_QUAD-Seq.cpp | 2 +- src/basic/IF_QUAD.cpp | 10 +++----- src/basic/IF_QUAD.hpp | 23 ++++++++++--------- src/basic/INIT3-Cuda.cpp | 2 +- src/basic/INIT3-Hip.cpp | 2 +- src/basic/INIT3-OMP.cpp | 2 +- src/basic/INIT3-OMPTarget.cpp | 2 +- src/basic/INIT3-Seq.cpp | 2 +- src/basic/INIT3.cpp | 10 +++----- src/basic/INIT3.hpp | 23 ++++++++++--------- src/basic/INIT_VIEW1D-Cuda.cpp | 2 +- src/basic/INIT_VIEW1D-Hip.cpp | 2 +- src/basic/INIT_VIEW1D-OMP.cpp | 2 +- src/basic/INIT_VIEW1D-OMPTarget.cpp | 2 +- src/basic/INIT_VIEW1D-Seq.cpp | 2 +- src/basic/INIT_VIEW1D.cpp | 10 +++----- src/basic/INIT_VIEW1D.hpp | 23 ++++++++++--------- src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET-Hip.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET-OMP.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET-Seq.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET.cpp | 10 +++----- src/basic/INIT_VIEW1D_OFFSET.hpp | 23 ++++++++++--------- src/basic/MAT_MAT_SHARED-Cuda.cpp | 2 +- src/basic/MAT_MAT_SHARED-Hip.cpp | 2 +- src/basic/MAT_MAT_SHARED-OMP.cpp | 2 +- src/basic/MAT_MAT_SHARED-OMPTarget.cpp | 2 +- src/basic/MAT_MAT_SHARED-Seq.cpp | 2 +- src/basic/MAT_MAT_SHARED.cpp | 10 +++----- src/basic/MAT_MAT_SHARED.hpp | 23 ++++++++++--------- src/basic/MULADDSUB-Cuda.cpp | 2 +- src/basic/MULADDSUB-Hip.cpp | 2 +- src/basic/MULADDSUB-OMP.cpp | 2 +- src/basic/MULADDSUB-OMPTarget.cpp | 2 +- src/basic/MULADDSUB-Seq.cpp | 2 +- src/basic/MULADDSUB.cpp | 10 +++----- src/basic/MULADDSUB.hpp | 23 ++++++++++--------- src/basic/NESTED_INIT-Cuda.cpp | 2 +- src/basic/NESTED_INIT-Hip.cpp | 2 +- src/basic/NESTED_INIT-OMP.cpp | 2 +- src/basic/NESTED_INIT-OMPTarget.cpp | 2 +- src/basic/NESTED_INIT-Seq.cpp | 2 +- src/basic/NESTED_INIT.cpp | 10 +++----- src/basic/NESTED_INIT.hpp | 23 ++++++++++--------- src/basic/PI_ATOMIC-Cuda.cpp | 2 +- src/basic/PI_ATOMIC-Hip.cpp | 2 +- 
src/basic/PI_ATOMIC-OMP.cpp | 2 +- src/basic/PI_ATOMIC-OMPTarget.cpp | 2 +- src/basic/PI_ATOMIC-Seq.cpp | 2 +- src/basic/PI_ATOMIC.cpp | 10 +++----- src/basic/PI_ATOMIC.hpp | 23 ++++++++++--------- src/basic/PI_REDUCE-Cuda.cpp | 2 +- src/basic/PI_REDUCE-Hip.cpp | 2 +- src/basic/PI_REDUCE-OMP.cpp | 2 +- src/basic/PI_REDUCE-OMPTarget.cpp | 2 +- src/basic/PI_REDUCE-Seq.cpp | 2 +- src/basic/PI_REDUCE.cpp | 10 +++----- src/basic/PI_REDUCE.hpp | 23 ++++++++++--------- src/basic/REDUCE3_INT-Cuda.cpp | 2 +- src/basic/REDUCE3_INT-Hip.cpp | 2 +- src/basic/REDUCE3_INT-OMP.cpp | 2 +- src/basic/REDUCE3_INT-OMPTarget.cpp | 2 +- src/basic/REDUCE3_INT-Seq.cpp | 2 +- src/basic/REDUCE3_INT.cpp | 10 +++----- src/basic/REDUCE3_INT.hpp | 23 ++++++++++--------- src/basic/TRAP_INT-Cuda.cpp | 2 +- src/basic/TRAP_INT-Hip.cpp | 2 +- src/basic/TRAP_INT-OMP.cpp | 2 +- src/basic/TRAP_INT-OMPTarget.cpp | 2 +- src/basic/TRAP_INT-Seq.cpp | 2 +- src/basic/TRAP_INT.cpp | 10 +++----- src/basic/TRAP_INT.hpp | 23 ++++++++++--------- src/common/KernelBase.cpp | 4 ++-- src/common/KernelBase.hpp | 4 ++-- src/lcals/DIFF_PREDICT-Cuda.cpp | 2 +- src/lcals/DIFF_PREDICT-Hip.cpp | 2 +- src/lcals/DIFF_PREDICT-OMP.cpp | 2 +- src/lcals/DIFF_PREDICT-OMPTarget.cpp | 2 +- src/lcals/DIFF_PREDICT-Seq.cpp | 2 +- src/lcals/DIFF_PREDICT.cpp | 10 +++----- src/lcals/DIFF_PREDICT.hpp | 23 ++++++++++--------- src/lcals/EOS-Cuda.cpp | 2 +- src/lcals/EOS-Hip.cpp | 2 +- src/lcals/EOS-OMP.cpp | 2 +- src/lcals/EOS-OMPTarget.cpp | 2 +- src/lcals/EOS-Seq.cpp | 2 +- src/lcals/EOS.cpp | 10 +++----- src/lcals/EOS.hpp | 23 ++++++++++--------- src/lcals/FIRST_DIFF-Cuda.cpp | 2 +- src/lcals/FIRST_DIFF-Hip.cpp | 2 +- src/lcals/FIRST_DIFF-OMP.cpp | 2 +- src/lcals/FIRST_DIFF-OMPTarget.cpp | 2 +- src/lcals/FIRST_DIFF-Seq.cpp | 2 +- src/lcals/FIRST_DIFF.cpp | 10 +++----- src/lcals/FIRST_DIFF.hpp | 23 ++++++++++--------- src/lcals/FIRST_MIN-Cuda.cpp | 2 +- src/lcals/FIRST_MIN-Hip.cpp | 2 +- src/lcals/FIRST_MIN-OMP.cpp | 2 +- src/lcals/FIRST_MIN-OMPTarget.cpp | 2 +- src/lcals/FIRST_MIN-Seq.cpp | 2 +- src/lcals/FIRST_MIN.cpp | 10 +++----- src/lcals/FIRST_MIN.hpp | 23 ++++++++++--------- src/lcals/FIRST_SUM-Cuda.cpp | 2 +- src/lcals/FIRST_SUM-Hip.cpp | 2 +- src/lcals/FIRST_SUM-OMP.cpp | 2 +- src/lcals/FIRST_SUM-OMPTarget.cpp | 2 +- src/lcals/FIRST_SUM-Seq.cpp | 2 +- src/lcals/FIRST_SUM.cpp | 10 +++----- src/lcals/FIRST_SUM.hpp | 23 ++++++++++--------- src/lcals/GEN_LIN_RECUR-Cuda.cpp | 2 +- src/lcals/GEN_LIN_RECUR-Hip.cpp | 2 +- src/lcals/GEN_LIN_RECUR-OMP.cpp | 2 +- src/lcals/GEN_LIN_RECUR-OMPTarget.cpp | 2 +- src/lcals/GEN_LIN_RECUR-Seq.cpp | 2 +- src/lcals/GEN_LIN_RECUR.cpp | 10 +++----- src/lcals/GEN_LIN_RECUR.hpp | 23 ++++++++++--------- src/lcals/HYDRO_1D-Cuda.cpp | 2 +- src/lcals/HYDRO_1D-Hip.cpp | 2 +- src/lcals/HYDRO_1D-OMP.cpp | 2 +- src/lcals/HYDRO_1D-OMPTarget.cpp | 2 +- src/lcals/HYDRO_1D-Seq.cpp | 2 +- src/lcals/HYDRO_1D.cpp | 10 +++----- src/lcals/HYDRO_1D.hpp | 23 ++++++++++--------- src/lcals/HYDRO_2D-Cuda.cpp | 2 +- src/lcals/HYDRO_2D-Hip.cpp | 2 +- src/lcals/HYDRO_2D-OMP.cpp | 2 +- src/lcals/HYDRO_2D-OMPTarget.cpp | 2 +- src/lcals/HYDRO_2D-Seq.cpp | 2 +- src/lcals/HYDRO_2D.cpp | 10 +++----- src/lcals/HYDRO_2D.hpp | 23 ++++++++++--------- src/lcals/INT_PREDICT-Cuda.cpp | 2 +- src/lcals/INT_PREDICT-Hip.cpp | 2 +- src/lcals/INT_PREDICT-OMP.cpp | 2 +- src/lcals/INT_PREDICT-OMPTarget.cpp | 2 +- src/lcals/INT_PREDICT-Seq.cpp | 2 +- src/lcals/INT_PREDICT.cpp | 10 +++----- src/lcals/INT_PREDICT.hpp | 23 ++++++++++--------- src/lcals/PLANCKIAN-Cuda.cpp | 2 +- 
 src/lcals/PLANCKIAN-Hip.cpp | 2 +-
 src/lcals/PLANCKIAN-OMP.cpp | 2 +-
 src/lcals/PLANCKIAN-OMPTarget.cpp | 2 +-
 src/lcals/PLANCKIAN-Seq.cpp | 2 +-
 src/lcals/PLANCKIAN.cpp | 10 +++-----
 src/lcals/PLANCKIAN.hpp | 23 ++++++++++---------
 src/lcals/TRIDIAG_ELIM-Cuda.cpp | 2 +-
 src/lcals/TRIDIAG_ELIM-Hip.cpp | 2 +-
 src/lcals/TRIDIAG_ELIM-OMP.cpp | 2 +-
 src/lcals/TRIDIAG_ELIM-OMPTarget.cpp | 2 +-
 src/lcals/TRIDIAG_ELIM-Seq.cpp | 2 +-
 src/lcals/TRIDIAG_ELIM.cpp | 10 +++-----
 src/lcals/TRIDIAG_ELIM.hpp | 23 ++++++++++---------
 src/polybench/POLYBENCH_2MM-Cuda.cpp | 2 +-
 src/polybench/POLYBENCH_2MM-Hip.cpp | 2 +-
 src/polybench/POLYBENCH_2MM-OMP.cpp | 2 +-
 src/polybench/POLYBENCH_2MM-OMPTarget.cpp | 2 +-
 src/polybench/POLYBENCH_2MM-Seq.cpp | 2 +-
 src/polybench/POLYBENCH_2MM.cpp | 10 +++-----
 src/polybench/POLYBENCH_2MM.hpp | 23 ++++++++++---------
 src/polybench/POLYBENCH_3MM-Cuda.cpp | 2 +-
 src/polybench/POLYBENCH_3MM-Hip.cpp | 2 +-
 src/polybench/POLYBENCH_3MM-OMP.cpp | 2 +-
 src/polybench/POLYBENCH_3MM-OMPTarget.cpp | 2 +-
 src/polybench/POLYBENCH_3MM-Seq.cpp | 2 +-
 src/polybench/POLYBENCH_3MM.cpp | 10 +++-----
 src/polybench/POLYBENCH_3MM.hpp | 23 ++++++++++---------
 src/polybench/POLYBENCH_ADI-Cuda.cpp | 2 +-
 src/polybench/POLYBENCH_ADI-Hip.cpp | 2 +-
 src/polybench/POLYBENCH_ADI-OMP.cpp | 2 +-
 src/polybench/POLYBENCH_ADI-OMPTarget.cpp | 2 +-
 src/polybench/POLYBENCH_ADI-Seq.cpp | 2 +-
 src/polybench/POLYBENCH_ADI.cpp | 10 +++-----
 src/polybench/POLYBENCH_ADI.hpp | 23 ++++++++++---------
 src/polybench/POLYBENCH_ATAX-Cuda.cpp | 2 +-
 src/polybench/POLYBENCH_ATAX-Hip.cpp | 2 +-
 src/polybench/POLYBENCH_ATAX-OMP.cpp | 2 +-
 src/polybench/POLYBENCH_ATAX-OMPTarget.cpp | 2 +-
 src/polybench/POLYBENCH_ATAX-Seq.cpp | 2 +-
 src/polybench/POLYBENCH_ATAX.cpp | 10 +++-----
 src/polybench/POLYBENCH_ATAX.hpp | 23 ++++++++++---------
 src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp | 2 +-
 src/polybench/POLYBENCH_FDTD_2D-Hip.cpp | 2 +-
 src/polybench/POLYBENCH_FDTD_2D-OMP.cpp | 2 +-
 src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp | 2 +-
 src/polybench/POLYBENCH_FDTD_2D-Seq.cpp | 2 +-
 src/polybench/POLYBENCH_FDTD_2D.cpp | 10 +++-----
 src/polybench/POLYBENCH_FDTD_2D.hpp | 23 ++++++++++---------
 .../POLYBENCH_FLOYD_WARSHALL-Cuda.cpp | 2 +-
 .../POLYBENCH_FLOYD_WARSHALL-Hip.cpp | 2 +-
 .../POLYBENCH_FLOYD_WARSHALL-OMP.cpp | 2 +-
 .../POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp | 2 +-
 .../POLYBENCH_FLOYD_WARSHALL-Seq.cpp | 2 +-
 src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp | 10 +++-----
 src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp | 23 ++++++++++---------
 src/polybench/POLYBENCH_GEMM-Cuda.cpp | 2 +-
 src/polybench/POLYBENCH_GEMM-Hip.cpp | 2 +-
 src/polybench/POLYBENCH_GEMM-OMP.cpp | 2 +-
 src/polybench/POLYBENCH_GEMM-OMPTarget.cpp | 2 +-
 src/polybench/POLYBENCH_GEMM-Seq.cpp | 2 +-
 src/polybench/POLYBENCH_GEMM.cpp | 10 +++-----
 src/polybench/POLYBENCH_GEMM.hpp | 23 ++++++++++---------
 src/polybench/POLYBENCH_GEMVER-Cuda.cpp | 2 +-
 src/polybench/POLYBENCH_GEMVER-Hip.cpp | 2 +-
 src/polybench/POLYBENCH_GEMVER-OMP.cpp | 2 +-
 src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp | 2 +-
 src/polybench/POLYBENCH_GEMVER-Seq.cpp | 2 +-
 src/polybench/POLYBENCH_GEMVER.cpp | 10 +++-----
 src/polybench/POLYBENCH_GEMVER.hpp | 23 ++++++++++---------
 src/polybench/POLYBENCH_GESUMMV-Cuda.cpp | 2 +-
 src/polybench/POLYBENCH_GESUMMV-Hip.cpp | 2 +-
 src/polybench/POLYBENCH_GESUMMV-OMP.cpp | 2 +-
 src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp | 2 +-
 src/polybench/POLYBENCH_GESUMMV-Seq.cpp | 2 +-
 src/polybench/POLYBENCH_GESUMMV.cpp | 10 +++-----
 src/polybench/POLYBENCH_GESUMMV.hpp | 23
++++++++++--------- src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp | 2 +- src/polybench/POLYBENCH_HEAT_3D-Hip.cpp | 2 +- src/polybench/POLYBENCH_HEAT_3D-OMP.cpp | 2 +- src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_HEAT_3D-Seq.cpp | 2 +- src/polybench/POLYBENCH_HEAT_3D.cpp | 10 +++----- src/polybench/POLYBENCH_HEAT_3D.hpp | 23 ++++++++++--------- src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp | 2 +- .../POLYBENCH_JACOBI_1D-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_1D.cpp | 10 +++----- src/polybench/POLYBENCH_JACOBI_1D.hpp | 23 ++++++++++--------- src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp | 2 +- .../POLYBENCH_JACOBI_2D-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_2D.cpp | 10 +++----- src/polybench/POLYBENCH_JACOBI_2D.hpp | 23 ++++++++++--------- src/polybench/POLYBENCH_MVT-Cuda.cpp | 2 +- src/polybench/POLYBENCH_MVT-Hip.cpp | 2 +- src/polybench/POLYBENCH_MVT-OMP.cpp | 2 +- src/polybench/POLYBENCH_MVT-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_MVT-Seq.cpp | 2 +- src/polybench/POLYBENCH_MVT.cpp | 10 +++----- src/polybench/POLYBENCH_MVT.hpp | 23 ++++++++++--------- src/stream/ADD-Cuda.cpp | 2 +- src/stream/ADD-Hip.cpp | 2 +- src/stream/ADD-OMP.cpp | 2 +- src/stream/ADD-OMPTarget.cpp | 2 +- src/stream/ADD-Seq.cpp | 2 +- src/stream/ADD.cpp | 10 +++----- src/stream/ADD.hpp | 23 ++++++++++--------- src/stream/COPY-Cuda.cpp | 2 +- src/stream/COPY-Hip.cpp | 2 +- src/stream/COPY-OMP.cpp | 2 +- src/stream/COPY-OMPTarget.cpp | 2 +- src/stream/COPY-Seq.cpp | 2 +- src/stream/COPY.cpp | 10 +++----- src/stream/COPY.hpp | 23 ++++++++++--------- src/stream/DOT-Cuda.cpp | 2 +- src/stream/DOT-Hip.cpp | 2 +- src/stream/DOT-OMP.cpp | 2 +- src/stream/DOT-OMPTarget.cpp | 2 +- src/stream/DOT-Seq.cpp | 2 +- src/stream/DOT.cpp | 10 +++----- src/stream/DOT.hpp | 23 ++++++++++--------- src/stream/MUL-Cuda.cpp | 2 +- src/stream/MUL-Hip.cpp | 2 +- src/stream/MUL-OMP.cpp | 2 +- src/stream/MUL-OMPTarget.cpp | 2 +- src/stream/MUL-Seq.cpp | 2 +- src/stream/MUL.cpp | 10 +++----- src/stream/MUL.hpp | 23 ++++++++++--------- src/stream/TRIAD-Cuda.cpp | 2 +- src/stream/TRIAD-Hip.cpp | 2 +- src/stream/TRIAD-OMP.cpp | 2 +- src/stream/TRIAD-OMPTarget.cpp | 2 +- src/stream/TRIAD-Seq.cpp | 2 +- src/stream/TRIAD.cpp | 10 +++----- src/stream/TRIAD.hpp | 23 ++++++++++--------- 379 files changed, 1071 insertions(+), 1233 deletions(-) diff --git a/src/algorithm/SORT-Cuda.cpp b/src/algorithm/SORT-Cuda.cpp index 5870e08b5..9ec7280ec 100644 --- a/src/algorithm/SORT-Cuda.cpp +++ b/src/algorithm/SORT-Cuda.cpp @@ -35,7 +35,7 @@ namespace algorithm deallocCudaDeviceData(x); -void SORT::runCudaVariant(VariantID vid) +void SORT::runCudaVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/algorithm/SORT-Hip.cpp b/src/algorithm/SORT-Hip.cpp index edf143cbc..a5b4030fb 100644 --- a/src/algorithm/SORT-Hip.cpp +++ b/src/algorithm/SORT-Hip.cpp @@ -35,7 +35,7 @@ namespace algorithm deallocHipDeviceData(x); -void SORT::runHipVariant(VariantID vid) +void SORT::runHipVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/algorithm/SORT-OMP.cpp b/src/algorithm/SORT-OMP.cpp index 
a83d956ed..dc2493474 100644 --- a/src/algorithm/SORT-OMP.cpp +++ b/src/algorithm/SORT-OMP.cpp @@ -18,7 +18,7 @@ namespace algorithm { -void SORT::runOpenMPVariant(VariantID vid) +void SORT::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/algorithm/SORT-Seq.cpp b/src/algorithm/SORT-Seq.cpp index 4f7094ba6..d20bf6334 100644 --- a/src/algorithm/SORT-Seq.cpp +++ b/src/algorithm/SORT-Seq.cpp @@ -18,7 +18,7 @@ namespace algorithm { -void SORT::runSeqVariant(VariantID vid) +void SORT::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/algorithm/SORT.cpp b/src/algorithm/SORT.cpp index db28e8cc4..8c2b5dbce 100644 --- a/src/algorithm/SORT.cpp +++ b/src/algorithm/SORT.cpp @@ -21,10 +21,6 @@ namespace algorithm SORT::SORT(const RunParams& params) : KernelBase(rajaperf::Algorithm_SORT, params) { - setDefaultGPUBlockSize( default_gpu_block_size ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - setDefaultProblemSize(1000000); setDefaultReps(20); @@ -51,17 +47,17 @@ SORT::~SORT() { } -void SORT::setUp(VariantID vid) +void SORT::setUp(VariantID vid, size_t /*tid*/) { allocAndInitDataRandValue(m_x, getActualProblemSize()*getRunReps(), vid); } -void SORT::updateChecksum(VariantID vid) +void SORT::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_x, getActualProblemSize()*getRunReps()); } -void SORT::tearDown(VariantID vid) +void SORT::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_x); diff --git a/src/algorithm/SORT.hpp b/src/algorithm/SORT.hpp index ba4341ae4..4eb5909bb 100644 --- a/src/algorithm/SORT.hpp +++ b/src/algorithm/SORT.hpp @@ -42,15 +42,15 @@ class SORT : public KernelBase ~SORT(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid) + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid) { getCout() << "\n SORT : Unknown OMP Target variant id = " << vid << std::endl; } diff --git a/src/algorithm/SORTPAIRS-Cuda.cpp b/src/algorithm/SORTPAIRS-Cuda.cpp index aba1111dc..0ef9fb2f9 100644 --- a/src/algorithm/SORTPAIRS-Cuda.cpp +++ b/src/algorithm/SORTPAIRS-Cuda.cpp @@ -38,7 +38,7 @@ namespace algorithm deallocCudaDeviceData(i); -void SORTPAIRS::runCudaVariant(VariantID vid) +void SORTPAIRS::runCudaVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/algorithm/SORTPAIRS-Hip.cpp b/src/algorithm/SORTPAIRS-Hip.cpp index 0850ce650..51c411dcc 100644 --- a/src/algorithm/SORTPAIRS-Hip.cpp +++ b/src/algorithm/SORTPAIRS-Hip.cpp @@ -38,7 +38,7 @@ namespace algorithm deallocHipDeviceData(i); -void SORTPAIRS::runHipVariant(VariantID vid) +void SORTPAIRS::runHipVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git 
a/src/algorithm/SORTPAIRS-OMP.cpp b/src/algorithm/SORTPAIRS-OMP.cpp index 99a432931..630b1a20f 100644 --- a/src/algorithm/SORTPAIRS-OMP.cpp +++ b/src/algorithm/SORTPAIRS-OMP.cpp @@ -18,7 +18,7 @@ namespace algorithm { -void SORTPAIRS::runOpenMPVariant(VariantID vid) +void SORTPAIRS::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/algorithm/SORTPAIRS-Seq.cpp b/src/algorithm/SORTPAIRS-Seq.cpp index f1f9928b6..6ec0c3a4a 100644 --- a/src/algorithm/SORTPAIRS-Seq.cpp +++ b/src/algorithm/SORTPAIRS-Seq.cpp @@ -21,7 +21,7 @@ namespace algorithm { -void SORTPAIRS::runSeqVariant(VariantID vid) +void SORTPAIRS::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/algorithm/SORTPAIRS.cpp b/src/algorithm/SORTPAIRS.cpp index a4e7eabba..6a291bb5c 100644 --- a/src/algorithm/SORTPAIRS.cpp +++ b/src/algorithm/SORTPAIRS.cpp @@ -21,10 +21,6 @@ namespace algorithm SORTPAIRS::SORTPAIRS(const RunParams& params) : KernelBase(rajaperf::Algorithm_SORTPAIRS, params) { - setDefaultGPUBlockSize( default_gpu_block_size ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - setDefaultProblemSize(1000000); setDefaultReps(20); @@ -51,19 +47,19 @@ SORTPAIRS::~SORTPAIRS() { } -void SORTPAIRS::setUp(VariantID vid) +void SORTPAIRS::setUp(VariantID vid, size_t /*tid*/) { allocAndInitDataRandValue(m_x, getActualProblemSize()*getRunReps(), vid); allocAndInitDataRandValue(m_i, getActualProblemSize()*getRunReps(), vid); } -void SORTPAIRS::updateChecksum(VariantID vid) +void SORTPAIRS::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_x, getActualProblemSize()*getRunReps()); checksum[vid] += calcChecksum(m_i, getActualProblemSize()*getRunReps()); } -void SORTPAIRS::tearDown(VariantID vid) +void SORTPAIRS::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_x); diff --git a/src/algorithm/SORTPAIRS.hpp b/src/algorithm/SORTPAIRS.hpp index 03ab9a8fd..1c4406141 100644 --- a/src/algorithm/SORTPAIRS.hpp +++ b/src/algorithm/SORTPAIRS.hpp @@ -41,15 +41,15 @@ class SORTPAIRS : public KernelBase ~SORTPAIRS(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid) + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid) { getCout() << "\n SORTPAIRS : Unknown OMP Target variant id = " << vid << std::endl; } diff --git a/src/apps/DEL_DOT_VEC_2D-Cuda.cpp b/src/apps/DEL_DOT_VEC_2D-Cuda.cpp index 1f4e198ec..2fb5a77f5 100644 --- a/src/apps/DEL_DOT_VEC_2D-Cuda.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Cuda.cpp @@ -162,7 +162,7 @@ void DEL_DOT_VEC_2D::runCudaVariantImpl(VariantID vid) } } -void DEL_DOT_VEC_2D::runCudaVariant(VariantID vid) +void DEL_DOT_VEC_2D::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git 
a/src/apps/DEL_DOT_VEC_2D-Hip.cpp b/src/apps/DEL_DOT_VEC_2D-Hip.cpp index 02ce9f242..0b0d6bca5 100644 --- a/src/apps/DEL_DOT_VEC_2D-Hip.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Hip.cpp @@ -164,7 +164,7 @@ void DEL_DOT_VEC_2D::runHipVariantImpl(VariantID vid) } } -void DEL_DOT_VEC_2D::runHipVariant(VariantID vid) +void DEL_DOT_VEC_2D::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/apps/DEL_DOT_VEC_2D-OMP.cpp b/src/apps/DEL_DOT_VEC_2D-OMP.cpp index 4232646ad..2b3c5b2e3 100644 --- a/src/apps/DEL_DOT_VEC_2D-OMP.cpp +++ b/src/apps/DEL_DOT_VEC_2D-OMP.cpp @@ -22,7 +22,7 @@ namespace apps { -void DEL_DOT_VEC_2D::runOpenMPVariant(VariantID vid) +void DEL_DOT_VEC_2D::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp b/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp index 9be35bbc6..424d940aa 100644 --- a/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp +++ b/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp @@ -51,7 +51,7 @@ namespace apps deallocOpenMPDeviceData(real_zones, did); -void DEL_DOT_VEC_2D::runOpenMPTargetVariant(VariantID vid) +void DEL_DOT_VEC_2D::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/apps/DEL_DOT_VEC_2D-Seq.cpp b/src/apps/DEL_DOT_VEC_2D-Seq.cpp index 208add00b..9be1cc853 100644 --- a/src/apps/DEL_DOT_VEC_2D-Seq.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Seq.cpp @@ -22,7 +22,7 @@ namespace apps { -void DEL_DOT_VEC_2D::runSeqVariant(VariantID vid) +void DEL_DOT_VEC_2D::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/apps/DEL_DOT_VEC_2D.cpp b/src/apps/DEL_DOT_VEC_2D.cpp index 2c332f7ad..f75e44d4e 100644 --- a/src/apps/DEL_DOT_VEC_2D.cpp +++ b/src/apps/DEL_DOT_VEC_2D.cpp @@ -25,10 +25,6 @@ namespace apps DEL_DOT_VEC_2D::DEL_DOT_VEC_2D(const RunParams& params) : KernelBase(rajaperf::Apps_DEL_DOT_VEC_2D, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - setDefaultProblemSize(1000*1000); // See rzmax in ADomain struct setDefaultReps(100); @@ -73,7 +69,7 @@ DEL_DOT_VEC_2D::~DEL_DOT_VEC_2D() delete m_domain; } -void DEL_DOT_VEC_2D::setUp(VariantID vid) +void DEL_DOT_VEC_2D::setUp(VariantID vid, size_t /*tid*/) { allocAndInitDataConst(m_x, m_array_length, 0.0, vid); allocAndInitDataConst(m_y, m_array_length, 0.0, vid); @@ -91,12 +87,12 @@ void DEL_DOT_VEC_2D::setUp(VariantID vid) m_half = 0.5; } -void DEL_DOT_VEC_2D::updateChecksum(VariantID vid) +void DEL_DOT_VEC_2D::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_div, m_array_length); } -void DEL_DOT_VEC_2D::tearDown(VariantID vid) +void DEL_DOT_VEC_2D::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; diff --git a/src/apps/DEL_DOT_VEC_2D.hpp b/src/apps/DEL_DOT_VEC_2D.hpp index 5b155de54..b542af305 100644 --- a/src/apps/DEL_DOT_VEC_2D.hpp +++ b/src/apps/DEL_DOT_VEC_2D.hpp @@ -104,17 +104,18 @@ class DEL_DOT_VEC_2D : public KernelBase ~DEL_DOT_VEC_2D(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/apps/DIFFUSION3DPA-Cuda.cpp b/src/apps/DIFFUSION3DPA-Cuda.cpp index 78aae9605..a751a4c34 100644 --- a/src/apps/DIFFUSION3DPA-Cuda.cpp +++ b/src/apps/DIFFUSION3DPA-Cuda.cpp @@ -120,7 +120,7 @@ __global__ void Diffusion3DPA(Index_type NE, const Real_ptr Basis, } } -void DIFFUSION3DPA::runCudaVariant(VariantID vid) { +void DIFFUSION3DPA::runCudaVariant(VariantID vid, size_t tid) { const Index_type run_reps = getRunReps(); DIFFUSION3DPA_DATA_SETUP; diff --git a/src/apps/DIFFUSION3DPA-Hip.cpp b/src/apps/DIFFUSION3DPA-Hip.cpp index 232900a58..33d7c208d 100644 --- a/src/apps/DIFFUSION3DPA-Hip.cpp +++ b/src/apps/DIFFUSION3DPA-Hip.cpp @@ -120,7 +120,7 @@ __global__ void Diffusion3DPA(Index_type NE, const Real_ptr Basis, } } -void DIFFUSION3DPA::runHipVariant(VariantID vid) { +void DIFFUSION3DPA::runHipVariant(VariantID vid, size_t tid) { const Index_type run_reps = getRunReps(); DIFFUSION3DPA_DATA_SETUP; diff --git a/src/apps/DIFFUSION3DPA-OMP.cpp b/src/apps/DIFFUSION3DPA-OMP.cpp index e4195e9f6..871c972c5 100644 --- a/src/apps/DIFFUSION3DPA-OMP.cpp +++ b/src/apps/DIFFUSION3DPA-OMP.cpp @@ -18,7 +18,7 @@ namespace rajaperf { namespace apps { -void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { +void DIFFUSION3DPA::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/apps/DIFFUSION3DPA-OMPTarget.cpp b/src/apps/DIFFUSION3DPA-OMPTarget.cpp index 8d3368002..f64bec1c4 100644 --- a/src/apps/DIFFUSION3DPA-OMPTarget.cpp +++ b/src/apps/DIFFUSION3DPA-OMPTarget.cpp @@ 
-19,7 +19,7 @@ namespace rajaperf { namespace apps { -void DIFFUSION3DPA::runOpenMPTargetVariant(VariantID vid) { +void DIFFUSION3DPA::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); switch (vid) { diff --git a/src/apps/DIFFUSION3DPA-Seq.cpp b/src/apps/DIFFUSION3DPA-Seq.cpp index 21a7678ca..9793d648f 100644 --- a/src/apps/DIFFUSION3DPA-Seq.cpp +++ b/src/apps/DIFFUSION3DPA-Seq.cpp @@ -18,7 +18,7 @@ namespace rajaperf { namespace apps { -void DIFFUSION3DPA::runSeqVariant(VariantID vid) { +void DIFFUSION3DPA::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); DIFFUSION3DPA_DATA_SETUP; diff --git a/src/apps/DIFFUSION3DPA.cpp b/src/apps/DIFFUSION3DPA.cpp index a73e4bed0..643d7f987 100644 --- a/src/apps/DIFFUSION3DPA.cpp +++ b/src/apps/DIFFUSION3DPA.cpp @@ -23,10 +23,6 @@ namespace apps DIFFUSION3DPA::DIFFUSION3DPA(const RunParams& params) : KernelBase(rajaperf::Apps_DIFFUSION3DPA, params) { - setDefaultGPUBlockSize( default_gpu_block_size ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - m_NE_default = 15625; setDefaultProblemSize(m_NE_default*DPA_Q1D*DPA_Q1D*DPA_Q1D); @@ -75,7 +71,7 @@ DIFFUSION3DPA::~DIFFUSION3DPA() { } -void DIFFUSION3DPA::setUp(VariantID vid) +void DIFFUSION3DPA::setUp(VariantID vid, size_t /*tid*/) { allocAndInitDataConst(m_B, int(DPA_Q1D*DPA_D1D), Real_type(1.0), vid); @@ -85,12 +81,12 @@ void DIFFUSION3DPA::setUp(VariantID vid) allocAndInitDataConst(m_Y, int(DPA_D1D*DPA_D1D*DPA_D1D*m_NE), Real_type(0.0), vid); } -void DIFFUSION3DPA::updateChecksum(VariantID vid) +void DIFFUSION3DPA::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_Y, DPA_D1D*DPA_D1D*DPA_D1D*m_NE); } -void DIFFUSION3DPA::tearDown(VariantID vid) +void DIFFUSION3DPA::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; diff --git a/src/apps/DIFFUSION3DPA.hpp b/src/apps/DIFFUSION3DPA.hpp index 876d61e4a..8bd8f77b6 100644 --- a/src/apps/DIFFUSION3DPA.hpp +++ b/src/apps/DIFFUSION3DPA.hpp @@ -488,15 +488,15 @@ class DIFFUSION3DPA : public KernelBase ~DIFFUSION3DPA(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); private: static const size_t default_gpu_block_size = DPA_Q1D * DPA_Q1D * DPA_Q1D; diff --git a/src/apps/ENERGY-Cuda.cpp b/src/apps/ENERGY-Cuda.cpp index 6b6c05c09..3d4d98662 100644 --- a/src/apps/ENERGY-Cuda.cpp +++ b/src/apps/ENERGY-Cuda.cpp @@ -268,7 +268,7 @@ void ENERGY::runCudaVariantImpl(VariantID vid) } } -void ENERGY::runCudaVariant(VariantID vid) +void ENERGY::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/apps/ENERGY-Hip.cpp b/src/apps/ENERGY-Hip.cpp index 6639d534e..0551fbf49 100644 --- a/src/apps/ENERGY-Hip.cpp +++ b/src/apps/ENERGY-Hip.cpp @@ -262,7 
+262,7 @@ void ENERGY::runHipVariantImpl(VariantID vid) } } -void ENERGY::runHipVariant(VariantID vid) +void ENERGY::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/apps/ENERGY-OMP.cpp b/src/apps/ENERGY-OMP.cpp index 531da1e18..b839e4624 100644 --- a/src/apps/ENERGY-OMP.cpp +++ b/src/apps/ENERGY-OMP.cpp @@ -18,7 +18,7 @@ namespace apps { -void ENERGY::runOpenMPVariant(VariantID vid) +void ENERGY::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/apps/ENERGY-OMPTarget.cpp b/src/apps/ENERGY-OMPTarget.cpp index a9b709ddd..a57d7ea52 100644 --- a/src/apps/ENERGY-OMPTarget.cpp +++ b/src/apps/ENERGY-OMPTarget.cpp @@ -65,7 +65,7 @@ namespace apps deallocOpenMPDeviceData(qq_old, did); \ deallocOpenMPDeviceData(vnewc, did); -void ENERGY::runOpenMPTargetVariant(VariantID vid) +void ENERGY::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/apps/ENERGY-Seq.cpp b/src/apps/ENERGY-Seq.cpp index 7f13c9805..35bd92c15 100644 --- a/src/apps/ENERGY-Seq.cpp +++ b/src/apps/ENERGY-Seq.cpp @@ -18,7 +18,7 @@ namespace apps { -void ENERGY::runSeqVariant(VariantID vid) +void ENERGY::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/apps/ENERGY.cpp b/src/apps/ENERGY.cpp index 7d481f1ca..7c3b9b0f4 100644 --- a/src/apps/ENERGY.cpp +++ b/src/apps/ENERGY.cpp @@ -21,10 +21,6 @@ namespace apps ENERGY::ENERGY(const RunParams& params) : KernelBase(rajaperf::Apps_ENERGY, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - setDefaultProblemSize(1000000); setDefaultReps(130); @@ -72,7 +68,7 @@ ENERGY::~ENERGY() { } -void ENERGY::setUp(VariantID vid) +void ENERGY::setUp(VariantID vid, size_t /*tid*/) { allocAndInitDataConst(m_e_new, getActualProblemSize(), 0.0, vid); allocAndInitData(m_e_old, getActualProblemSize(), vid); @@ -96,13 +92,13 @@ void ENERGY::setUp(VariantID vid) initData(m_q_cut); } -void ENERGY::updateChecksum(VariantID vid) +void ENERGY::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_e_new, getActualProblemSize()); checksum[vid] += calcChecksum(m_q_new, getActualProblemSize()); } -void ENERGY::tearDown(VariantID vid) +void ENERGY::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; diff --git a/src/apps/ENERGY.hpp b/src/apps/ENERGY.hpp index d9443e4a1..4601f808a 100644 --- a/src/apps/ENERGY.hpp +++ b/src/apps/ENERGY.hpp @@ -194,17 +194,18 @@ class ENERGY : public KernelBase ~ENERGY(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/apps/FIR-Cuda.cpp b/src/apps/FIR-Cuda.cpp index e739817b2..5e22b1ad1 100644 --- a/src/apps/FIR-Cuda.cpp +++ b/src/apps/FIR-Cuda.cpp @@ -147,7 +147,7 @@ void FIR::runCudaVariantImpl(VariantID vid) } } -void FIR::runCudaVariant(VariantID vid) +void FIR::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/apps/FIR-Hip.cpp b/src/apps/FIR-Hip.cpp index b933ef74d..066a22368 100644 --- a/src/apps/FIR-Hip.cpp +++ b/src/apps/FIR-Hip.cpp @@ -147,7 +147,7 @@ void FIR::runHipVariantImpl(VariantID vid) } } -void FIR::runHipVariant(VariantID vid) +void FIR::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/apps/FIR-OMP.cpp b/src/apps/FIR-OMP.cpp index 8f011b920..b30c41436 100644 --- a/src/apps/FIR-OMP.cpp +++ b/src/apps/FIR-OMP.cpp @@ -19,7 +19,7 @@ namespace apps { -void FIR::runOpenMPVariant(VariantID vid) +void FIR::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/apps/FIR-OMPTarget.cpp b/src/apps/FIR-OMPTarget.cpp index 0306d8378..104f78d48 100644 --- a/src/apps/FIR-OMPTarget.cpp +++ b/src/apps/FIR-OMPTarget.cpp @@ -46,7 +46,7 @@ namespace apps deallocOpenMPDeviceData(coeff, did); -void FIR::runOpenMPTargetVariant(VariantID vid) +void FIR::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/apps/FIR-Seq.cpp 
b/src/apps/FIR-Seq.cpp index 27d2789ad..1c8852d0e 100644 --- a/src/apps/FIR-Seq.cpp +++ b/src/apps/FIR-Seq.cpp @@ -19,7 +19,7 @@ namespace apps { -void FIR::runSeqVariant(VariantID vid) +void FIR::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/apps/FIR.cpp b/src/apps/FIR.cpp index d28e3c587..022175d63 100644 --- a/src/apps/FIR.cpp +++ b/src/apps/FIR.cpp @@ -21,10 +21,6 @@ namespace apps FIR::FIR(const RunParams& params) : KernelBase(rajaperf::Apps_FIR, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - setDefaultProblemSize(1000000); setDefaultReps(160); @@ -66,18 +62,18 @@ FIR::~FIR() { } -void FIR::setUp(VariantID vid) +void FIR::setUp(VariantID vid, size_t /*tid*/) { allocAndInitData(m_in, getActualProblemSize(), vid); allocAndInitDataConst(m_out, getActualProblemSize(), 0.0, vid); } -void FIR::updateChecksum(VariantID vid) +void FIR::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_out, getActualProblemSize(), checksum_scale_factor ); } -void FIR::tearDown(VariantID vid) +void FIR::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; diff --git a/src/apps/FIR.hpp b/src/apps/FIR.hpp index 3b798c6a9..968b98f47 100644 --- a/src/apps/FIR.hpp +++ b/src/apps/FIR.hpp @@ -69,17 +69,18 @@ class FIR : public KernelBase ~FIR(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/apps/HALOEXCHANGE-Cuda.cpp b/src/apps/HALOEXCHANGE-Cuda.cpp index cee429834..36e76e5b9 100644 --- a/src/apps/HALOEXCHANGE-Cuda.cpp +++ b/src/apps/HALOEXCHANGE-Cuda.cpp @@ -169,7 +169,7 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) } } -void HALOEXCHANGE::runCudaVariant(VariantID vid) +void HALOEXCHANGE::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/apps/HALOEXCHANGE-Hip.cpp b/src/apps/HALOEXCHANGE-Hip.cpp index 587d6d751..ec3386859 100644 --- a/src/apps/HALOEXCHANGE-Hip.cpp +++ b/src/apps/HALOEXCHANGE-Hip.cpp @@ -171,7 +171,7 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid) } } -void HALOEXCHANGE::runHipVariant(VariantID vid) +void HALOEXCHANGE::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/apps/HALOEXCHANGE-OMP.cpp b/src/apps/HALOEXCHANGE-OMP.cpp index 7cac3ca3c..8bac49192 100644 --- 
a/src/apps/HALOEXCHANGE-OMP.cpp +++ b/src/apps/HALOEXCHANGE-OMP.cpp @@ -18,7 +18,7 @@ namespace apps { -void HALOEXCHANGE::runOpenMPVariant(VariantID vid) +void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/apps/HALOEXCHANGE-OMPTarget.cpp b/src/apps/HALOEXCHANGE-OMPTarget.cpp index 22fe54522..d6e7da2d9 100644 --- a/src/apps/HALOEXCHANGE-OMPTarget.cpp +++ b/src/apps/HALOEXCHANGE-OMPTarget.cpp @@ -51,7 +51,7 @@ namespace apps } -void HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid) +void HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); diff --git a/src/apps/HALOEXCHANGE-Seq.cpp b/src/apps/HALOEXCHANGE-Seq.cpp index 7a5ae5e17..f7bfc62dd 100644 --- a/src/apps/HALOEXCHANGE-Seq.cpp +++ b/src/apps/HALOEXCHANGE-Seq.cpp @@ -18,7 +18,7 @@ namespace apps { -void HALOEXCHANGE::runSeqVariant(VariantID vid) +void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); diff --git a/src/apps/HALOEXCHANGE.cpp b/src/apps/HALOEXCHANGE.cpp index be1f12a3b..b3ce4a434 100644 --- a/src/apps/HALOEXCHANGE.cpp +++ b/src/apps/HALOEXCHANGE.cpp @@ -44,10 +44,6 @@ void destroy_unpack_lists(std::vector& unpack_index_lists, HALOEXCHANGE::HALOEXCHANGE(const RunParams& params) : KernelBase(rajaperf::Apps_HALOEXCHANGE, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - m_grid_dims_default[0] = 100; m_grid_dims_default[1] = 100; m_grid_dims_default[2] = 100; @@ -108,7 +104,7 @@ HALOEXCHANGE::~HALOEXCHANGE() { } -void HALOEXCHANGE::setUp(VariantID vid) +void HALOEXCHANGE::setUp(VariantID vid, size_t /*tid*/) { m_vars.resize(m_num_vars, nullptr); for (Index_type v = 0; v < m_num_vars; ++v) { @@ -136,14 +132,14 @@ void HALOEXCHANGE::setUp(VariantID vid) } } -void HALOEXCHANGE::updateChecksum(VariantID vid) +void HALOEXCHANGE::updateChecksum(VariantID vid, size_t tid) { for (Real_ptr var : m_vars) { checksum[vid] += calcChecksum(var, m_var_size); } } -void HALOEXCHANGE::tearDown(VariantID vid) +void HALOEXCHANGE::tearDown(VariantID vid, size_t /*tid*/) { for (int l = 0; l < s_num_neighbors; ++l) { deallocData(m_buffers[l]); diff --git a/src/apps/HALOEXCHANGE.hpp b/src/apps/HALOEXCHANGE.hpp index 8d833344c..a42c2851f 100644 --- a/src/apps/HALOEXCHANGE.hpp +++ b/src/apps/HALOEXCHANGE.hpp @@ -84,17 +84,18 @@ class HALOEXCHANGE : public KernelBase ~HALOEXCHANGE(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID 
vid); template < size_t block_size > diff --git a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp index 4c99a5e7f..a9dc12b6f 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp @@ -270,7 +270,7 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) } } -void HALOEXCHANGE_FUSED::runCudaVariant(VariantID vid) +void HALOEXCHANGE_FUSED::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp index 0b82b961a..15c6fc19a 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp @@ -273,7 +273,7 @@ void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) } } -void HALOEXCHANGE_FUSED::runHipVariant(VariantID vid) +void HALOEXCHANGE_FUSED::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/apps/HALOEXCHANGE_FUSED-OMP.cpp b/src/apps/HALOEXCHANGE_FUSED-OMP.cpp index 0654cd6dc..ad0af8060 100644 --- a/src/apps/HALOEXCHANGE_FUSED-OMP.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-OMP.cpp @@ -18,7 +18,7 @@ namespace apps { -void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid) +void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp b/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp index 77323855c..5a886bd35 100644 --- a/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp @@ -87,7 +87,7 @@ namespace apps delete[] h_unpack_ptrs; -void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid) +void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); diff --git a/src/apps/HALOEXCHANGE_FUSED-Seq.cpp b/src/apps/HALOEXCHANGE_FUSED-Seq.cpp index 80eab8629..022f9224a 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Seq.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Seq.cpp @@ -18,7 +18,7 @@ namespace apps { -void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid) +void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); diff --git a/src/apps/HALOEXCHANGE_FUSED.cpp b/src/apps/HALOEXCHANGE_FUSED.cpp index 89fd84a38..feb472728 100644 --- a/src/apps/HALOEXCHANGE_FUSED.cpp +++ b/src/apps/HALOEXCHANGE_FUSED.cpp @@ -44,10 +44,6 @@ void destroy_unpack_lists(std::vector& unpack_index_lists, HALOEXCHANGE_FUSED::HALOEXCHANGE_FUSED(const RunParams& params) : KernelBase(rajaperf::Apps_HALOEXCHANGE_FUSED, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - m_grid_dims_default[0] = 100; m_grid_dims_default[1] = 100; m_grid_dims_default[2] = 100; @@ -108,7 +104,7 @@ HALOEXCHANGE_FUSED::~HALOEXCHANGE_FUSED() { } -void HALOEXCHANGE_FUSED::setUp(VariantID vid) +void HALOEXCHANGE_FUSED::setUp(VariantID vid, size_t /*tid*/) { m_vars.resize(m_num_vars, nullptr); for (Index_type v = 0; v < m_num_vars; ++v) { @@ -136,14 +132,14 @@ void HALOEXCHANGE_FUSED::setUp(VariantID vid) } } -void HALOEXCHANGE_FUSED::updateChecksum(VariantID vid) +void HALOEXCHANGE_FUSED::updateChecksum(VariantID vid, size_t tid) { for (Real_ptr var : m_vars) { checksum[vid] += calcChecksum(var, m_var_size); } } -void HALOEXCHANGE_FUSED::tearDown(VariantID vid) +void HALOEXCHANGE_FUSED::tearDown(VariantID vid, size_t /*tid*/) { for (int l = 0; l < s_num_neighbors; ++l) { deallocData(m_buffers[l]); diff --git a/src/apps/HALOEXCHANGE_FUSED.hpp b/src/apps/HALOEXCHANGE_FUSED.hpp index 5eba35edf..2a0d24689 100644 --- a/src/apps/HALOEXCHANGE_FUSED.hpp +++ b/src/apps/HALOEXCHANGE_FUSED.hpp @@ -128,17 +128,18 @@ class HALOEXCHANGE_FUSED : public KernelBase ~HALOEXCHANGE_FUSED(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/apps/LTIMES-Cuda.cpp b/src/apps/LTIMES-Cuda.cpp index 8f2f27a93..fc64211a1 100644 --- a/src/apps/LTIMES-Cuda.cpp +++ b/src/apps/LTIMES-Cuda.cpp @@ -188,7 +188,7 @@ void LTIMES::runCudaVariantImpl(VariantID vid) } } -void LTIMES::runCudaVariant(VariantID vid) +void LTIMES::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/apps/LTIMES-Hip.cpp b/src/apps/LTIMES-Hip.cpp index 8f1fdeeae..ee2aadba6 100644 --- a/src/apps/LTIMES-Hip.cpp +++ b/src/apps/LTIMES-Hip.cpp @@ -190,7 +190,7 @@ void LTIMES::runHipVariantImpl(VariantID vid) } } -void LTIMES::runHipVariant(VariantID vid) +void LTIMES::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/apps/LTIMES-OMP.cpp b/src/apps/LTIMES-OMP.cpp index 5ba4671a5..d0b24f70f 100644 --- a/src/apps/LTIMES-OMP.cpp +++ b/src/apps/LTIMES-OMP.cpp @@ -18,7 +18,7 @@ namespace apps { -void LTIMES::runOpenMPVariant(VariantID vid) +void LTIMES::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/apps/LTIMES-OMPTarget.cpp b/src/apps/LTIMES-OMPTarget.cpp index 656900895..4a5d81ac1 100644 --- a/src/apps/LTIMES-OMPTarget.cpp +++ b/src/apps/LTIMES-OMPTarget.cpp @@ -36,7 +36,7 @@ namespace apps 
deallocOpenMPDeviceData(psidat, did); -void LTIMES::runOpenMPTargetVariant(VariantID vid) +void LTIMES::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); diff --git a/src/apps/LTIMES-Seq.cpp b/src/apps/LTIMES-Seq.cpp index efa06701e..00ac10503 100644 --- a/src/apps/LTIMES-Seq.cpp +++ b/src/apps/LTIMES-Seq.cpp @@ -18,7 +18,7 @@ namespace apps { -void LTIMES::runSeqVariant(VariantID vid) +void LTIMES::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); diff --git a/src/apps/LTIMES.cpp b/src/apps/LTIMES.cpp index 3ef94b4fd..9b67a0aaf 100644 --- a/src/apps/LTIMES.cpp +++ b/src/apps/LTIMES.cpp @@ -23,10 +23,6 @@ namespace apps LTIMES::LTIMES(const RunParams& params) : KernelBase(rajaperf::Apps_LTIMES, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - m_num_d_default = 64; m_num_z_default = 488; m_num_g_default = 32; @@ -87,19 +83,19 @@ LTIMES::~LTIMES() { } -void LTIMES::setUp(VariantID vid) +void LTIMES::setUp(VariantID vid, size_t /*tid*/) { allocAndInitDataConst(m_phidat, int(m_philen), Real_type(0.0), vid); allocAndInitData(m_elldat, int(m_elllen), vid); allocAndInitData(m_psidat, int(m_psilen), vid); } -void LTIMES::updateChecksum(VariantID vid) +void LTIMES::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_phidat, m_philen, checksum_scale_factor ); } -void LTIMES::tearDown(VariantID vid) +void LTIMES::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; diff --git a/src/apps/LTIMES.hpp b/src/apps/LTIMES.hpp index b0b308397..f3b0dd9cc 100644 --- a/src/apps/LTIMES.hpp +++ b/src/apps/LTIMES.hpp @@ -107,17 +107,18 @@ class LTIMES : public KernelBase ~LTIMES(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/apps/LTIMES_NOVIEW-Cuda.cpp b/src/apps/LTIMES_NOVIEW-Cuda.cpp index c4c69f28e..9efef1235 100644 --- a/src/apps/LTIMES_NOVIEW-Cuda.cpp +++ b/src/apps/LTIMES_NOVIEW-Cuda.cpp @@ -186,7 +186,7 @@ void LTIMES_NOVIEW::runCudaVariantImpl(VariantID vid) } } -void LTIMES_NOVIEW::runCudaVariant(VariantID vid) +void LTIMES_NOVIEW::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/apps/LTIMES_NOVIEW-Hip.cpp b/src/apps/LTIMES_NOVIEW-Hip.cpp index bd09fa2a3..6d62cd96d 100644 --- a/src/apps/LTIMES_NOVIEW-Hip.cpp +++ b/src/apps/LTIMES_NOVIEW-Hip.cpp @@ -189,7 +189,7 @@ void 
LTIMES_NOVIEW::runHipVariantImpl(VariantID vid) } } -void LTIMES_NOVIEW::runHipVariant(VariantID vid) +void LTIMES_NOVIEW::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/apps/LTIMES_NOVIEW-OMP.cpp b/src/apps/LTIMES_NOVIEW-OMP.cpp index 7f6fedca2..737cd626a 100644 --- a/src/apps/LTIMES_NOVIEW-OMP.cpp +++ b/src/apps/LTIMES_NOVIEW-OMP.cpp @@ -18,7 +18,7 @@ namespace apps { -void LTIMES_NOVIEW::runOpenMPVariant(VariantID vid) +void LTIMES_NOVIEW::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/apps/LTIMES_NOVIEW-OMPTarget.cpp b/src/apps/LTIMES_NOVIEW-OMPTarget.cpp index efe3a6ce3..df971449f 100644 --- a/src/apps/LTIMES_NOVIEW-OMPTarget.cpp +++ b/src/apps/LTIMES_NOVIEW-OMPTarget.cpp @@ -36,7 +36,7 @@ namespace apps deallocOpenMPDeviceData(psidat, did); -void LTIMES_NOVIEW::runOpenMPTargetVariant(VariantID vid) +void LTIMES_NOVIEW::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); diff --git a/src/apps/LTIMES_NOVIEW-Seq.cpp b/src/apps/LTIMES_NOVIEW-Seq.cpp index 0f2458ac3..f8ef5b77e 100644 --- a/src/apps/LTIMES_NOVIEW-Seq.cpp +++ b/src/apps/LTIMES_NOVIEW-Seq.cpp @@ -18,7 +18,7 @@ namespace apps { -void LTIMES_NOVIEW::runSeqVariant(VariantID vid) +void LTIMES_NOVIEW::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); diff --git a/src/apps/LTIMES_NOVIEW.cpp b/src/apps/LTIMES_NOVIEW.cpp index 9d3d000e3..1b5c827fc 100644 --- a/src/apps/LTIMES_NOVIEW.cpp +++ b/src/apps/LTIMES_NOVIEW.cpp @@ -23,10 +23,6 @@ namespace apps LTIMES_NOVIEW::LTIMES_NOVIEW(const RunParams& params) : KernelBase(rajaperf::Apps_LTIMES_NOVIEW, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - m_num_d_default = 64; m_num_z_default = 488; m_num_g_default = 32; @@ -86,19 +82,19 @@ LTIMES_NOVIEW::~LTIMES_NOVIEW() { } -void LTIMES_NOVIEW::setUp(VariantID vid) +void LTIMES_NOVIEW::setUp(VariantID vid, size_t /*tid*/) { allocAndInitDataConst(m_phidat, int(m_philen), Real_type(0.0), vid); allocAndInitData(m_elldat, int(m_elllen), vid); allocAndInitData(m_psidat, int(m_psilen), vid); } -void LTIMES_NOVIEW::updateChecksum(VariantID vid) +void LTIMES_NOVIEW::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_phidat, m_philen, checksum_scale_factor ); } -void LTIMES_NOVIEW::tearDown(VariantID vid) +void LTIMES_NOVIEW::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; diff --git a/src/apps/LTIMES_NOVIEW.hpp b/src/apps/LTIMES_NOVIEW.hpp index a5d1fc908..7657b22a6 100644 --- a/src/apps/LTIMES_NOVIEW.hpp +++ b/src/apps/LTIMES_NOVIEW.hpp @@ -57,17 +57,18 @@ class LTIMES_NOVIEW : public KernelBase ~LTIMES_NOVIEW(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/apps/MASS3DPA-Cuda.cpp b/src/apps/MASS3DPA-Cuda.cpp index 036fbe882..ecbcce255 100644 --- a/src/apps/MASS3DPA-Cuda.cpp +++ b/src/apps/MASS3DPA-Cuda.cpp @@ -102,7 +102,7 @@ __global__ void Mass3DPA(Index_type NE, const Real_ptr B, const Real_ptr Bt, } } -void MASS3DPA::runCudaVariant(VariantID vid) { +void MASS3DPA::runCudaVariant(VariantID vid, size_t tid) { const Index_type run_reps = getRunReps(); MASS3DPA_DATA_SETUP; diff --git a/src/apps/MASS3DPA-Hip.cpp b/src/apps/MASS3DPA-Hip.cpp index c820bedeb..a960f4996 100644 --- a/src/apps/MASS3DPA-Hip.cpp +++ b/src/apps/MASS3DPA-Hip.cpp @@ -102,7 +102,7 @@ __global__ void Mass3DPA(Index_type NE, const Real_ptr B, const Real_ptr Bt, } } -void MASS3DPA::runHipVariant(VariantID vid) { +void MASS3DPA::runHipVariant(VariantID vid, size_t tid) { const Index_type run_reps = getRunReps(); MASS3DPA_DATA_SETUP; diff --git a/src/apps/MASS3DPA-OMP.cpp b/src/apps/MASS3DPA-OMP.cpp index 95342832c..e359c82c2 100644 --- a/src/apps/MASS3DPA-OMP.cpp +++ b/src/apps/MASS3DPA-OMP.cpp @@ -19,7 +19,7 @@ namespace rajaperf { namespace apps { -void MASS3DPA::runOpenMPVariant(VariantID vid) { +void MASS3DPA::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/apps/MASS3DPA-OMPTarget.cpp b/src/apps/MASS3DPA-OMPTarget.cpp index 3ba732420..a25dc654b 100644 --- a/src/apps/MASS3DPA-OMPTarget.cpp +++ b/src/apps/MASS3DPA-OMPTarget.cpp @@ -20,7 +20,7 @@ namespace rajaperf { namespace apps { -void MASS3DPA::runOpenMPTargetVariant(VariantID vid) { +void 
MASS3DPA::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); switch (vid) { diff --git a/src/apps/MASS3DPA-Seq.cpp b/src/apps/MASS3DPA-Seq.cpp index 710826042..7f0882d43 100644 --- a/src/apps/MASS3DPA-Seq.cpp +++ b/src/apps/MASS3DPA-Seq.cpp @@ -19,7 +19,7 @@ namespace rajaperf { namespace apps { -void MASS3DPA::runSeqVariant(VariantID vid) { +void MASS3DPA::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); MASS3DPA_DATA_SETUP; diff --git a/src/apps/MASS3DPA.cpp b/src/apps/MASS3DPA.cpp index 70f21f9e1..6614d64c8 100644 --- a/src/apps/MASS3DPA.cpp +++ b/src/apps/MASS3DPA.cpp @@ -23,10 +23,6 @@ namespace apps MASS3DPA::MASS3DPA(const RunParams& params) : KernelBase(rajaperf::Apps_MASS3DPA, params) { - setDefaultGPUBlockSize( default_gpu_block_size ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - m_NE_default = 8000; setDefaultProblemSize(m_NE_default*MPA_Q1D*MPA_Q1D*MPA_Q1D); @@ -71,7 +67,7 @@ MASS3DPA::~MASS3DPA() { } -void MASS3DPA::setUp(VariantID vid) +void MASS3DPA::setUp(VariantID vid, size_t /*tid*/) { allocAndInitDataConst(m_B, int(MPA_Q1D*MPA_D1D), Real_type(1.0), vid); @@ -81,12 +77,12 @@ void MASS3DPA::setUp(VariantID vid) allocAndInitDataConst(m_Y, int(MPA_D1D*MPA_D1D*MPA_D1D*m_NE), Real_type(0.0), vid); } -void MASS3DPA::updateChecksum(VariantID vid) +void MASS3DPA::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_Y, MPA_D1D*MPA_D1D*MPA_D1D*m_NE); } -void MASS3DPA::tearDown(VariantID vid) +void MASS3DPA::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; diff --git a/src/apps/MASS3DPA.hpp b/src/apps/MASS3DPA.hpp index c94b19fbe..dce4e55b0 100644 --- a/src/apps/MASS3DPA.hpp +++ b/src/apps/MASS3DPA.hpp @@ -368,15 +368,15 @@ class MASS3DPA : public KernelBase ~MASS3DPA(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); private: static const size_t default_gpu_block_size = MPA_Q1D * MPA_Q1D; diff --git a/src/apps/PRESSURE-Cuda.cpp b/src/apps/PRESSURE-Cuda.cpp index 3420a7262..6c122e2bc 100644 --- a/src/apps/PRESSURE-Cuda.cpp +++ b/src/apps/PRESSURE-Cuda.cpp @@ -136,7 +136,7 @@ void PRESSURE::runCudaVariantImpl(VariantID vid) } } -void PRESSURE::runCudaVariant(VariantID vid) +void PRESSURE::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/apps/PRESSURE-Hip.cpp b/src/apps/PRESSURE-Hip.cpp index 2b4b5a709..13247fb2a 100644 --- a/src/apps/PRESSURE-Hip.cpp +++ b/src/apps/PRESSURE-Hip.cpp @@ -129,7 +129,7 @@ void PRESSURE::runHipVariantImpl(VariantID vid) } } -void PRESSURE::runHipVariant(VariantID vid) +void PRESSURE::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), 
gpu_block_sizes_type()) ) { diff --git a/src/apps/PRESSURE-OMP.cpp b/src/apps/PRESSURE-OMP.cpp index 1393ff89b..55eb56632 100644 --- a/src/apps/PRESSURE-OMP.cpp +++ b/src/apps/PRESSURE-OMP.cpp @@ -18,7 +18,7 @@ namespace apps { -void PRESSURE::runOpenMPVariant(VariantID vid) +void PRESSURE::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/apps/PRESSURE-OMPTarget.cpp b/src/apps/PRESSURE-OMPTarget.cpp index 90212cb30..5feb96c8b 100644 --- a/src/apps/PRESSURE-OMPTarget.cpp +++ b/src/apps/PRESSURE-OMPTarget.cpp @@ -45,7 +45,7 @@ namespace apps deallocOpenMPDeviceData(vnewc, did); -void PRESSURE::runOpenMPTargetVariant(VariantID vid) +void PRESSURE::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/apps/PRESSURE-Seq.cpp b/src/apps/PRESSURE-Seq.cpp index 1b56941a3..4068815af 100644 --- a/src/apps/PRESSURE-Seq.cpp +++ b/src/apps/PRESSURE-Seq.cpp @@ -18,7 +18,7 @@ namespace apps { -void PRESSURE::runSeqVariant(VariantID vid) +void PRESSURE::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/apps/PRESSURE.cpp b/src/apps/PRESSURE.cpp index 3d6e34372..ab5721111 100644 --- a/src/apps/PRESSURE.cpp +++ b/src/apps/PRESSURE.cpp @@ -21,10 +21,6 @@ namespace apps PRESSURE::PRESSURE(const RunParams& params) : KernelBase(rajaperf::Apps_PRESSURE, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - setDefaultProblemSize(1000000); setDefaultReps(700); @@ -62,7 +58,7 @@ PRESSURE::~PRESSURE() { } -void PRESSURE::setUp(VariantID vid) +void PRESSURE::setUp(VariantID vid, size_t /*tid*/) { allocAndInitData(m_compression, getActualProblemSize(), vid); allocAndInitData(m_bvc, getActualProblemSize(), vid); @@ -76,12 +72,12 @@ void PRESSURE::setUp(VariantID vid) initData(m_eosvmax); } -void PRESSURE::updateChecksum(VariantID vid) +void PRESSURE::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_p_new, getActualProblemSize()); } -void PRESSURE::tearDown(VariantID vid) +void PRESSURE::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; diff --git a/src/apps/PRESSURE.hpp b/src/apps/PRESSURE.hpp index 032ec70ab..f7cfd14d5 100644 --- a/src/apps/PRESSURE.hpp +++ b/src/apps/PRESSURE.hpp @@ -63,17 +63,18 @@ class PRESSURE : public KernelBase ~PRESSURE(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size 
> diff --git a/src/apps/VOL3D-Cuda.cpp b/src/apps/VOL3D-Cuda.cpp index 98664cea6..059988124 100644 --- a/src/apps/VOL3D-Cuda.cpp +++ b/src/apps/VOL3D-Cuda.cpp @@ -123,7 +123,7 @@ void VOL3D::runCudaVariantImpl(VariantID vid) } } -void VOL3D::runCudaVariant(VariantID vid) +void VOL3D::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/apps/VOL3D-Hip.cpp b/src/apps/VOL3D-Hip.cpp index 69d7bbccb..00211a181 100644 --- a/src/apps/VOL3D-Hip.cpp +++ b/src/apps/VOL3D-Hip.cpp @@ -123,7 +123,7 @@ void VOL3D::runHipVariantImpl(VariantID vid) } } -void VOL3D::runHipVariant(VariantID vid) +void VOL3D::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/apps/VOL3D-OMP.cpp b/src/apps/VOL3D-OMP.cpp index 276fda6ee..850980bf6 100644 --- a/src/apps/VOL3D-OMP.cpp +++ b/src/apps/VOL3D-OMP.cpp @@ -20,7 +20,7 @@ namespace apps { -void VOL3D::runOpenMPVariant(VariantID vid) +void VOL3D::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/apps/VOL3D-OMPTarget.cpp b/src/apps/VOL3D-OMPTarget.cpp index 38771ae62..7737c011b 100644 --- a/src/apps/VOL3D-OMPTarget.cpp +++ b/src/apps/VOL3D-OMPTarget.cpp @@ -45,7 +45,7 @@ namespace apps deallocOpenMPDeviceData(vol, did); -void VOL3D::runOpenMPTargetVariant(VariantID vid) +void VOL3D::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = m_domain->fpz; diff --git a/src/apps/VOL3D-Seq.cpp b/src/apps/VOL3D-Seq.cpp index f2fec0ef8..493c394a0 100644 --- a/src/apps/VOL3D-Seq.cpp +++ b/src/apps/VOL3D-Seq.cpp @@ -20,7 +20,7 @@ namespace apps { -void VOL3D::runSeqVariant(VariantID vid) +void VOL3D::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = m_domain->fpz; diff --git a/src/apps/VOL3D.cpp b/src/apps/VOL3D.cpp index db2c59b32..494c5f0da 100644 --- a/src/apps/VOL3D.cpp +++ b/src/apps/VOL3D.cpp @@ -25,10 +25,6 @@ namespace apps VOL3D::VOL3D(const RunParams& params) : KernelBase(rajaperf::Apps_VOL3D, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - setDefaultProblemSize(100*100*100); // See rzmax in ADomain struct setDefaultReps(100); @@ -75,7 +71,7 @@ VOL3D::~VOL3D() delete m_domain; } -void VOL3D::setUp(VariantID vid) +void VOL3D::setUp(VariantID vid, size_t /*tid*/) { allocAndInitDataConst(m_x, m_array_length, 0.0, vid); allocAndInitDataConst(m_y, m_array_length, 0.0, vid); @@ -91,12 +87,12 @@ void VOL3D::setUp(VariantID vid) m_vnormq = 0.083333333333333333; /* vnormq = 1/12 */ } -void VOL3D::updateChecksum(VariantID vid) +void VOL3D::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_vol, m_array_length, checksum_scale_factor ); } -void VOL3D::tearDown(VariantID vid) +void VOL3D::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; diff --git a/src/apps/VOL3D.hpp b/src/apps/VOL3D.hpp index dae03eb77..d9f58be6b 100644 --- a/src/apps/VOL3D.hpp +++ b/src/apps/VOL3D.hpp @@ -160,17 +160,18 @@ class VOL3D : public KernelBase ~VOL3D(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/apps/WIP-COUPLE.cpp b/src/apps/WIP-COUPLE.cpp index b849f70f9..a53057d63 100644 --- a/src/apps/WIP-COUPLE.cpp +++ b/src/apps/WIP-COUPLE.cpp @@ -58,7 +58,7 @@ COUPLE::~COUPLE() delete m_domain; } -void COUPLE::setUp(VariantID vid) +void COUPLE::setUp(VariantID vid, size_t /*tid*/) { Index_type max_loop_index = m_domain->lrn; @@ -179,7 +179,7 @@ void COUPLE::runKernel(VariantID vid) } } -void COUPLE::updateChecksum(VariantID vid) +void COUPLE::updateChecksum(VariantID vid, size_t tid) { Index_type max_loop_index = m_domain->lrn; @@ -188,7 +188,7 @@ void COUPLE::updateChecksum(VariantID vid) checksum[vid] += calcChecksum(m_t2, max_loop_index); } -void COUPLE::tearDown(VariantID vid) +void COUPLE::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; diff --git a/src/apps/WIP-COUPLE.hpp b/src/apps/WIP-COUPLE.hpp index e5040ea57..c0aa77963 100644 --- a/src/apps/WIP-COUPLE.hpp +++ b/src/apps/WIP-COUPLE.hpp @@ -161,16 +161,16 @@ class COUPLE : public KernelBase ~COUPLE(); - void setUp(VariantID vid); + void setUp(VariantID vid, size_t tid); void runKernel(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid) {(void) vid;} - void runOpenMPVariant(VariantID vid) {(void) vid;} - void runCudaVariant(VariantID vid) {(void) vid;} - void runHipVariant(VariantID vid) {(void) vid;} - void runOpenMPTargetVariant(VariantID vid) {(void) vid;} + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid) {(void) vid;} + void runOpenMPVariant(VariantID vid, 
size_t tid) {(void) vid;} + void runCudaVariant(VariantID vid, size_t tid) {(void) vid;} + void runHipVariant(VariantID vid, size_t tid) {(void) vid;} + void runOpenMPTargetVariant(VariantID vid, size_t tid) {(void) vid;} private: Complex_ptr m_t0; diff --git a/src/basic/DAXPY-Hip.cpp b/src/basic/DAXPY-Hip.cpp index 691d40068..386fc27eb 100644 --- a/src/basic/DAXPY-Hip.cpp +++ b/src/basic/DAXPY-Hip.cpp @@ -113,7 +113,7 @@ void DAXPY::runHipVariantImpl(VariantID vid) } } -void DAXPY::runHipVariant(VariantID vid) +void DAXPY::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/basic/DAXPY-OMP.cpp b/src/basic/DAXPY-OMP.cpp index f28c83c7b..ca83b17ae 100644 --- a/src/basic/DAXPY-OMP.cpp +++ b/src/basic/DAXPY-OMP.cpp @@ -18,7 +18,7 @@ namespace basic { -void DAXPY::runOpenMPVariant(VariantID vid) +void DAXPY::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/DAXPY-OMPTarget.cpp b/src/basic/DAXPY-OMPTarget.cpp index 930438bbc..b5d923d5c 100644 --- a/src/basic/DAXPY-OMPTarget.cpp +++ b/src/basic/DAXPY-OMPTarget.cpp @@ -39,7 +39,7 @@ namespace basic deallocOpenMPDeviceData(y, did); -void DAXPY::runOpenMPTargetVariant(VariantID vid) +void DAXPY::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/DAXPY-Seq.cpp b/src/basic/DAXPY-Seq.cpp index 2eb2fc690..69a0d8440 100644 --- a/src/basic/DAXPY-Seq.cpp +++ b/src/basic/DAXPY-Seq.cpp @@ -18,7 +18,7 @@ namespace basic { -void DAXPY::runSeqVariant(VariantID vid) +void DAXPY::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/DAXPY.cpp b/src/basic/DAXPY.cpp index 578b52f42..e3613ee34 100644 --- a/src/basic/DAXPY.cpp +++ b/src/basic/DAXPY.cpp @@ -57,7 +57,7 @@ DAXPY::~DAXPY() { } -void DAXPY::setUp(VariantID vid) +void DAXPY::setUp(VariantID vid, size_t /*tid*/) { allocAndInitDataConst(m_y, getActualProblemSize(), 0.0, vid); allocAndInitData(m_x, getActualProblemSize(), vid); @@ -69,7 +69,7 @@ void DAXPY::updateChecksum(VariantID vid, size_t tid) checksum[vid].at(tid) += calcChecksum(m_y, getActualProblemSize()); } -void DAXPY::tearDown(VariantID vid) +void DAXPY::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_x); diff --git a/src/basic/DAXPY.hpp b/src/basic/DAXPY.hpp index 4ee25ab6b..e3f2d264a 100644 --- a/src/basic/DAXPY.hpp +++ b/src/basic/DAXPY.hpp @@ -43,9 +43,9 @@ class DAXPY : public KernelBase ~DAXPY(); - void setUp(VariantID vid); + void setUp(VariantID vid, size_t tid); void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid); + void tearDown(VariantID vid, size_t tid); void runSeqVariant(VariantID vid, size_t tid); void runOpenMPVariant(VariantID vid, size_t tid); @@ -54,6 +54,7 @@ class DAXPY : public KernelBase void runOpenMPTargetVariant(VariantID vid, size_t tid); void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/basic/IF_QUAD-Cuda.cpp b/src/basic/IF_QUAD-Cuda.cpp index aeac99d1f..adc13e08d 100644 --- a/src/basic/IF_QUAD-Cuda.cpp +++ b/src/basic/IF_QUAD-Cuda.cpp @@ -117,7 +117,7 @@ void IF_QUAD::runCudaVariantImpl(VariantID vid) } } -void 
IF_QUAD::runCudaVariant(VariantID vid) +void IF_QUAD::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/basic/IF_QUAD-Hip.cpp b/src/basic/IF_QUAD-Hip.cpp index b9a580fa2..9b0c8e49b 100644 --- a/src/basic/IF_QUAD-Hip.cpp +++ b/src/basic/IF_QUAD-Hip.cpp @@ -120,7 +120,7 @@ void IF_QUAD::runHipVariantImpl(VariantID vid) } } -void IF_QUAD::runHipVariant(VariantID vid) +void IF_QUAD::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/basic/IF_QUAD-OMP.cpp b/src/basic/IF_QUAD-OMP.cpp index 517814c6c..b17e0f65a 100644 --- a/src/basic/IF_QUAD-OMP.cpp +++ b/src/basic/IF_QUAD-OMP.cpp @@ -18,7 +18,7 @@ namespace basic { -void IF_QUAD::runOpenMPVariant(VariantID vid) +void IF_QUAD::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/IF_QUAD-OMPTarget.cpp b/src/basic/IF_QUAD-OMPTarget.cpp index e44711ecb..f965ae2b7 100644 --- a/src/basic/IF_QUAD-OMPTarget.cpp +++ b/src/basic/IF_QUAD-OMPTarget.cpp @@ -45,7 +45,7 @@ namespace basic deallocOpenMPDeviceData(x1, did); \ deallocOpenMPDeviceData(x2, did); -void IF_QUAD::runOpenMPTargetVariant(VariantID vid) +void IF_QUAD::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/IF_QUAD-Seq.cpp b/src/basic/IF_QUAD-Seq.cpp index a5e9a9c6c..1e08bb9c0 100644 --- a/src/basic/IF_QUAD-Seq.cpp +++ b/src/basic/IF_QUAD-Seq.cpp @@ -18,7 +18,7 @@ namespace basic { -void IF_QUAD::runSeqVariant(VariantID vid) +void IF_QUAD::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/IF_QUAD.cpp b/src/basic/IF_QUAD.cpp index ec1c81fc3..1a599931f 100644 --- a/src/basic/IF_QUAD.cpp +++ b/src/basic/IF_QUAD.cpp @@ -21,10 +21,6 @@ namespace basic IF_QUAD::IF_QUAD(const RunParams& params) : KernelBase(rajaperf::Basic_IF_QUAD, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - setDefaultProblemSize(1000000); setDefaultReps(180); @@ -65,7 +61,7 @@ IF_QUAD::~IF_QUAD() { } -void IF_QUAD::setUp(VariantID vid) +void IF_QUAD::setUp(VariantID vid, size_t /*tid*/) { allocAndInitDataRandSign(m_a, getActualProblemSize(), vid); allocAndInitData(m_b, getActualProblemSize(), vid); @@ -74,13 +70,13 @@ void IF_QUAD::setUp(VariantID vid) allocAndInitDataConst(m_x2, getActualProblemSize(), 0.0, vid); } -void IF_QUAD::updateChecksum(VariantID vid) +void IF_QUAD::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_x1, getActualProblemSize(), checksum_scale_factor ); checksum[vid] += calcChecksum(m_x2, getActualProblemSize(), checksum_scale_factor ); } -void IF_QUAD::tearDown(VariantID vid) +void IF_QUAD::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_a); diff --git a/src/basic/IF_QUAD.hpp b/src/basic/IF_QUAD.hpp index 4bcb84b9c..e0d85a25d 100644 --- a/src/basic/IF_QUAD.hpp +++ b/src/basic/IF_QUAD.hpp @@ -60,17 +60,18 @@ class IF_QUAD : public KernelBase ~IF_QUAD(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/basic/INIT3-Cuda.cpp b/src/basic/INIT3-Cuda.cpp index a570f6994..c2f3fa753 100644 --- a/src/basic/INIT3-Cuda.cpp +++ b/src/basic/INIT3-Cuda.cpp @@ -119,7 +119,7 @@ void INIT3::runCudaVariantImpl(VariantID vid) } } -void INIT3::runCudaVariant(VariantID vid) +void INIT3::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/basic/INIT3-Hip.cpp b/src/basic/INIT3-Hip.cpp index 672b1a0a5..c16b1e546 100644 --- a/src/basic/INIT3-Hip.cpp +++ b/src/basic/INIT3-Hip.cpp @@ -121,7 +121,7 @@ void INIT3::runHipVariantImpl(VariantID vid) } } -void INIT3::runHipVariant(VariantID vid) +void INIT3::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/basic/INIT3-OMP.cpp b/src/basic/INIT3-OMP.cpp index b5bd688e0..962b7fe3e 100644 --- a/src/basic/INIT3-OMP.cpp +++ b/src/basic/INIT3-OMP.cpp @@ -18,7 +18,7 @@ namespace basic { -void INIT3::runOpenMPVariant(VariantID vid) +void INIT3::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/INIT3-OMPTarget.cpp b/src/basic/INIT3-OMPTarget.cpp index ff7d69e63..4a7e5084a 100644 --- a/src/basic/INIT3-OMPTarget.cpp +++ b/src/basic/INIT3-OMPTarget.cpp @@ -47,7 +47,7 @@ namespace basic deallocOpenMPDeviceData(in2, did); -void INIT3::runOpenMPTargetVariant(VariantID vid) 
+void INIT3::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/INIT3-Seq.cpp b/src/basic/INIT3-Seq.cpp index 66b8f5a7c..8926b7565 100644 --- a/src/basic/INIT3-Seq.cpp +++ b/src/basic/INIT3-Seq.cpp @@ -18,7 +18,7 @@ namespace basic { -void INIT3::runSeqVariant(VariantID vid) +void INIT3::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/INIT3.cpp b/src/basic/INIT3.cpp index 9cc6bf063..623942044 100644 --- a/src/basic/INIT3.cpp +++ b/src/basic/INIT3.cpp @@ -21,10 +21,6 @@ namespace basic INIT3::INIT3(const RunParams& params) : KernelBase(rajaperf::Basic_INIT3, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - setDefaultProblemSize(1000000); setDefaultReps(500); @@ -61,7 +57,7 @@ INIT3::~INIT3() { } -void INIT3::setUp(VariantID vid) +void INIT3::setUp(VariantID vid, size_t /*tid*/) { allocAndInitDataConst(m_out1, getActualProblemSize(), 0.0, vid); allocAndInitDataConst(m_out2, getActualProblemSize(), 0.0, vid); @@ -70,14 +66,14 @@ void INIT3::setUp(VariantID vid) allocAndInitData(m_in2, getActualProblemSize(), vid); } -void INIT3::updateChecksum(VariantID vid) +void INIT3::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_out1, getActualProblemSize()); checksum[vid] += calcChecksum(m_out2, getActualProblemSize()); checksum[vid] += calcChecksum(m_out3, getActualProblemSize()); } -void INIT3::tearDown(VariantID vid) +void INIT3::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_out1); diff --git a/src/basic/INIT3.hpp b/src/basic/INIT3.hpp index 0db0bbb17..aca934665 100644 --- a/src/basic/INIT3.hpp +++ b/src/basic/INIT3.hpp @@ -46,17 +46,18 @@ class INIT3 : public KernelBase ~INIT3(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/basic/INIT_VIEW1D-Cuda.cpp b/src/basic/INIT_VIEW1D-Cuda.cpp index 719ba30ff..30f964406 100644 --- a/src/basic/INIT_VIEW1D-Cuda.cpp +++ b/src/basic/INIT_VIEW1D-Cuda.cpp @@ -110,7 +110,7 @@ void INIT_VIEW1D::runCudaVariantImpl(VariantID vid) } } -void INIT_VIEW1D::runCudaVariant(VariantID vid) +void INIT_VIEW1D::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/basic/INIT_VIEW1D-Hip.cpp b/src/basic/INIT_VIEW1D-Hip.cpp index 96fd49166..415c66b5d 
100644 --- a/src/basic/INIT_VIEW1D-Hip.cpp +++ b/src/basic/INIT_VIEW1D-Hip.cpp @@ -113,7 +113,7 @@ void INIT_VIEW1D::runHipVariantImpl(VariantID vid) } } -void INIT_VIEW1D::runHipVariant(VariantID vid) +void INIT_VIEW1D::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/basic/INIT_VIEW1D-OMP.cpp b/src/basic/INIT_VIEW1D-OMP.cpp index 42cc23c39..21d4584dc 100644 --- a/src/basic/INIT_VIEW1D-OMP.cpp +++ b/src/basic/INIT_VIEW1D-OMP.cpp @@ -18,7 +18,7 @@ namespace basic { -void INIT_VIEW1D::runOpenMPVariant(VariantID vid) +void INIT_VIEW1D::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/INIT_VIEW1D-OMPTarget.cpp b/src/basic/INIT_VIEW1D-OMPTarget.cpp index 8ec0d5c13..2a0b1400e 100644 --- a/src/basic/INIT_VIEW1D-OMPTarget.cpp +++ b/src/basic/INIT_VIEW1D-OMPTarget.cpp @@ -37,7 +37,7 @@ namespace basic deallocOpenMPDeviceData(a, did); -void INIT_VIEW1D::runOpenMPTargetVariant(VariantID vid) +void INIT_VIEW1D::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/INIT_VIEW1D-Seq.cpp b/src/basic/INIT_VIEW1D-Seq.cpp index 419e1698b..40bd25dbe 100644 --- a/src/basic/INIT_VIEW1D-Seq.cpp +++ b/src/basic/INIT_VIEW1D-Seq.cpp @@ -18,7 +18,7 @@ namespace basic { -void INIT_VIEW1D::runSeqVariant(VariantID vid) +void INIT_VIEW1D::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/INIT_VIEW1D.cpp b/src/basic/INIT_VIEW1D.cpp index f6e257058..fddbeeddf 100644 --- a/src/basic/INIT_VIEW1D.cpp +++ b/src/basic/INIT_VIEW1D.cpp @@ -21,10 +21,6 @@ namespace basic INIT_VIEW1D::INIT_VIEW1D(const RunParams& params) : KernelBase(rajaperf::Basic_INIT_VIEW1D, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - setDefaultProblemSize(1000000); setDefaultReps(2500); @@ -62,18 +58,18 @@ INIT_VIEW1D::~INIT_VIEW1D() { } -void INIT_VIEW1D::setUp(VariantID vid) +void INIT_VIEW1D::setUp(VariantID vid, size_t /*tid*/) { allocAndInitDataConst(m_a, getActualProblemSize(), 0.0, vid); m_val = 0.00000123; } -void INIT_VIEW1D::updateChecksum(VariantID vid) +void INIT_VIEW1D::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_a, getActualProblemSize()); } -void INIT_VIEW1D::tearDown(VariantID vid) +void INIT_VIEW1D::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_a); diff --git a/src/basic/INIT_VIEW1D.hpp b/src/basic/INIT_VIEW1D.hpp index 23f6ede7b..10c3f53c4 100644 --- a/src/basic/INIT_VIEW1D.hpp +++ b/src/basic/INIT_VIEW1D.hpp @@ -57,17 +57,18 @@ class INIT_VIEW1D : public KernelBase ~INIT_VIEW1D(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp b/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp index 407ec672d..fe331db1c 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp @@ -113,7 +113,7 @@ void INIT_VIEW1D_OFFSET::runCudaVariantImpl(VariantID vid) } } -void INIT_VIEW1D_OFFSET::runCudaVariant(VariantID vid) +void INIT_VIEW1D_OFFSET::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp index 941bf25e8..40d061b18 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp @@ -114,7 +114,7 @@ void INIT_VIEW1D_OFFSET::runHipVariantImpl(VariantID vid) } } -void INIT_VIEW1D_OFFSET::runHipVariant(VariantID vid) +void INIT_VIEW1D_OFFSET::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/basic/INIT_VIEW1D_OFFSET-OMP.cpp b/src/basic/INIT_VIEW1D_OFFSET-OMP.cpp index 1e07407aa..4c41bb094 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-OMP.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-OMP.cpp @@ -18,7 +18,7 @@ namespace basic { -void INIT_VIEW1D_OFFSET::runOpenMPVariant(VariantID vid) +void INIT_VIEW1D_OFFSET::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp b/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp index 9b3c3cdae..1afea189f 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp @@ -37,7 
+37,7 @@ namespace basic deallocOpenMPDeviceData(a, did); -void INIT_VIEW1D_OFFSET::runOpenMPTargetVariant(VariantID vid) +void INIT_VIEW1D_OFFSET::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; diff --git a/src/basic/INIT_VIEW1D_OFFSET-Seq.cpp b/src/basic/INIT_VIEW1D_OFFSET-Seq.cpp index f53872d14..c79314231 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Seq.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Seq.cpp @@ -18,7 +18,7 @@ namespace basic { -void INIT_VIEW1D_OFFSET::runSeqVariant(VariantID vid) +void INIT_VIEW1D_OFFSET::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; diff --git a/src/basic/INIT_VIEW1D_OFFSET.cpp b/src/basic/INIT_VIEW1D_OFFSET.cpp index 74f4e68be..9ebc35193 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET.cpp @@ -21,10 +21,6 @@ namespace basic INIT_VIEW1D_OFFSET::INIT_VIEW1D_OFFSET(const RunParams& params) : KernelBase(rajaperf::Basic_INIT_VIEW1D_OFFSET, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - setDefaultProblemSize(1000000); setDefaultReps(2500); @@ -62,18 +58,18 @@ INIT_VIEW1D_OFFSET::~INIT_VIEW1D_OFFSET() { } -void INIT_VIEW1D_OFFSET::setUp(VariantID vid) +void INIT_VIEW1D_OFFSET::setUp(VariantID vid, size_t /*tid*/) { allocAndInitDataConst(m_a, getActualProblemSize(), 0.0, vid); m_val = 0.00000123; } -void INIT_VIEW1D_OFFSET::updateChecksum(VariantID vid) +void INIT_VIEW1D_OFFSET::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_a, getActualProblemSize()); } -void INIT_VIEW1D_OFFSET::tearDown(VariantID vid) +void INIT_VIEW1D_OFFSET::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_a); diff --git a/src/basic/INIT_VIEW1D_OFFSET.hpp b/src/basic/INIT_VIEW1D_OFFSET.hpp index b1b8691fc..3d6feb3a0 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.hpp +++ b/src/basic/INIT_VIEW1D_OFFSET.hpp @@ -56,17 +56,18 @@ class INIT_VIEW1D_OFFSET : public KernelBase ~INIT_VIEW1D_OFFSET(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/basic/MAT_MAT_SHARED-Cuda.cpp b/src/basic/MAT_MAT_SHARED-Cuda.cpp index bd8f52548..7d143a928 100644 --- a/src/basic/MAT_MAT_SHARED-Cuda.cpp +++ b/src/basic/MAT_MAT_SHARED-Cuda.cpp @@ -305,7 +305,7 @@ void MAT_MAT_SHARED::runCudaVariantImpl(VariantID vid) } } -void MAT_MAT_SHARED::runCudaVariant(VariantID vid) +void MAT_MAT_SHARED::runCudaVariant(VariantID vid, 
size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/basic/MAT_MAT_SHARED-Hip.cpp b/src/basic/MAT_MAT_SHARED-Hip.cpp index 3b8af7a97..1e6bb9f24 100644 --- a/src/basic/MAT_MAT_SHARED-Hip.cpp +++ b/src/basic/MAT_MAT_SHARED-Hip.cpp @@ -308,7 +308,7 @@ void MAT_MAT_SHARED::runHipVariantImpl(VariantID vid) } } -void MAT_MAT_SHARED::runHipVariant(VariantID vid) +void MAT_MAT_SHARED::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/basic/MAT_MAT_SHARED-OMP.cpp b/src/basic/MAT_MAT_SHARED-OMP.cpp index e3a301b88..e9d232f37 100644 --- a/src/basic/MAT_MAT_SHARED-OMP.cpp +++ b/src/basic/MAT_MAT_SHARED-OMP.cpp @@ -15,7 +15,7 @@ namespace rajaperf { namespace basic { -void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid) { +void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) const Index_type run_reps = getRunReps(); diff --git a/src/basic/MAT_MAT_SHARED-OMPTarget.cpp b/src/basic/MAT_MAT_SHARED-OMPTarget.cpp index b42b3b9d0..9b65dc204 100644 --- a/src/basic/MAT_MAT_SHARED-OMPTarget.cpp +++ b/src/basic/MAT_MAT_SHARED-OMPTarget.cpp @@ -20,7 +20,7 @@ namespace rajaperf { namespace basic { - void MAT_MAT_SHARED::runOpenMPTargetVariant(VariantID vid) { + void MAT_MAT_SHARED::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); switch (vid) { diff --git a/src/basic/MAT_MAT_SHARED-Seq.cpp b/src/basic/MAT_MAT_SHARED-Seq.cpp index 8d8e9f65f..dd46ea46d 100644 --- a/src/basic/MAT_MAT_SHARED-Seq.cpp +++ b/src/basic/MAT_MAT_SHARED-Seq.cpp @@ -13,7 +13,7 @@ namespace rajaperf { namespace basic { -void MAT_MAT_SHARED::runSeqVariant(VariantID vid) { +void MAT_MAT_SHARED::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type N = m_N; diff --git a/src/basic/MAT_MAT_SHARED.cpp b/src/basic/MAT_MAT_SHARED.cpp index 91f19a42b..82a4a07ed 100644 --- a/src/basic/MAT_MAT_SHARED.cpp +++ b/src/basic/MAT_MAT_SHARED.cpp @@ -20,10 +20,6 @@ namespace basic { MAT_MAT_SHARED::MAT_MAT_SHARED(const RunParams &params) : KernelBase(rajaperf::Basic_MAT_MAT_SHARED, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - m_N_default = 1000; setDefaultProblemSize(m_N_default*m_N_default); setDefaultReps(5); @@ -68,7 +64,7 @@ MAT_MAT_SHARED::MAT_MAT_SHARED(const RunParams &params) MAT_MAT_SHARED::~MAT_MAT_SHARED() {} -void MAT_MAT_SHARED::setUp(VariantID vid) { +void MAT_MAT_SHARED::setUp(VariantID vid, size_t /*tid*/) { const Index_type NN = m_N * m_N; allocAndInitDataConst(m_A, NN, 1.0, vid); @@ -76,11 +72,11 @@ void MAT_MAT_SHARED::setUp(VariantID vid) { allocAndInitDataConst(m_C, NN, 0.0, vid); } -void MAT_MAT_SHARED::updateChecksum(VariantID vid) { +void MAT_MAT_SHARED::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_C, m_N*m_N, checksum_scale_factor ); } -void MAT_MAT_SHARED::tearDown(VariantID vid) { +void MAT_MAT_SHARED::tearDown(VariantID vid, size_t /*tid*/) { (void)vid; deallocData(m_A); deallocData(m_B); diff --git a/src/basic/MAT_MAT_SHARED.hpp b/src/basic/MAT_MAT_SHARED.hpp index 08dfd4ece..b689113dc 100644 --- a/src/basic/MAT_MAT_SHARED.hpp +++ b/src/basic/MAT_MAT_SHARED.hpp @@ -146,17 +146,18 @@ class MAT_MAT_SHARED : public KernelBase { ~MAT_MAT_SHARED(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/basic/MULADDSUB-Cuda.cpp b/src/basic/MULADDSUB-Cuda.cpp index ebd8f1f15..402efdc84 100644 --- a/src/basic/MULADDSUB-Cuda.cpp +++ b/src/basic/MULADDSUB-Cuda.cpp @@ -119,7 +119,7 @@ void MULADDSUB::runCudaVariantImpl(VariantID vid) } } -void MULADDSUB::runCudaVariant(VariantID vid) +void MULADDSUB::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/basic/MULADDSUB-Hip.cpp b/src/basic/MULADDSUB-Hip.cpp index 45b706cfa..dcf579575 100644 --- a/src/basic/MULADDSUB-Hip.cpp +++ b/src/basic/MULADDSUB-Hip.cpp @@ -121,7 +121,7 @@ void MULADDSUB::runHipVariantImpl(VariantID vid) } } -void MULADDSUB::runHipVariant(VariantID vid) +void MULADDSUB::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/basic/MULADDSUB-OMP.cpp b/src/basic/MULADDSUB-OMP.cpp index 79d441bcc..a3af22ea9 100644 --- a/src/basic/MULADDSUB-OMP.cpp +++ b/src/basic/MULADDSUB-OMP.cpp @@ -18,7 +18,7 @@ namespace basic { -void MULADDSUB::runOpenMPVariant(VariantID vid) +void MULADDSUB::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/MULADDSUB-OMPTarget.cpp b/src/basic/MULADDSUB-OMPTarget.cpp index bca0164e3..f99bfb8f6 100644 --- a/src/basic/MULADDSUB-OMPTarget.cpp +++ 
b/src/basic/MULADDSUB-OMPTarget.cpp @@ -47,7 +47,7 @@ namespace basic deallocOpenMPDeviceData(in2, did); -void MULADDSUB::runOpenMPTargetVariant(VariantID vid) +void MULADDSUB::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/MULADDSUB-Seq.cpp b/src/basic/MULADDSUB-Seq.cpp index e6fb9f913..f846e76e1 100644 --- a/src/basic/MULADDSUB-Seq.cpp +++ b/src/basic/MULADDSUB-Seq.cpp @@ -18,7 +18,7 @@ namespace basic { -void MULADDSUB::runSeqVariant(VariantID vid) +void MULADDSUB::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/MULADDSUB.cpp b/src/basic/MULADDSUB.cpp index 7431a634c..91b8bd558 100644 --- a/src/basic/MULADDSUB.cpp +++ b/src/basic/MULADDSUB.cpp @@ -21,10 +21,6 @@ namespace basic MULADDSUB::MULADDSUB(const RunParams& params) : KernelBase(rajaperf::Basic_MULADDSUB, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - setDefaultProblemSize(1000000); setDefaultReps(350); @@ -61,7 +57,7 @@ MULADDSUB::~MULADDSUB() { } -void MULADDSUB::setUp(VariantID vid) +void MULADDSUB::setUp(VariantID vid, size_t /*tid*/) { allocAndInitDataConst(m_out1, getActualProblemSize(), 0.0, vid); allocAndInitDataConst(m_out2, getActualProblemSize(), 0.0, vid); @@ -70,14 +66,14 @@ void MULADDSUB::setUp(VariantID vid) allocAndInitData(m_in2, getActualProblemSize(), vid); } -void MULADDSUB::updateChecksum(VariantID vid) +void MULADDSUB::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_out1, getActualProblemSize()); checksum[vid] += calcChecksum(m_out2, getActualProblemSize()); checksum[vid] += calcChecksum(m_out3, getActualProblemSize()); } -void MULADDSUB::tearDown(VariantID vid) +void MULADDSUB::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_out1); diff --git a/src/basic/MULADDSUB.hpp b/src/basic/MULADDSUB.hpp index ac9b10ff6..0fed087f9 100644 --- a/src/basic/MULADDSUB.hpp +++ b/src/basic/MULADDSUB.hpp @@ -49,17 +49,18 @@ class MULADDSUB : public KernelBase ~MULADDSUB(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/basic/NESTED_INIT-Cuda.cpp b/src/basic/NESTED_INIT-Cuda.cpp index d55944cbc..0abde5ee4 100644 --- a/src/basic/NESTED_INIT-Cuda.cpp +++ b/src/basic/NESTED_INIT-Cuda.cpp @@ -172,7 +172,7 @@ void NESTED_INIT::runCudaVariantImpl(VariantID vid) } } -void NESTED_INIT::runCudaVariant(VariantID vid) 
+void NESTED_INIT::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/basic/NESTED_INIT-Hip.cpp b/src/basic/NESTED_INIT-Hip.cpp index 120938118..5d9409b82 100644 --- a/src/basic/NESTED_INIT-Hip.cpp +++ b/src/basic/NESTED_INIT-Hip.cpp @@ -174,7 +174,7 @@ void NESTED_INIT::runHipVariantImpl(VariantID vid) } } -void NESTED_INIT::runHipVariant(VariantID vid) +void NESTED_INIT::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/basic/NESTED_INIT-OMP.cpp b/src/basic/NESTED_INIT-OMP.cpp index a38521976..6059a407a 100644 --- a/src/basic/NESTED_INIT-OMP.cpp +++ b/src/basic/NESTED_INIT-OMP.cpp @@ -21,7 +21,7 @@ namespace basic #undef USE_OMP_COLLAPSE -void NESTED_INIT::runOpenMPVariant(VariantID vid) +void NESTED_INIT::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/NESTED_INIT-OMPTarget.cpp b/src/basic/NESTED_INIT-OMPTarget.cpp index b140ede6a..94f8db8dc 100644 --- a/src/basic/NESTED_INIT-OMPTarget.cpp +++ b/src/basic/NESTED_INIT-OMPTarget.cpp @@ -32,7 +32,7 @@ namespace basic deallocOpenMPDeviceData(array, did); -void NESTED_INIT::runOpenMPTargetVariant(VariantID vid) +void NESTED_INIT::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); diff --git a/src/basic/NESTED_INIT-Seq.cpp b/src/basic/NESTED_INIT-Seq.cpp index 578a544e0..9e43984ad 100644 --- a/src/basic/NESTED_INIT-Seq.cpp +++ b/src/basic/NESTED_INIT-Seq.cpp @@ -18,7 +18,7 @@ namespace basic { -void NESTED_INIT::runSeqVariant(VariantID vid) +void NESTED_INIT::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); diff --git a/src/basic/NESTED_INIT.cpp b/src/basic/NESTED_INIT.cpp index 472242480..bd7efd16b 100644 --- a/src/basic/NESTED_INIT.cpp +++ b/src/basic/NESTED_INIT.cpp @@ -24,10 +24,6 @@ namespace basic NESTED_INIT::NESTED_INIT(const RunParams& params) : KernelBase(rajaperf::Basic_NESTED_INIT, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - m_n_init = 100; setDefaultProblemSize(m_n_init * m_n_init * m_n_init); @@ -72,17 +68,17 @@ NESTED_INIT::~NESTED_INIT() { } -void NESTED_INIT::setUp(VariantID vid) +void NESTED_INIT::setUp(VariantID vid, size_t /*tid*/) { allocAndInitDataConst(m_array, m_array_length, 0.0, vid); } -void NESTED_INIT::updateChecksum(VariantID vid) +void NESTED_INIT::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_array, m_array_length); } -void NESTED_INIT::tearDown(VariantID vid) +void NESTED_INIT::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; RAJA::free_aligned(m_array); diff --git a/src/basic/NESTED_INIT.hpp b/src/basic/NESTED_INIT.hpp index 2e711cfdd..dea1fb711 100644 --- a/src/basic/NESTED_INIT.hpp +++ b/src/basic/NESTED_INIT.hpp @@ -49,17 +49,18 @@ class NESTED_INIT : public KernelBase ~NESTED_INIT(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/basic/PI_ATOMIC-Cuda.cpp b/src/basic/PI_ATOMIC-Cuda.cpp index a048fad66..4032d59f0 100644 --- a/src/basic/PI_ATOMIC-Cuda.cpp +++ b/src/basic/PI_ATOMIC-Cuda.cpp @@ -125,7 +125,7 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) } } -void PI_ATOMIC::runCudaVariant(VariantID vid) +void PI_ATOMIC::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/basic/PI_ATOMIC-Hip.cpp b/src/basic/PI_ATOMIC-Hip.cpp index 2015e71f5..775972364 100644 --- a/src/basic/PI_ATOMIC-Hip.cpp +++ b/src/basic/PI_ATOMIC-Hip.cpp @@ -127,7 +127,7 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) } } -void PI_ATOMIC::runHipVariant(VariantID vid) +void PI_ATOMIC::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/basic/PI_ATOMIC-OMP.cpp b/src/basic/PI_ATOMIC-OMP.cpp index 555e22826..70c00b544 100644 --- a/src/basic/PI_ATOMIC-OMP.cpp +++ b/src/basic/PI_ATOMIC-OMP.cpp @@ -18,7 +18,7 @@ namespace basic { -void PI_ATOMIC::runOpenMPVariant(VariantID vid) +void PI_ATOMIC::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/PI_ATOMIC-OMPTarget.cpp b/src/basic/PI_ATOMIC-OMPTarget.cpp index 0d6443423..dfa821883 100644 --- a/src/basic/PI_ATOMIC-OMPTarget.cpp +++ b/src/basic/PI_ATOMIC-OMPTarget.cpp @@ -36,7 +36,7 @@ namespace basic deallocOpenMPDeviceData(pi, did); -void PI_ATOMIC::runOpenMPTargetVariant(VariantID vid) +void PI_ATOMIC::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = 
getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/PI_ATOMIC-Seq.cpp b/src/basic/PI_ATOMIC-Seq.cpp index 0fee34737..86cf14315 100644 --- a/src/basic/PI_ATOMIC-Seq.cpp +++ b/src/basic/PI_ATOMIC-Seq.cpp @@ -18,7 +18,7 @@ namespace basic { -void PI_ATOMIC::runSeqVariant(VariantID vid) +void PI_ATOMIC::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/PI_ATOMIC.cpp b/src/basic/PI_ATOMIC.cpp index f4e52ec8f..05db0af1c 100644 --- a/src/basic/PI_ATOMIC.cpp +++ b/src/basic/PI_ATOMIC.cpp @@ -21,10 +21,6 @@ namespace basic PI_ATOMIC::PI_ATOMIC(const RunParams& params) : KernelBase(rajaperf::Basic_PI_ATOMIC, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - setDefaultProblemSize(1000000); setDefaultReps(50); @@ -63,19 +59,19 @@ PI_ATOMIC::~PI_ATOMIC() { } -void PI_ATOMIC::setUp(VariantID vid) +void PI_ATOMIC::setUp(VariantID vid, size_t /*tid*/) { m_dx = 1.0 / double(getActualProblemSize()); allocAndInitDataConst(m_pi, 1, 0.0, vid); m_pi_init = 0.0; } -void PI_ATOMIC::updateChecksum(VariantID vid) +void PI_ATOMIC::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += Checksum_type(*m_pi); } -void PI_ATOMIC::tearDown(VariantID vid) +void PI_ATOMIC::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_pi); diff --git a/src/basic/PI_ATOMIC.hpp b/src/basic/PI_ATOMIC.hpp index aedcd606c..1842ac2b5 100644 --- a/src/basic/PI_ATOMIC.hpp +++ b/src/basic/PI_ATOMIC.hpp @@ -45,17 +45,18 @@ class PI_ATOMIC : public KernelBase ~PI_ATOMIC(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/basic/PI_REDUCE-Cuda.cpp b/src/basic/PI_REDUCE-Cuda.cpp index 5e25ddbad..efabb5aba 100644 --- a/src/basic/PI_REDUCE-Cuda.cpp +++ b/src/basic/PI_REDUCE-Cuda.cpp @@ -117,7 +117,7 @@ void PI_REDUCE::runCudaVariantImpl(VariantID vid) } } -void PI_REDUCE::runCudaVariant(VariantID vid) +void PI_REDUCE::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/basic/PI_REDUCE-Hip.cpp b/src/basic/PI_REDUCE-Hip.cpp index f6ad239d2..fe88ef970 100644 --- a/src/basic/PI_REDUCE-Hip.cpp +++ b/src/basic/PI_REDUCE-Hip.cpp @@ -116,7 +116,7 @@ void PI_REDUCE::runHipVariantImpl(VariantID vid) } } -void PI_REDUCE::runHipVariant(VariantID vid) +void PI_REDUCE::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( 
gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/basic/PI_REDUCE-OMP.cpp b/src/basic/PI_REDUCE-OMP.cpp index 56a7c59cd..6718ec500 100644 --- a/src/basic/PI_REDUCE-OMP.cpp +++ b/src/basic/PI_REDUCE-OMP.cpp @@ -18,7 +18,7 @@ namespace basic { -void PI_REDUCE::runOpenMPVariant(VariantID vid) +void PI_REDUCE::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/PI_REDUCE-OMPTarget.cpp b/src/basic/PI_REDUCE-OMPTarget.cpp index 4f4870b5c..0f0bb2289 100644 --- a/src/basic/PI_REDUCE-OMPTarget.cpp +++ b/src/basic/PI_REDUCE-OMPTarget.cpp @@ -27,7 +27,7 @@ namespace basic const size_t threads_per_team = 256; -void PI_REDUCE::runOpenMPTargetVariant(VariantID vid) +void PI_REDUCE::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/PI_REDUCE-Seq.cpp b/src/basic/PI_REDUCE-Seq.cpp index 577ebfb6b..997d74d90 100644 --- a/src/basic/PI_REDUCE-Seq.cpp +++ b/src/basic/PI_REDUCE-Seq.cpp @@ -18,7 +18,7 @@ namespace basic { -void PI_REDUCE::runSeqVariant(VariantID vid) +void PI_REDUCE::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/PI_REDUCE.cpp b/src/basic/PI_REDUCE.cpp index af0ded096..9c6e6d7e0 100644 --- a/src/basic/PI_REDUCE.cpp +++ b/src/basic/PI_REDUCE.cpp @@ -21,10 +21,6 @@ namespace basic PI_REDUCE::PI_REDUCE(const RunParams& params) : KernelBase(rajaperf::Basic_PI_REDUCE, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - setDefaultProblemSize(1000000); setDefaultReps(50); @@ -61,7 +57,7 @@ PI_REDUCE::~PI_REDUCE() { } -void PI_REDUCE::setUp(VariantID vid) +void PI_REDUCE::setUp(VariantID vid, size_t /*tid*/) { (void) vid; m_dx = 1.0 / double(getActualProblemSize()); @@ -69,12 +65,12 @@ void PI_REDUCE::setUp(VariantID vid) m_pi = 0.0; } -void PI_REDUCE::updateChecksum(VariantID vid) +void PI_REDUCE::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += Checksum_type(m_pi); } -void PI_REDUCE::tearDown(VariantID vid) +void PI_REDUCE::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; } diff --git a/src/basic/PI_REDUCE.hpp b/src/basic/PI_REDUCE.hpp index 42c7a5dbe..990bf9509 100644 --- a/src/basic/PI_REDUCE.hpp +++ b/src/basic/PI_REDUCE.hpp @@ -47,17 +47,18 @@ class PI_REDUCE : public KernelBase ~PI_REDUCE(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < 
size_t block_size > diff --git a/src/basic/REDUCE3_INT-Cuda.cpp b/src/basic/REDUCE3_INT-Cuda.cpp index 9699f230b..a6e69c952 100644 --- a/src/basic/REDUCE3_INT-Cuda.cpp +++ b/src/basic/REDUCE3_INT-Cuda.cpp @@ -161,7 +161,7 @@ void REDUCE3_INT::runCudaVariantImpl(VariantID vid) } } -void REDUCE3_INT::runCudaVariant(VariantID vid) +void REDUCE3_INT::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/basic/REDUCE3_INT-Hip.cpp b/src/basic/REDUCE3_INT-Hip.cpp index 0983db52b..ad0abdef9 100644 --- a/src/basic/REDUCE3_INT-Hip.cpp +++ b/src/basic/REDUCE3_INT-Hip.cpp @@ -161,7 +161,7 @@ void REDUCE3_INT::runHipVariantImpl(VariantID vid) } } -void REDUCE3_INT::runHipVariant(VariantID vid) +void REDUCE3_INT::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/basic/REDUCE3_INT-OMP.cpp b/src/basic/REDUCE3_INT-OMP.cpp index bb22a3ce8..876192a80 100644 --- a/src/basic/REDUCE3_INT-OMP.cpp +++ b/src/basic/REDUCE3_INT-OMP.cpp @@ -19,7 +19,7 @@ namespace basic { -void REDUCE3_INT::runOpenMPVariant(VariantID vid) +void REDUCE3_INT::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/REDUCE3_INT-OMPTarget.cpp b/src/basic/REDUCE3_INT-OMPTarget.cpp index 08f184510..be420fb4d 100644 --- a/src/basic/REDUCE3_INT-OMPTarget.cpp +++ b/src/basic/REDUCE3_INT-OMPTarget.cpp @@ -36,7 +36,7 @@ namespace basic deallocOpenMPDeviceData(vec, did); \ -void REDUCE3_INT::runOpenMPTargetVariant(VariantID vid) +void REDUCE3_INT::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/REDUCE3_INT-Seq.cpp b/src/basic/REDUCE3_INT-Seq.cpp index 7170cecd7..5f1ccc327 100644 --- a/src/basic/REDUCE3_INT-Seq.cpp +++ b/src/basic/REDUCE3_INT-Seq.cpp @@ -19,7 +19,7 @@ namespace basic { -void REDUCE3_INT::runSeqVariant(VariantID vid) +void REDUCE3_INT::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/REDUCE3_INT.cpp b/src/basic/REDUCE3_INT.cpp index 7b6be51ce..49fcb8562 100644 --- a/src/basic/REDUCE3_INT.cpp +++ b/src/basic/REDUCE3_INT.cpp @@ -23,10 +23,6 @@ namespace basic REDUCE3_INT::REDUCE3_INT(const RunParams& params) : KernelBase(rajaperf::Basic_REDUCE3_INT, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - setDefaultProblemSize(1000000); //setDefaultReps(5000); // Set reps to low value until we resolve RAJA omp-target @@ -66,7 +62,7 @@ REDUCE3_INT::~REDUCE3_INT() { } -void REDUCE3_INT::setUp(VariantID vid) +void REDUCE3_INT::setUp(VariantID vid, size_t /*tid*/) { allocAndInitData(m_vec, getActualProblemSize(), vid); @@ -78,14 +74,14 @@ void REDUCE3_INT::setUp(VariantID vid) m_vmax_init = std::numeric_limits<Int_type>::min(); } -void REDUCE3_INT::updateChecksum(VariantID vid) +void REDUCE3_INT::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += m_vsum; checksum[vid] += m_vmin; checksum[vid] += m_vmax; } -void REDUCE3_INT::tearDown(VariantID vid) +void REDUCE3_INT::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_vec); diff --git a/src/basic/REDUCE3_INT.hpp b/src/basic/REDUCE3_INT.hpp index c51ddebe0..2d1e03230 100644 --- a/src/basic/REDUCE3_INT.hpp +++ b/src/basic/REDUCE3_INT.hpp @@ -61,17 +61,18 @@ class REDUCE3_INT : public KernelBase ~REDUCE3_INT(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/basic/TRAP_INT-Cuda.cpp b/src/basic/TRAP_INT-Cuda.cpp index a94f819bf..717038de5 100644 --- a/src/basic/TRAP_INT-Cuda.cpp +++ b/src/basic/TRAP_INT-Cuda.cpp @@ -151,7 +151,7 @@ void TRAP_INT::runCudaVariantImpl(VariantID vid) } } -void TRAP_INT::runCudaVariant(VariantID vid) +void TRAP_INT::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/basic/TRAP_INT-Hip.cpp b/src/basic/TRAP_INT-Hip.cpp index 87da36adc..4b1c834d7 100644 --- a/src/basic/TRAP_INT-Hip.cpp +++ b/src/basic/TRAP_INT-Hip.cpp @@ -150,7 +150,7 @@ void TRAP_INT::runHipVariantImpl(VariantID vid) } } -void TRAP_INT::runHipVariant(VariantID vid) +void TRAP_INT::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/basic/TRAP_INT-OMP.cpp b/src/basic/TRAP_INT-OMP.cpp index 19017fa76..8d58fcddd 100644 --- a/src/basic/TRAP_INT-OMP.cpp +++ b/src/basic/TRAP_INT-OMP.cpp @@ -32,7 +32,7 @@ Real_type trap_int_func(Real_type x, } -void TRAP_INT::runOpenMPVariant(VariantID vid) +void TRAP_INT::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/TRAP_INT-OMPTarget.cpp b/src/basic/TRAP_INT-OMPTarget.cpp index ad5717923..4b42fa4d0 100644 --- a/src/basic/TRAP_INT-OMPTarget.cpp +++ b/src/basic/TRAP_INT-OMPTarget.cpp @@ -46,7 +46,7 @@ Real_type trap_int_func(Real_type x, #define 
TRAP_INT_DATA_TEARDOWN_OMP_TARGET // nothing to do here...

-void TRAP_INT::runOpenMPTargetVariant(VariantID vid)
+void TRAP_INT::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/)
 {
   const Index_type run_reps = getRunReps();
   const Index_type ibegin = 0;
diff --git a/src/basic/TRAP_INT-Seq.cpp b/src/basic/TRAP_INT-Seq.cpp
index ba411e513..89feb42cc 100644
--- a/src/basic/TRAP_INT-Seq.cpp
+++ b/src/basic/TRAP_INT-Seq.cpp
@@ -32,7 +32,7 @@ Real_type trap_int_func(Real_type x,
 }
-void TRAP_INT::runSeqVariant(VariantID vid)
+void TRAP_INT::runSeqVariant(VariantID vid, size_t /*tid*/)
 {
   const Index_type run_reps = getRunReps();
   const Index_type ibegin = 0;
diff --git a/src/basic/TRAP_INT.cpp b/src/basic/TRAP_INT.cpp
index b6c31aada..c1ec43ff4 100644
--- a/src/basic/TRAP_INT.cpp
+++ b/src/basic/TRAP_INT.cpp
@@ -21,10 +21,6 @@ namespace basic
 TRAP_INT::TRAP_INT(const RunParams& params)
   : KernelBase(rajaperf::Basic_TRAP_INT, params)
 {
-  setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) );
-  setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize()
-                                                        : getDefaultGPUBlockSize() );
-
   setDefaultProblemSize(1000000);
   setDefaultReps(50);
@@ -61,7 +57,7 @@ TRAP_INT::~TRAP_INT()
 {
 }
-void TRAP_INT::setUp(VariantID vid)
+void TRAP_INT::setUp(VariantID vid, size_t /*tid*/)
 {
   Real_type xn;
   initData(xn, vid);
@@ -78,12 +74,12 @@ void TRAP_INT::setUp(VariantID vid)
   m_sumx = 0;
 }
-void TRAP_INT::updateChecksum(VariantID vid)
+void TRAP_INT::updateChecksum(VariantID vid, size_t tid)
 {
   checksum[vid] += m_sumx;
 }
-void TRAP_INT::tearDown(VariantID vid)
+void TRAP_INT::tearDown(VariantID vid, size_t /*tid*/)
 {
   (void) vid;
 }
diff --git a/src/basic/TRAP_INT.hpp b/src/basic/TRAP_INT.hpp
index 8bb8f5f7d..37be87970 100644
--- a/src/basic/TRAP_INT.hpp
+++ b/src/basic/TRAP_INT.hpp
@@ -58,17 +58,18 @@ class TRAP_INT : public KernelBase
   ~TRAP_INT();
-  void setUp(VariantID vid);
-  void updateChecksum(VariantID vid);
-  void tearDown(VariantID vid);
-
-  void runSeqVariant(VariantID vid);
-  void runOpenMPVariant(VariantID vid);
-  void runCudaVariant(VariantID vid);
-  void runHipVariant(VariantID vid);
-  void runOpenMPTargetVariant(VariantID vid);
-
-  bool isGPUBlockSizeSupported() const;
+  void setUp(VariantID vid, size_t tid);
+  void updateChecksum(VariantID vid, size_t tid);
+  void tearDown(VariantID vid, size_t tid);
+
+  void runSeqVariant(VariantID vid, size_t tid);
+  void runOpenMPVariant(VariantID vid, size_t tid);
+  void runCudaVariant(VariantID vid, size_t tid);
+  void runHipVariant(VariantID vid, size_t tid);
+  void runOpenMPTargetVariant(VariantID vid, size_t tid);
+
+  void setCudaTuningDefinitions(VariantID vid);
+  void setHipTuningDefinitions(VariantID vid);
   template < size_t block_size >
   void runCudaVariantImpl(VariantID vid);
   template < size_t block_size >
diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp
index 518c1d9b8..3afcd8bf5 100644
--- a/src/common/KernelBase.cpp
+++ b/src/common/KernelBase.cpp
@@ -153,13 +153,13 @@ void KernelBase::execute(VariantID vid, size_t tid)
   resetTimer();
   resetDataInitCount();
-  this->setUp(vid);
+  this->setUp(vid, tid);
   this->runKernel(vid, tid);
   this->updateChecksum(vid, tid);
-  this->tearDown(vid);
+  this->tearDown(vid, tid);
   running_variant = NumVariants;
   running_tuning = std::numeric_limits::max();
diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp
index 6e4b63feb..5465c5aa5 100644
--- a/src/common/KernelBase.hpp
+++ b/src/common/KernelBase.hpp
@@ -204,9 +204,9 @@ class KernelBase
   virtual void runKernel(VariantID vid, size_t tid);
-  virtual void setUp(VariantID vid) = 0;
+  virtual void setUp(VariantID vid, size_t tid) = 0;
   virtual void updateChecksum(VariantID vid, size_t tid) = 0;
-  virtual void tearDown(VariantID vid) = 0;
+  virtual void tearDown(VariantID vid, size_t tid) = 0;
   virtual void runSeqVariant(VariantID vid, size_t tid) = 0;
 #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP)
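The KernelBase hunks above are the crux of this patch: setUp, updateChecksum, tearDown, and the per-backend run methods now receive a tuning index (tid) alongside the variant id, and kernels gain setCudaTuningDefinitions / setHipTuningDefinitions hooks. The stand-alone C++ sketch below only illustrates that shape; it is not code from the suite, and the class name ExampleKernel, the 128/256/512 block-size list, and the switch-based dispatch are assumptions made for the example.

#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

using VariantID = int;  // stand-in for the suite's VariantID enum

class ExampleKernel
{
public:
  // Register one tuning name per supported block size (naming scheme assumed).
  void setCudaTuningDefinitions(VariantID /*vid*/)
  {
    for (size_t block_size : {128, 256, 512}) {
      m_tuning_names.push_back("block_" + std::to_string(block_size));
    }
  }

  // Dispatch on the tuning index instead of a single hard-coded block size.
  void runCudaVariant(VariantID vid, size_t tid)
  {
    switch (tid) {
      case 0: runCudaVariantImpl<128>(vid); break;
      case 1: runCudaVariantImpl<256>(vid); break;
      case 2: runCudaVariantImpl<512>(vid); break;
      default:
        std::cout << "Unknown tuning index = " << tid << std::endl;
    }
  }

private:
  template < size_t block_size >
  void runCudaVariantImpl(VariantID /*vid*/)
  {
    // A real kernel would launch its GPU kernel with this block size here.
    std::cout << "running tuning with block_size = " << block_size << std::endl;
  }

  std::vector<std::string> m_tuning_names;
};

int main()
{
  ExampleKernel k;
  k.setCudaTuningDefinitions(0);
  k.runCudaVariant(0, 1);  // second registered tuning, i.e. block size 256
  return 0;
}

In the suite itself the dispatch goes through the gpu_block_size helpers visible in the hunks below (invoke_or, RunCudaBlockSize, gpu_block_sizes_type) rather than a hand-written switch.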
diff --git a/src/lcals/DIFF_PREDICT-Cuda.cpp b/src/lcals/DIFF_PREDICT-Cuda.cpp
index b9b22bb7d..68adea902 100644
--- a/src/lcals/DIFF_PREDICT-Cuda.cpp
+++ b/src/lcals/DIFF_PREDICT-Cuda.cpp
@@ -92,7 +92,7 @@ void DIFF_PREDICT::runCudaVariantImpl(VariantID vid)
   }
 }
-void DIFF_PREDICT::runCudaVariant(VariantID vid)
+void DIFF_PREDICT::runCudaVariant(VariantID vid, size_t tid)
 {
   if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) {
diff --git a/src/lcals/DIFF_PREDICT-Hip.cpp b/src/lcals/DIFF_PREDICT-Hip.cpp
index a450a8bb6..27bf9d348 100644
--- a/src/lcals/DIFF_PREDICT-Hip.cpp
+++ b/src/lcals/DIFF_PREDICT-Hip.cpp
@@ -92,7 +92,7 @@ void DIFF_PREDICT::runHipVariantImpl(VariantID vid)
   }
 }
-void DIFF_PREDICT::runHipVariant(VariantID vid)
+void DIFF_PREDICT::runHipVariant(VariantID vid, size_t tid)
 {
   if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) {
diff --git a/src/lcals/DIFF_PREDICT-OMP.cpp b/src/lcals/DIFF_PREDICT-OMP.cpp
index ebe91fe92..a40a4e88d 100644
--- a/src/lcals/DIFF_PREDICT-OMP.cpp
+++ b/src/lcals/DIFF_PREDICT-OMP.cpp
@@ -18,7 +18,7 @@ namespace lcals
 {
-void DIFF_PREDICT::runOpenMPVariant(VariantID vid)
+void DIFF_PREDICT::runOpenMPVariant(VariantID vid, size_t /*tid*/)
 {
 #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP)
diff --git a/src/lcals/DIFF_PREDICT-OMPTarget.cpp b/src/lcals/DIFF_PREDICT-OMPTarget.cpp
index 60e6c45e7..348fb24cb 100644
--- a/src/lcals/DIFF_PREDICT-OMPTarget.cpp
+++ b/src/lcals/DIFF_PREDICT-OMPTarget.cpp
@@ -39,7 +39,7 @@ namespace lcals
   deallocOpenMPDeviceData(cx, did);
-void DIFF_PREDICT::runOpenMPTargetVariant(VariantID vid)
+void DIFF_PREDICT::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/)
 {
   const Index_type run_reps = getRunReps();
   const Index_type ibegin = 0;
diff --git a/src/lcals/DIFF_PREDICT-Seq.cpp b/src/lcals/DIFF_PREDICT-Seq.cpp
index bf74477e7..3ddbf1db1 100644
--- a/src/lcals/DIFF_PREDICT-Seq.cpp
+++ b/src/lcals/DIFF_PREDICT-Seq.cpp
@@ -18,7 +18,7 @@ namespace lcals
 {
-void DIFF_PREDICT::runSeqVariant(VariantID vid)
+void DIFF_PREDICT::runSeqVariant(VariantID vid, size_t /*tid*/)
 {
   const Index_type run_reps = getRunReps();
   const Index_type ibegin = 0;
diff --git a/src/lcals/DIFF_PREDICT.cpp b/src/lcals/DIFF_PREDICT.cpp
index 2f13f36cb..0cce0a80d 100644
--- a/src/lcals/DIFF_PREDICT.cpp
+++ b/src/lcals/DIFF_PREDICT.cpp
@@ -20,10 +20,6 @@ namespace lcals
 DIFF_PREDICT::DIFF_PREDICT(const RunParams& params)
   : KernelBase(rajaperf::Lcals_DIFF_PREDICT, params)
 {
-  setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) );
-  setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ?
params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - setDefaultProblemSize(1000000); setDefaultReps(200); @@ -59,7 +55,7 @@ DIFF_PREDICT::~DIFF_PREDICT() { } -void DIFF_PREDICT::setUp(VariantID vid) +void DIFF_PREDICT::setUp(VariantID vid, size_t /*tid*/) { m_array_length = getActualProblemSize() * 14; m_offset = getActualProblemSize(); @@ -68,12 +64,12 @@ void DIFF_PREDICT::setUp(VariantID vid) allocAndInitData(m_cx, m_array_length, vid); } -void DIFF_PREDICT::updateChecksum(VariantID vid) +void DIFF_PREDICT::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_px, m_array_length); } -void DIFF_PREDICT::tearDown(VariantID vid) +void DIFF_PREDICT::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_px); diff --git a/src/lcals/DIFF_PREDICT.hpp b/src/lcals/DIFF_PREDICT.hpp index 6e5c0203c..86f6f91d8 100644 --- a/src/lcals/DIFF_PREDICT.hpp +++ b/src/lcals/DIFF_PREDICT.hpp @@ -84,17 +84,18 @@ class DIFF_PREDICT : public KernelBase ~DIFF_PREDICT(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/lcals/EOS-Cuda.cpp b/src/lcals/EOS-Cuda.cpp index 144f118ee..0de00b2d5 100644 --- a/src/lcals/EOS-Cuda.cpp +++ b/src/lcals/EOS-Cuda.cpp @@ -96,7 +96,7 @@ void EOS::runCudaVariantImpl(VariantID vid) } } -void EOS::runCudaVariant(VariantID vid) +void EOS::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/lcals/EOS-Hip.cpp b/src/lcals/EOS-Hip.cpp index 8f79b7d71..add8f5e22 100644 --- a/src/lcals/EOS-Hip.cpp +++ b/src/lcals/EOS-Hip.cpp @@ -96,7 +96,7 @@ void EOS::runHipVariantImpl(VariantID vid) } } -void EOS::runHipVariant(VariantID vid) +void EOS::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/lcals/EOS-OMP.cpp b/src/lcals/EOS-OMP.cpp index c5a8c8490..28947b754 100644 --- a/src/lcals/EOS-OMP.cpp +++ b/src/lcals/EOS-OMP.cpp @@ -18,7 +18,7 @@ namespace lcals { -void EOS::runOpenMPVariant(VariantID vid) +void EOS::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/lcals/EOS-OMPTarget.cpp b/src/lcals/EOS-OMPTarget.cpp index b0f2fe008..c6efd7a00 100644 --- a/src/lcals/EOS-OMPTarget.cpp +++ b/src/lcals/EOS-OMPTarget.cpp @@ -43,7 +43,7 @@ namespace lcals deallocOpenMPDeviceData(u, did); -void EOS::runOpenMPTargetVariant(VariantID vid) +void EOS::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff 
--git a/src/lcals/EOS-Seq.cpp b/src/lcals/EOS-Seq.cpp index 66f308c2b..9f26a9d06 100644 --- a/src/lcals/EOS-Seq.cpp +++ b/src/lcals/EOS-Seq.cpp @@ -18,7 +18,7 @@ namespace lcals { -void EOS::runSeqVariant(VariantID vid) +void EOS::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/EOS.cpp b/src/lcals/EOS.cpp index 033a1a4fb..87d47e605 100644 --- a/src/lcals/EOS.cpp +++ b/src/lcals/EOS.cpp @@ -21,10 +21,6 @@ namespace lcals EOS::EOS(const RunParams& params) : KernelBase(rajaperf::Lcals_EOS, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - setDefaultProblemSize(1000000); setDefaultReps(500); @@ -67,7 +63,7 @@ EOS::~EOS() { } -void EOS::setUp(VariantID vid) +void EOS::setUp(VariantID vid, size_t /*tid*/) { allocAndInitDataConst(m_x, m_array_length, 0.0, vid); allocAndInitData(m_y, m_array_length, vid); @@ -79,12 +75,12 @@ void EOS::setUp(VariantID vid) initData(m_t, vid); } -void EOS::updateChecksum(VariantID vid) +void EOS::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_x, getActualProblemSize(), checksum_scale_factor ); } -void EOS::tearDown(VariantID vid) +void EOS::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_x); diff --git a/src/lcals/EOS.hpp b/src/lcals/EOS.hpp index 5c4ac5ff1..909b2e569 100644 --- a/src/lcals/EOS.hpp +++ b/src/lcals/EOS.hpp @@ -53,17 +53,18 @@ class EOS : public KernelBase ~EOS(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/lcals/FIRST_DIFF-Cuda.cpp b/src/lcals/FIRST_DIFF-Cuda.cpp index 7d18c922c..a8e92b781 100644 --- a/src/lcals/FIRST_DIFF-Cuda.cpp +++ b/src/lcals/FIRST_DIFF-Cuda.cpp @@ -90,7 +90,7 @@ void FIRST_DIFF::runCudaVariantImpl(VariantID vid) } } -void FIRST_DIFF::runCudaVariant(VariantID vid) +void FIRST_DIFF::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/lcals/FIRST_DIFF-Hip.cpp b/src/lcals/FIRST_DIFF-Hip.cpp index acc91122a..f38333511 100644 --- a/src/lcals/FIRST_DIFF-Hip.cpp +++ b/src/lcals/FIRST_DIFF-Hip.cpp @@ -90,7 +90,7 @@ void FIRST_DIFF::runHipVariantImpl(VariantID vid) } } -void FIRST_DIFF::runHipVariant(VariantID vid) +void FIRST_DIFF::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git 
a/src/lcals/FIRST_DIFF-OMP.cpp b/src/lcals/FIRST_DIFF-OMP.cpp index ae2a2e995..e805df8f5 100644 --- a/src/lcals/FIRST_DIFF-OMP.cpp +++ b/src/lcals/FIRST_DIFF-OMP.cpp @@ -18,7 +18,7 @@ namespace lcals { -void FIRST_DIFF::runOpenMPVariant(VariantID vid) +void FIRST_DIFF::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/lcals/FIRST_DIFF-OMPTarget.cpp b/src/lcals/FIRST_DIFF-OMPTarget.cpp index 0688731d4..09958a0d0 100644 --- a/src/lcals/FIRST_DIFF-OMPTarget.cpp +++ b/src/lcals/FIRST_DIFF-OMPTarget.cpp @@ -39,7 +39,7 @@ namespace lcals deallocOpenMPDeviceData(y, did); -void FIRST_DIFF::runOpenMPTargetVariant(VariantID vid) +void FIRST_DIFF::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/FIRST_DIFF-Seq.cpp b/src/lcals/FIRST_DIFF-Seq.cpp index 62b43af09..6eaa2244e 100644 --- a/src/lcals/FIRST_DIFF-Seq.cpp +++ b/src/lcals/FIRST_DIFF-Seq.cpp @@ -18,7 +18,7 @@ namespace lcals { -void FIRST_DIFF::runSeqVariant(VariantID vid) +void FIRST_DIFF::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/FIRST_DIFF.cpp b/src/lcals/FIRST_DIFF.cpp index f9a62b683..0fd301c18 100644 --- a/src/lcals/FIRST_DIFF.cpp +++ b/src/lcals/FIRST_DIFF.cpp @@ -21,10 +21,6 @@ namespace lcals FIRST_DIFF::FIRST_DIFF(const RunParams& params) : KernelBase(rajaperf::Lcals_FIRST_DIFF, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - setDefaultProblemSize(1000000); setDefaultReps(2000); @@ -63,18 +59,18 @@ FIRST_DIFF::~FIRST_DIFF() { } -void FIRST_DIFF::setUp(VariantID vid) +void FIRST_DIFF::setUp(VariantID vid, size_t /*tid*/) { allocAndInitDataConst(m_x, m_N, 0.0, vid); allocAndInitData(m_y, m_N, vid); } -void FIRST_DIFF::updateChecksum(VariantID vid) +void FIRST_DIFF::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_x, getActualProblemSize()); } -void FIRST_DIFF::tearDown(VariantID vid) +void FIRST_DIFF::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_x); diff --git a/src/lcals/FIRST_DIFF.hpp b/src/lcals/FIRST_DIFF.hpp index 5f18a2037..efd3282d3 100644 --- a/src/lcals/FIRST_DIFF.hpp +++ b/src/lcals/FIRST_DIFF.hpp @@ -43,17 +43,18 @@ class FIRST_DIFF : public KernelBase ~FIRST_DIFF(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git 
a/src/lcals/FIRST_MIN-Cuda.cpp b/src/lcals/FIRST_MIN-Cuda.cpp index 3021a98f3..0d16e24d6 100644 --- a/src/lcals/FIRST_MIN-Cuda.cpp +++ b/src/lcals/FIRST_MIN-Cuda.cpp @@ -130,7 +130,7 @@ void FIRST_MIN::runCudaVariantImpl(VariantID vid) } } -void FIRST_MIN::runCudaVariant(VariantID vid) +void FIRST_MIN::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/lcals/FIRST_MIN-Hip.cpp b/src/lcals/FIRST_MIN-Hip.cpp index 06159f478..7aaf2a144 100644 --- a/src/lcals/FIRST_MIN-Hip.cpp +++ b/src/lcals/FIRST_MIN-Hip.cpp @@ -130,7 +130,7 @@ void FIRST_MIN::runHipVariantImpl(VariantID vid) } } -void FIRST_MIN::runHipVariant(VariantID vid) +void FIRST_MIN::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/lcals/FIRST_MIN-OMP.cpp b/src/lcals/FIRST_MIN-OMP.cpp index 9ebc5f326..ae380d555 100644 --- a/src/lcals/FIRST_MIN-OMP.cpp +++ b/src/lcals/FIRST_MIN-OMP.cpp @@ -19,7 +19,7 @@ namespace lcals FIRST_MIN_MINLOC_COMPARE; -void FIRST_MIN::runOpenMPVariant(VariantID vid) +void FIRST_MIN::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/lcals/FIRST_MIN-OMPTarget.cpp b/src/lcals/FIRST_MIN-OMPTarget.cpp index 274e9affe..61a1c8d81 100644 --- a/src/lcals/FIRST_MIN-OMPTarget.cpp +++ b/src/lcals/FIRST_MIN-OMPTarget.cpp @@ -37,7 +37,7 @@ namespace lcals FIRST_MIN_MINLOC_COMPARE; -void FIRST_MIN::runOpenMPTargetVariant(VariantID vid) +void FIRST_MIN::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/FIRST_MIN-Seq.cpp b/src/lcals/FIRST_MIN-Seq.cpp index fec75aadc..1a44e9b05 100644 --- a/src/lcals/FIRST_MIN-Seq.cpp +++ b/src/lcals/FIRST_MIN-Seq.cpp @@ -18,7 +18,7 @@ namespace lcals { -void FIRST_MIN::runSeqVariant(VariantID vid) +void FIRST_MIN::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/FIRST_MIN.cpp b/src/lcals/FIRST_MIN.cpp index 42bdae851..7c1409b4d 100644 --- a/src/lcals/FIRST_MIN.cpp +++ b/src/lcals/FIRST_MIN.cpp @@ -21,10 +21,6 @@ namespace lcals FIRST_MIN::FIRST_MIN(const RunParams& params) : KernelBase(rajaperf::Lcals_FIRST_MIN, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - setDefaultProblemSize(1000000); //setDefaultReps(1000); // Set reps to low value until we resolve RAJA omp-target @@ -67,7 +63,7 @@ FIRST_MIN::~FIRST_MIN() { } -void FIRST_MIN::setUp(VariantID vid) +void FIRST_MIN::setUp(VariantID vid, size_t /*tid*/) { allocAndInitDataConst(m_x, m_N, 0.0, vid); m_x[ m_N / 2 ] = -1.0e+10; @@ -76,12 +72,12 @@ void FIRST_MIN::setUp(VariantID vid) m_minloc = -1; } -void FIRST_MIN::updateChecksum(VariantID vid) +void FIRST_MIN::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += static_cast(m_minloc); } -void FIRST_MIN::tearDown(VariantID vid) +void FIRST_MIN::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_x); diff --git a/src/lcals/FIRST_MIN.hpp b/src/lcals/FIRST_MIN.hpp index ea111c0a8..86dc247f7 100644 --- a/src/lcals/FIRST_MIN.hpp +++ b/src/lcals/FIRST_MIN.hpp @@ -72,17 +72,18 @@ class FIRST_MIN : public KernelBase ~FIRST_MIN(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/lcals/FIRST_SUM-Cuda.cpp b/src/lcals/FIRST_SUM-Cuda.cpp index c82f47458..1422977f7 100644 --- a/src/lcals/FIRST_SUM-Cuda.cpp +++ b/src/lcals/FIRST_SUM-Cuda.cpp @@ -90,7 +90,7 @@ void FIRST_SUM::runCudaVariantImpl(VariantID vid) } } -void FIRST_SUM::runCudaVariant(VariantID vid) +void FIRST_SUM::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/lcals/FIRST_SUM-Hip.cpp b/src/lcals/FIRST_SUM-Hip.cpp index 2fe27e866..04e761a33 100644 --- a/src/lcals/FIRST_SUM-Hip.cpp +++ b/src/lcals/FIRST_SUM-Hip.cpp @@ -90,7 +90,7 @@ void FIRST_SUM::runHipVariantImpl(VariantID vid) } } -void FIRST_SUM::runHipVariant(VariantID vid) +void FIRST_SUM::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/lcals/FIRST_SUM-OMP.cpp b/src/lcals/FIRST_SUM-OMP.cpp index b9905666a..53e4e8821 100644 --- a/src/lcals/FIRST_SUM-OMP.cpp +++ b/src/lcals/FIRST_SUM-OMP.cpp @@ -18,7 +18,7 @@ namespace lcals { -void FIRST_SUM::runOpenMPVariant(VariantID vid) +void FIRST_SUM::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/lcals/FIRST_SUM-OMPTarget.cpp b/src/lcals/FIRST_SUM-OMPTarget.cpp index 19344df4c..9e7bf41e5 100644 --- a/src/lcals/FIRST_SUM-OMPTarget.cpp +++ b/src/lcals/FIRST_SUM-OMPTarget.cpp @@ -39,7 +39,7 @@ namespace lcals deallocOpenMPDeviceData(y, did); -void FIRST_SUM::runOpenMPTargetVariant(VariantID vid) +void 
FIRST_SUM::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; diff --git a/src/lcals/FIRST_SUM-Seq.cpp b/src/lcals/FIRST_SUM-Seq.cpp index cbb96a695..d88a830d8 100644 --- a/src/lcals/FIRST_SUM-Seq.cpp +++ b/src/lcals/FIRST_SUM-Seq.cpp @@ -18,7 +18,7 @@ namespace lcals { -void FIRST_SUM::runSeqVariant(VariantID vid) +void FIRST_SUM::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; diff --git a/src/lcals/FIRST_SUM.cpp b/src/lcals/FIRST_SUM.cpp index 46b2817f6..a028c1c0f 100644 --- a/src/lcals/FIRST_SUM.cpp +++ b/src/lcals/FIRST_SUM.cpp @@ -21,10 +21,6 @@ namespace lcals FIRST_SUM::FIRST_SUM(const RunParams& params) : KernelBase(rajaperf::Lcals_FIRST_SUM, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - setDefaultProblemSize(1000000); setDefaultReps(2000); @@ -62,18 +58,18 @@ FIRST_SUM::~FIRST_SUM() { } -void FIRST_SUM::setUp(VariantID vid) +void FIRST_SUM::setUp(VariantID vid, size_t /*tid*/) { allocAndInitDataConst(m_x, m_N, 0.0, vid); allocAndInitData(m_y, m_N, vid); } -void FIRST_SUM::updateChecksum(VariantID vid) +void FIRST_SUM::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_x, getActualProblemSize()); } -void FIRST_SUM::tearDown(VariantID vid) +void FIRST_SUM::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_x); diff --git a/src/lcals/FIRST_SUM.hpp b/src/lcals/FIRST_SUM.hpp index da618e10d..9dcd7ea3b 100644 --- a/src/lcals/FIRST_SUM.hpp +++ b/src/lcals/FIRST_SUM.hpp @@ -46,17 +46,18 @@ class FIRST_SUM : public KernelBase ~FIRST_SUM(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/lcals/GEN_LIN_RECUR-Cuda.cpp b/src/lcals/GEN_LIN_RECUR-Cuda.cpp index 3477e90f0..31e30a042 100644 --- a/src/lcals/GEN_LIN_RECUR-Cuda.cpp +++ b/src/lcals/GEN_LIN_RECUR-Cuda.cpp @@ -119,7 +119,7 @@ void GEN_LIN_RECUR::runCudaVariantImpl(VariantID vid) } } -void GEN_LIN_RECUR::runCudaVariant(VariantID vid) +void GEN_LIN_RECUR::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/lcals/GEN_LIN_RECUR-Hip.cpp b/src/lcals/GEN_LIN_RECUR-Hip.cpp index cec537075..6d6a2843c 100644 --- a/src/lcals/GEN_LIN_RECUR-Hip.cpp +++ b/src/lcals/GEN_LIN_RECUR-Hip.cpp @@ -121,7 +121,7 @@ void GEN_LIN_RECUR::runHipVariantImpl(VariantID vid) } } -void 
GEN_LIN_RECUR::runHipVariant(VariantID vid) +void GEN_LIN_RECUR::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/lcals/GEN_LIN_RECUR-OMP.cpp b/src/lcals/GEN_LIN_RECUR-OMP.cpp index 087fc82f3..3f39d2324 100644 --- a/src/lcals/GEN_LIN_RECUR-OMP.cpp +++ b/src/lcals/GEN_LIN_RECUR-OMP.cpp @@ -18,7 +18,7 @@ namespace lcals { -void GEN_LIN_RECUR::runOpenMPVariant(VariantID vid) +void GEN_LIN_RECUR::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/lcals/GEN_LIN_RECUR-OMPTarget.cpp b/src/lcals/GEN_LIN_RECUR-OMPTarget.cpp index 7cb0fd72b..94c23f627 100644 --- a/src/lcals/GEN_LIN_RECUR-OMPTarget.cpp +++ b/src/lcals/GEN_LIN_RECUR-OMPTarget.cpp @@ -43,7 +43,7 @@ namespace lcals deallocOpenMPDeviceData(sb, did); -void GEN_LIN_RECUR::runOpenMPTargetVariant(VariantID vid) +void GEN_LIN_RECUR::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); diff --git a/src/lcals/GEN_LIN_RECUR-Seq.cpp b/src/lcals/GEN_LIN_RECUR-Seq.cpp index f7ed58734..a3f636f50 100644 --- a/src/lcals/GEN_LIN_RECUR-Seq.cpp +++ b/src/lcals/GEN_LIN_RECUR-Seq.cpp @@ -18,7 +18,7 @@ namespace lcals { -void GEN_LIN_RECUR::runSeqVariant(VariantID vid) +void GEN_LIN_RECUR::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); diff --git a/src/lcals/GEN_LIN_RECUR.cpp b/src/lcals/GEN_LIN_RECUR.cpp index 9ef868efb..1cae28e86 100644 --- a/src/lcals/GEN_LIN_RECUR.cpp +++ b/src/lcals/GEN_LIN_RECUR.cpp @@ -21,10 +21,6 @@ namespace lcals GEN_LIN_RECUR::GEN_LIN_RECUR(const RunParams& params) : KernelBase(rajaperf::Lcals_GEN_LIN_RECUR, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - setDefaultProblemSize(1000000); setDefaultReps(500); @@ -67,7 +63,7 @@ GEN_LIN_RECUR::~GEN_LIN_RECUR() { } -void GEN_LIN_RECUR::setUp(VariantID vid) +void GEN_LIN_RECUR::setUp(VariantID vid, size_t /*tid*/) { m_kb5i = 0; @@ -77,12 +73,12 @@ void GEN_LIN_RECUR::setUp(VariantID vid) allocAndInitData(m_sb, m_N, vid); } -void GEN_LIN_RECUR::updateChecksum(VariantID vid) +void GEN_LIN_RECUR::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_b5, getActualProblemSize(), checksum_scale_factor ); } -void GEN_LIN_RECUR::tearDown(VariantID vid) +void GEN_LIN_RECUR::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_b5); diff --git a/src/lcals/GEN_LIN_RECUR.hpp b/src/lcals/GEN_LIN_RECUR.hpp index 6cf271159..ec82c2520 100644 --- a/src/lcals/GEN_LIN_RECUR.hpp +++ b/src/lcals/GEN_LIN_RECUR.hpp @@ -67,17 +67,18 @@ class GEN_LIN_RECUR : public KernelBase ~GEN_LIN_RECUR(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/lcals/HYDRO_1D-Cuda.cpp b/src/lcals/HYDRO_1D-Cuda.cpp index d4c8db33a..ea2874d56 100644 --- a/src/lcals/HYDRO_1D-Cuda.cpp +++ b/src/lcals/HYDRO_1D-Cuda.cpp @@ -94,7 +94,7 @@ void HYDRO_1D::runCudaVariantImpl(VariantID vid) } } -void HYDRO_1D::runCudaVariant(VariantID vid) +void HYDRO_1D::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/lcals/HYDRO_1D-Hip.cpp b/src/lcals/HYDRO_1D-Hip.cpp index 7582548be..4947f3a18 100644 --- a/src/lcals/HYDRO_1D-Hip.cpp +++ b/src/lcals/HYDRO_1D-Hip.cpp @@ -94,7 +94,7 @@ void HYDRO_1D::runHipVariantImpl(VariantID vid) } } -void HYDRO_1D::runHipVariant(VariantID vid) +void HYDRO_1D::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/lcals/HYDRO_1D-OMP.cpp b/src/lcals/HYDRO_1D-OMP.cpp index 94390485f..0a1b5d1c1 100644 --- a/src/lcals/HYDRO_1D-OMP.cpp +++ b/src/lcals/HYDRO_1D-OMP.cpp @@ -18,7 +18,7 @@ namespace lcals { -void HYDRO_1D::runOpenMPVariant(VariantID vid) +void HYDRO_1D::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/lcals/HYDRO_1D-OMPTarget.cpp b/src/lcals/HYDRO_1D-OMPTarget.cpp index c313951b5..f08fb7f53 100644 --- a/src/lcals/HYDRO_1D-OMPTarget.cpp +++ b/src/lcals/HYDRO_1D-OMPTarget.cpp @@ -41,7 +41,7 @@ namespace lcals deallocOpenMPDeviceData(z, did); \ -void HYDRO_1D::runOpenMPTargetVariant(VariantID vid) +void HYDRO_1D::runOpenMPTargetVariant(VariantID vid, 
size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/HYDRO_1D-Seq.cpp b/src/lcals/HYDRO_1D-Seq.cpp index a257b08bb..83ffa84f5 100644 --- a/src/lcals/HYDRO_1D-Seq.cpp +++ b/src/lcals/HYDRO_1D-Seq.cpp @@ -18,7 +18,7 @@ namespace lcals { -void HYDRO_1D::runSeqVariant(VariantID vid) +void HYDRO_1D::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/HYDRO_1D.cpp b/src/lcals/HYDRO_1D.cpp index 6b757ab57..137f70fad 100644 --- a/src/lcals/HYDRO_1D.cpp +++ b/src/lcals/HYDRO_1D.cpp @@ -21,10 +21,6 @@ namespace lcals HYDRO_1D::HYDRO_1D(const RunParams& params) : KernelBase(rajaperf::Lcals_HYDRO_1D, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - setDefaultProblemSize(1000000); setDefaultReps(1000); @@ -66,7 +62,7 @@ HYDRO_1D::~HYDRO_1D() { } -void HYDRO_1D::setUp(VariantID vid) +void HYDRO_1D::setUp(VariantID vid, size_t /*tid*/) { allocAndInitDataConst(m_x, m_array_length, 0.0, vid); allocAndInitData(m_y, m_array_length, vid); @@ -77,12 +73,12 @@ void HYDRO_1D::setUp(VariantID vid) initData(m_t, vid); } -void HYDRO_1D::updateChecksum(VariantID vid) +void HYDRO_1D::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_x, getActualProblemSize(), checksum_scale_factor ); } -void HYDRO_1D::tearDown(VariantID vid) +void HYDRO_1D::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_x); diff --git a/src/lcals/HYDRO_1D.hpp b/src/lcals/HYDRO_1D.hpp index 27df3ecbe..074211f17 100644 --- a/src/lcals/HYDRO_1D.hpp +++ b/src/lcals/HYDRO_1D.hpp @@ -48,17 +48,18 @@ class HYDRO_1D : public KernelBase ~HYDRO_1D(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/lcals/HYDRO_2D-Cuda.cpp b/src/lcals/HYDRO_2D-Cuda.cpp index 237534ceb..a4c39eb77 100644 --- a/src/lcals/HYDRO_2D-Cuda.cpp +++ b/src/lcals/HYDRO_2D-Cuda.cpp @@ -221,7 +221,7 @@ void HYDRO_2D::runCudaVariantImpl(VariantID vid) } } -void HYDRO_2D::runCudaVariant(VariantID vid) +void HYDRO_2D::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/lcals/HYDRO_2D-Hip.cpp b/src/lcals/HYDRO_2D-Hip.cpp index 8b96804e7..b7e8589f7 100644 --- a/src/lcals/HYDRO_2D-Hip.cpp +++ b/src/lcals/HYDRO_2D-Hip.cpp @@ -223,7 +223,7 @@ void HYDRO_2D::runHipVariantImpl(VariantID vid) } } -void 
HYDRO_2D::runHipVariant(VariantID vid) +void HYDRO_2D::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/lcals/HYDRO_2D-OMP.cpp b/src/lcals/HYDRO_2D-OMP.cpp index d22502523..107722e20 100644 --- a/src/lcals/HYDRO_2D-OMP.cpp +++ b/src/lcals/HYDRO_2D-OMP.cpp @@ -18,7 +18,7 @@ namespace lcals { -void HYDRO_2D::runOpenMPVariant(VariantID vid) +void HYDRO_2D::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/lcals/HYDRO_2D-OMPTarget.cpp b/src/lcals/HYDRO_2D-OMPTarget.cpp index ccac11396..cf376e8f0 100644 --- a/src/lcals/HYDRO_2D-OMPTarget.cpp +++ b/src/lcals/HYDRO_2D-OMPTarget.cpp @@ -54,7 +54,7 @@ namespace lcals -void HYDRO_2D::runOpenMPTargetVariant(VariantID vid) +void HYDRO_2D::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type kbeg = 1; diff --git a/src/lcals/HYDRO_2D-Seq.cpp b/src/lcals/HYDRO_2D-Seq.cpp index 023819bec..5229bbc4c 100644 --- a/src/lcals/HYDRO_2D-Seq.cpp +++ b/src/lcals/HYDRO_2D-Seq.cpp @@ -18,7 +18,7 @@ namespace lcals { -void HYDRO_2D::runSeqVariant(VariantID vid) +void HYDRO_2D::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type kbeg = 1; diff --git a/src/lcals/HYDRO_2D.cpp b/src/lcals/HYDRO_2D.cpp index 8f3ccc37a..8e4f0c8c3 100644 --- a/src/lcals/HYDRO_2D.cpp +++ b/src/lcals/HYDRO_2D.cpp @@ -24,10 +24,6 @@ namespace lcals HYDRO_2D::HYDRO_2D(const RunParams& params) : KernelBase(rajaperf::Lcals_HYDRO_2D, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - m_jn = 1000; m_kn = 1000; @@ -81,7 +77,7 @@ HYDRO_2D::~HYDRO_2D() { } -void HYDRO_2D::setUp(VariantID vid) +void HYDRO_2D::setUp(VariantID vid, size_t /*tid*/) { allocAndInitDataConst(m_zrout, m_array_length, 0.0, vid); allocAndInitDataConst(m_zzout, m_array_length, 0.0, vid); @@ -96,13 +92,13 @@ void HYDRO_2D::setUp(VariantID vid) allocAndInitData(m_zz, m_array_length, vid); } -void HYDRO_2D::updateChecksum(VariantID vid) +void HYDRO_2D::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_zzout, m_array_length, checksum_scale_factor ); checksum[vid] += calcChecksum(m_zrout, m_array_length, checksum_scale_factor ); } -void HYDRO_2D::tearDown(VariantID vid) +void HYDRO_2D::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_zrout); diff --git a/src/lcals/HYDRO_2D.hpp b/src/lcals/HYDRO_2D.hpp index a5f43f7f2..1f59270d1 100644 --- a/src/lcals/HYDRO_2D.hpp +++ b/src/lcals/HYDRO_2D.hpp @@ -144,17 +144,18 @@ class HYDRO_2D : public KernelBase ~HYDRO_2D(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/lcals/INT_PREDICT-Cuda.cpp b/src/lcals/INT_PREDICT-Cuda.cpp index 1da0001e6..b9b00f969 100644 --- a/src/lcals/INT_PREDICT-Cuda.cpp +++ b/src/lcals/INT_PREDICT-Cuda.cpp @@ -95,7 +95,7 @@ void INT_PREDICT::runCudaVariantImpl(VariantID vid) } } -void INT_PREDICT::runCudaVariant(VariantID vid) +void INT_PREDICT::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/lcals/INT_PREDICT-Hip.cpp b/src/lcals/INT_PREDICT-Hip.cpp index 2d886d698..7e194659e 100644 --- a/src/lcals/INT_PREDICT-Hip.cpp +++ b/src/lcals/INT_PREDICT-Hip.cpp @@ -95,7 +95,7 @@ void INT_PREDICT::runHipVariantImpl(VariantID vid) } } -void INT_PREDICT::runHipVariant(VariantID vid) +void INT_PREDICT::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/lcals/INT_PREDICT-OMP.cpp b/src/lcals/INT_PREDICT-OMP.cpp index 486d9178e..801aef158 100644 --- a/src/lcals/INT_PREDICT-OMP.cpp +++ b/src/lcals/INT_PREDICT-OMP.cpp @@ -18,7 +18,7 @@ namespace lcals { -void INT_PREDICT::runOpenMPVariant(VariantID vid) +void INT_PREDICT::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/lcals/INT_PREDICT-OMPTarget.cpp b/src/lcals/INT_PREDICT-OMPTarget.cpp index 4fcc54307..4d4950a74 100644 --- a/src/lcals/INT_PREDICT-OMPTarget.cpp +++ b/src/lcals/INT_PREDICT-OMPTarget.cpp @@ -37,7 
+37,7 @@ namespace lcals deallocOpenMPDeviceData(px, did); -void INT_PREDICT::runOpenMPTargetVariant(VariantID vid) +void INT_PREDICT::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/INT_PREDICT-Seq.cpp b/src/lcals/INT_PREDICT-Seq.cpp index 61032c6db..56fab4fd1 100644 --- a/src/lcals/INT_PREDICT-Seq.cpp +++ b/src/lcals/INT_PREDICT-Seq.cpp @@ -18,7 +18,7 @@ namespace lcals { -void INT_PREDICT::runSeqVariant(VariantID vid) +void INT_PREDICT::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/INT_PREDICT.cpp b/src/lcals/INT_PREDICT.cpp index d828a6cf3..a8aa25e1f 100644 --- a/src/lcals/INT_PREDICT.cpp +++ b/src/lcals/INT_PREDICT.cpp @@ -21,10 +21,6 @@ namespace lcals INT_PREDICT::INT_PREDICT(const RunParams& params) : KernelBase(rajaperf::Lcals_INT_PREDICT, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - setDefaultProblemSize(1000000); setDefaultReps(400); @@ -59,7 +55,7 @@ INT_PREDICT::~INT_PREDICT() { } -void INT_PREDICT::setUp(VariantID vid) +void INT_PREDICT::setUp(VariantID vid, size_t /*tid*/) { m_array_length = getActualProblemSize() * 13; m_offset = getActualProblemSize(); @@ -77,7 +73,7 @@ void INT_PREDICT::setUp(VariantID vid) initData(m_c0); } -void INT_PREDICT::updateChecksum(VariantID vid) +void INT_PREDICT::updateChecksum(VariantID vid, size_t tid) { for (Index_type i = 0; i < getActualProblemSize(); ++i) { m_px[i] -= m_px_initval; @@ -86,7 +82,7 @@ void INT_PREDICT::updateChecksum(VariantID vid) checksum[vid] += calcChecksum(m_px, getActualProblemSize()); } -void INT_PREDICT::tearDown(VariantID vid) +void INT_PREDICT::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_px); diff --git a/src/lcals/INT_PREDICT.hpp b/src/lcals/INT_PREDICT.hpp index eab03a090..5fd69d2ed 100644 --- a/src/lcals/INT_PREDICT.hpp +++ b/src/lcals/INT_PREDICT.hpp @@ -63,17 +63,18 @@ class INT_PREDICT : public KernelBase ~INT_PREDICT(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/lcals/PLANCKIAN-Cuda.cpp b/src/lcals/PLANCKIAN-Cuda.cpp index 6c297e843..bbafd6b91 100644 --- a/src/lcals/PLANCKIAN-Cuda.cpp +++ b/src/lcals/PLANCKIAN-Cuda.cpp @@ -99,7 +99,7 @@ void PLANCKIAN::runCudaVariantImpl(VariantID vid) } } -void PLANCKIAN::runCudaVariant(VariantID vid) +void PLANCKIAN::runCudaVariant(VariantID vid, size_t tid) { if ( 
!gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/lcals/PLANCKIAN-Hip.cpp b/src/lcals/PLANCKIAN-Hip.cpp index adfe275c7..b4369610f 100644 --- a/src/lcals/PLANCKIAN-Hip.cpp +++ b/src/lcals/PLANCKIAN-Hip.cpp @@ -99,7 +99,7 @@ void PLANCKIAN::runHipVariantImpl(VariantID vid) } } -void PLANCKIAN::runHipVariant(VariantID vid) +void PLANCKIAN::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/lcals/PLANCKIAN-OMP.cpp b/src/lcals/PLANCKIAN-OMP.cpp index 8a890654e..c5af075fd 100644 --- a/src/lcals/PLANCKIAN-OMP.cpp +++ b/src/lcals/PLANCKIAN-OMP.cpp @@ -19,7 +19,7 @@ namespace lcals { -void PLANCKIAN::runOpenMPVariant(VariantID vid) +void PLANCKIAN::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/lcals/PLANCKIAN-OMPTarget.cpp b/src/lcals/PLANCKIAN-OMPTarget.cpp index f6471f7b6..56254d8cf 100644 --- a/src/lcals/PLANCKIAN-OMPTarget.cpp +++ b/src/lcals/PLANCKIAN-OMPTarget.cpp @@ -46,7 +46,7 @@ namespace lcals deallocOpenMPDeviceData(w, did); -void PLANCKIAN::runOpenMPTargetVariant(VariantID vid) +void PLANCKIAN::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/PLANCKIAN-Seq.cpp b/src/lcals/PLANCKIAN-Seq.cpp index 88bcc04e0..e853c3d45 100644 --- a/src/lcals/PLANCKIAN-Seq.cpp +++ b/src/lcals/PLANCKIAN-Seq.cpp @@ -19,7 +19,7 @@ namespace lcals { -void PLANCKIAN::runSeqVariant(VariantID vid) +void PLANCKIAN::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/PLANCKIAN.cpp b/src/lcals/PLANCKIAN.cpp index 84171c2b4..a41ff07bf 100644 --- a/src/lcals/PLANCKIAN.cpp +++ b/src/lcals/PLANCKIAN.cpp @@ -21,10 +21,6 @@ namespace lcals PLANCKIAN::PLANCKIAN(const RunParams& params) : KernelBase(rajaperf::Lcals_PLANCKIAN, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - setDefaultProblemSize(1000000); setDefaultReps(50); @@ -59,7 +55,7 @@ PLANCKIAN::~PLANCKIAN() { } -void PLANCKIAN::setUp(VariantID vid) +void PLANCKIAN::setUp(VariantID vid, size_t /*tid*/) { allocAndInitData(m_x, getActualProblemSize(), vid); allocAndInitData(m_y, getActualProblemSize(), vid); @@ -68,12 +64,12 @@ void PLANCKIAN::setUp(VariantID vid) allocAndInitDataConst(m_w, getActualProblemSize(), 0.0, vid); } -void PLANCKIAN::updateChecksum(VariantID vid) +void PLANCKIAN::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_w, getActualProblemSize()); } -void PLANCKIAN::tearDown(VariantID vid) +void PLANCKIAN::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_x); diff --git a/src/lcals/PLANCKIAN.hpp b/src/lcals/PLANCKIAN.hpp index 3a7493a06..5220904f3 100644 --- a/src/lcals/PLANCKIAN.hpp +++ b/src/lcals/PLANCKIAN.hpp @@ -48,17 +48,18 @@ class PLANCKIAN : public KernelBase ~PLANCKIAN(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/lcals/TRIDIAG_ELIM-Cuda.cpp b/src/lcals/TRIDIAG_ELIM-Cuda.cpp index e3fc534b8..ed7000d3f 100644 --- a/src/lcals/TRIDIAG_ELIM-Cuda.cpp +++ b/src/lcals/TRIDIAG_ELIM-Cuda.cpp @@ -95,7 +95,7 @@ void TRIDIAG_ELIM::runCudaVariantImpl(VariantID vid) } } -void TRIDIAG_ELIM::runCudaVariant(VariantID vid) +void TRIDIAG_ELIM::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/lcals/TRIDIAG_ELIM-Hip.cpp b/src/lcals/TRIDIAG_ELIM-Hip.cpp index 4003ead5d..caeb51d8b 100644 --- a/src/lcals/TRIDIAG_ELIM-Hip.cpp +++ b/src/lcals/TRIDIAG_ELIM-Hip.cpp @@ -94,7 +94,7 @@ void TRIDIAG_ELIM::runHipVariantImpl(VariantID vid) } } -void TRIDIAG_ELIM::runHipVariant(VariantID vid) +void TRIDIAG_ELIM::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/lcals/TRIDIAG_ELIM-OMP.cpp b/src/lcals/TRIDIAG_ELIM-OMP.cpp index 0bd108fb1..1b8d8d8f5 100644 --- a/src/lcals/TRIDIAG_ELIM-OMP.cpp +++ b/src/lcals/TRIDIAG_ELIM-OMP.cpp @@ -18,7 +18,7 @@ namespace lcals { -void TRIDIAG_ELIM::runOpenMPVariant(VariantID vid) +void TRIDIAG_ELIM::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp b/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp index 39cb585d5..76949c4c4 100644 --- a/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp +++ b/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp @@ -43,7 +43,7 @@ namespace lcals 
deallocOpenMPDeviceData(z, did); -void TRIDIAG_ELIM::runOpenMPTargetVariant(VariantID vid) +void TRIDIAG_ELIM::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; diff --git a/src/lcals/TRIDIAG_ELIM-Seq.cpp b/src/lcals/TRIDIAG_ELIM-Seq.cpp index b3bf160ab..ac0a42157 100644 --- a/src/lcals/TRIDIAG_ELIM-Seq.cpp +++ b/src/lcals/TRIDIAG_ELIM-Seq.cpp @@ -18,7 +18,7 @@ namespace lcals { -void TRIDIAG_ELIM::runSeqVariant(VariantID vid) +void TRIDIAG_ELIM::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; diff --git a/src/lcals/TRIDIAG_ELIM.cpp b/src/lcals/TRIDIAG_ELIM.cpp index 9b146904e..71def13b2 100644 --- a/src/lcals/TRIDIAG_ELIM.cpp +++ b/src/lcals/TRIDIAG_ELIM.cpp @@ -21,10 +21,6 @@ namespace lcals TRIDIAG_ELIM::TRIDIAG_ELIM(const RunParams& params) : KernelBase(rajaperf::Lcals_TRIDIAG_ELIM, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - setDefaultProblemSize(1000000); setDefaultReps(1000); @@ -61,7 +57,7 @@ TRIDIAG_ELIM::~TRIDIAG_ELIM() { } -void TRIDIAG_ELIM::setUp(VariantID vid) +void TRIDIAG_ELIM::setUp(VariantID vid, size_t /*tid*/) { allocAndInitDataConst(m_xout, m_N, 0.0, vid); allocAndInitData(m_xin, m_N, vid); @@ -69,12 +65,12 @@ void TRIDIAG_ELIM::setUp(VariantID vid) allocAndInitData(m_z, m_N, vid); } -void TRIDIAG_ELIM::updateChecksum(VariantID vid) +void TRIDIAG_ELIM::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_xout, getActualProblemSize()); } -void TRIDIAG_ELIM::tearDown(VariantID vid) +void TRIDIAG_ELIM::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_xout); diff --git a/src/lcals/TRIDIAG_ELIM.hpp b/src/lcals/TRIDIAG_ELIM.hpp index 24a6524f7..63214fbb4 100644 --- a/src/lcals/TRIDIAG_ELIM.hpp +++ b/src/lcals/TRIDIAG_ELIM.hpp @@ -48,17 +48,18 @@ class TRIDIAG_ELIM : public KernelBase ~TRIDIAG_ELIM(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/polybench/POLYBENCH_2MM-Cuda.cpp b/src/polybench/POLYBENCH_2MM-Cuda.cpp index db3a497b3..000292232 100644 --- a/src/polybench/POLYBENCH_2MM-Cuda.cpp +++ b/src/polybench/POLYBENCH_2MM-Cuda.cpp @@ -275,7 +275,7 @@ void POLYBENCH_2MM::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_2MM::runCudaVariant(VariantID vid) +void POLYBENCH_2MM::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), 
gpu_block_sizes_type()) ) { diff --git a/src/polybench/POLYBENCH_2MM-Hip.cpp b/src/polybench/POLYBENCH_2MM-Hip.cpp index 2e90c9bdd..eddc74251 100644 --- a/src/polybench/POLYBENCH_2MM-Hip.cpp +++ b/src/polybench/POLYBENCH_2MM-Hip.cpp @@ -278,7 +278,7 @@ void POLYBENCH_2MM::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_2MM::runHipVariant(VariantID vid) +void POLYBENCH_2MM::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/polybench/POLYBENCH_2MM-OMP.cpp b/src/polybench/POLYBENCH_2MM-OMP.cpp index a7778840a..33da0c025 100644 --- a/src/polybench/POLYBENCH_2MM-OMP.cpp +++ b/src/polybench/POLYBENCH_2MM-OMP.cpp @@ -26,7 +26,7 @@ namespace polybench { -void POLYBENCH_2MM::runOpenMPVariant(VariantID vid) +void POLYBENCH_2MM::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_2MM-OMPTarget.cpp b/src/polybench/POLYBENCH_2MM-OMPTarget.cpp index ce689e767..316698938 100644 --- a/src/polybench/POLYBENCH_2MM-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_2MM-OMPTarget.cpp @@ -41,7 +41,7 @@ namespace polybench deallocOpenMPDeviceData(D, did); -void POLYBENCH_2MM::runOpenMPTargetVariant(VariantID vid) +void POLYBENCH_2MM::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_2MM-Seq.cpp b/src/polybench/POLYBENCH_2MM-Seq.cpp index 4eae8f13c..25ddae2c1 100644 --- a/src/polybench/POLYBENCH_2MM-Seq.cpp +++ b/src/polybench/POLYBENCH_2MM-Seq.cpp @@ -18,7 +18,7 @@ namespace rajaperf namespace polybench { -void POLYBENCH_2MM::runSeqVariant(VariantID vid) +void POLYBENCH_2MM::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps= getRunReps(); diff --git a/src/polybench/POLYBENCH_2MM.cpp b/src/polybench/POLYBENCH_2MM.cpp index 12659b42a..59229fe80 100644 --- a/src/polybench/POLYBENCH_2MM.cpp +++ b/src/polybench/POLYBENCH_2MM.cpp @@ -22,10 +22,6 @@ namespace polybench POLYBENCH_2MM::POLYBENCH_2MM(const RunParams& params) : KernelBase(rajaperf::Polybench_2MM, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - Index_type ni_default = 1000; Index_type nj_default = 1000; Index_type nk_default = 1120; @@ -88,7 +84,7 @@ POLYBENCH_2MM::~POLYBENCH_2MM() { } -void POLYBENCH_2MM::setUp(VariantID vid) +void POLYBENCH_2MM::setUp(VariantID vid, size_t /*tid*/) { (void) vid; allocAndInitData(m_tmp, m_ni * m_nj, vid); @@ -98,12 +94,12 @@ void POLYBENCH_2MM::setUp(VariantID vid) allocAndInitDataConst(m_D, m_ni * m_nl, 0.0, vid); } -void POLYBENCH_2MM::updateChecksum(VariantID vid) +void POLYBENCH_2MM::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_D, m_ni * m_nl, checksum_scale_factor ); } -void POLYBENCH_2MM::tearDown(VariantID vid) +void POLYBENCH_2MM::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_tmp); diff --git a/src/polybench/POLYBENCH_2MM.hpp b/src/polybench/POLYBENCH_2MM.hpp index 6b2ae7c94..be3637824 100644 --- a/src/polybench/POLYBENCH_2MM.hpp +++ b/src/polybench/POLYBENCH_2MM.hpp @@ -118,17 +118,18 @@ class POLYBENCH_2MM : public KernelBase ~POLYBENCH_2MM(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/polybench/POLYBENCH_3MM-Cuda.cpp b/src/polybench/POLYBENCH_3MM-Cuda.cpp index 7eece54c2..24fc49484 100644 --- a/src/polybench/POLYBENCH_3MM-Cuda.cpp +++ b/src/polybench/POLYBENCH_3MM-Cuda.cpp @@ -353,7 +353,7 @@ void POLYBENCH_3MM::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_3MM::runCudaVariant(VariantID vid) +void POLYBENCH_3MM::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/polybench/POLYBENCH_3MM-Hip.cpp b/src/polybench/POLYBENCH_3MM-Hip.cpp index fac0b5fe1..e8b6884aa 100644 --- a/src/polybench/POLYBENCH_3MM-Hip.cpp +++ b/src/polybench/POLYBENCH_3MM-Hip.cpp @@ -358,7 +358,7 @@ void POLYBENCH_3MM::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_3MM::runHipVariant(VariantID vid) +void POLYBENCH_3MM::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/polybench/POLYBENCH_3MM-OMP.cpp b/src/polybench/POLYBENCH_3MM-OMP.cpp index 7fe78a498..c3aaae9c7 100644 --- a/src/polybench/POLYBENCH_3MM-OMP.cpp +++ b/src/polybench/POLYBENCH_3MM-OMP.cpp @@ -27,7 +27,7 @@ namespace polybench { -void POLYBENCH_3MM::runOpenMPVariant(VariantID vid) +void POLYBENCH_3MM::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_3MM-OMPTarget.cpp b/src/polybench/POLYBENCH_3MM-OMPTarget.cpp index 
f7380fabd..ccfc41e86 100644 --- a/src/polybench/POLYBENCH_3MM-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_3MM-OMPTarget.cpp @@ -44,7 +44,7 @@ namespace polybench deallocOpenMPDeviceData(F, did); \ deallocOpenMPDeviceData(G, did); -void POLYBENCH_3MM::runOpenMPTargetVariant(VariantID vid) +void POLYBENCH_3MM::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_3MM-Seq.cpp b/src/polybench/POLYBENCH_3MM-Seq.cpp index 0659026cc..d93b30744 100644 --- a/src/polybench/POLYBENCH_3MM-Seq.cpp +++ b/src/polybench/POLYBENCH_3MM-Seq.cpp @@ -20,7 +20,7 @@ namespace polybench { -void POLYBENCH_3MM::runSeqVariant(VariantID vid) +void POLYBENCH_3MM::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_3MM.cpp b/src/polybench/POLYBENCH_3MM.cpp index b3e12c4a3..d7de76e38 100644 --- a/src/polybench/POLYBENCH_3MM.cpp +++ b/src/polybench/POLYBENCH_3MM.cpp @@ -23,10 +23,6 @@ namespace polybench POLYBENCH_3MM::POLYBENCH_3MM(const RunParams& params) : KernelBase(rajaperf::Polybench_3MM, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - Index_type ni_default = 1000; Index_type nj_default = 1000; Index_type nk_default = 1010; @@ -96,7 +92,7 @@ POLYBENCH_3MM::~POLYBENCH_3MM() { } -void POLYBENCH_3MM::setUp(VariantID vid) +void POLYBENCH_3MM::setUp(VariantID vid, size_t /*tid*/) { (void) vid; allocAndInitData(m_A, m_ni * m_nk, vid); @@ -108,12 +104,12 @@ void POLYBENCH_3MM::setUp(VariantID vid) allocAndInitDataConst(m_G, m_ni * m_nl, 0.0, vid); } -void POLYBENCH_3MM::updateChecksum(VariantID vid) +void POLYBENCH_3MM::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_G, m_ni * m_nl, checksum_scale_factor ); } -void POLYBENCH_3MM::tearDown(VariantID vid) +void POLYBENCH_3MM::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_A); diff --git a/src/polybench/POLYBENCH_3MM.hpp b/src/polybench/POLYBENCH_3MM.hpp index 393c24225..0e8030115 100644 --- a/src/polybench/POLYBENCH_3MM.hpp +++ b/src/polybench/POLYBENCH_3MM.hpp @@ -144,17 +144,18 @@ class POLYBENCH_3MM : public KernelBase ~POLYBENCH_3MM(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/polybench/POLYBENCH_ADI-Cuda.cpp b/src/polybench/POLYBENCH_ADI-Cuda.cpp index 392a77c28..3a76e6429 100644 --- a/src/polybench/POLYBENCH_ADI-Cuda.cpp +++ b/src/polybench/POLYBENCH_ADI-Cuda.cpp @@ -248,7 +248,7 @@ 
void POLYBENCH_ADI::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_ADI::runCudaVariant(VariantID vid) +void POLYBENCH_ADI::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/polybench/POLYBENCH_ADI-Hip.cpp b/src/polybench/POLYBENCH_ADI-Hip.cpp index 6d7d1ed1d..d656dc93c 100644 --- a/src/polybench/POLYBENCH_ADI-Hip.cpp +++ b/src/polybench/POLYBENCH_ADI-Hip.cpp @@ -257,7 +257,7 @@ void POLYBENCH_ADI::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_ADI::runHipVariant(VariantID vid) +void POLYBENCH_ADI::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/polybench/POLYBENCH_ADI-OMP.cpp b/src/polybench/POLYBENCH_ADI-OMP.cpp index 42edab15b..ee80cadb2 100644 --- a/src/polybench/POLYBENCH_ADI-OMP.cpp +++ b/src/polybench/POLYBENCH_ADI-OMP.cpp @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_ADI::runOpenMPVariant(VariantID vid) +void POLYBENCH_ADI::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_ADI-OMPTarget.cpp b/src/polybench/POLYBENCH_ADI-OMPTarget.cpp index 0d04bb597..b09578d8d 100644 --- a/src/polybench/POLYBENCH_ADI-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_ADI-OMPTarget.cpp @@ -43,7 +43,7 @@ namespace polybench deallocOpenMPDeviceData(Q, did); -void POLYBENCH_ADI::runOpenMPTargetVariant(VariantID vid) +void POLYBENCH_ADI::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_ADI-Seq.cpp b/src/polybench/POLYBENCH_ADI-Seq.cpp index b4f8e82c3..e9c3bc5d9 100644 --- a/src/polybench/POLYBENCH_ADI-Seq.cpp +++ b/src/polybench/POLYBENCH_ADI-Seq.cpp @@ -18,7 +18,7 @@ namespace rajaperf namespace polybench { -void POLYBENCH_ADI::runSeqVariant(VariantID vid) +void POLYBENCH_ADI::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_ADI.cpp b/src/polybench/POLYBENCH_ADI.cpp index 27a82550a..28bf02f83 100644 --- a/src/polybench/POLYBENCH_ADI.cpp +++ b/src/polybench/POLYBENCH_ADI.cpp @@ -20,10 +20,6 @@ namespace polybench POLYBENCH_ADI::POLYBENCH_ADI(const RunParams& params) : KernelBase(rajaperf::Polybench_ADI, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - Index_type n_default = 1000; setDefaultProblemSize( (n_default-2) * (n_default-2) ); @@ -73,7 +69,7 @@ POLYBENCH_ADI::~POLYBENCH_ADI() { } -void POLYBENCH_ADI::setUp(VariantID vid) +void POLYBENCH_ADI::setUp(VariantID vid, size_t /*tid*/) { allocAndInitDataConst(m_U, m_n * m_n, 0.0, vid); allocAndInitData(m_V, m_n * m_n, vid); @@ -81,12 +77,12 @@ void POLYBENCH_ADI::setUp(VariantID vid) allocAndInitData(m_Q, m_n * m_n, vid); } -void POLYBENCH_ADI::updateChecksum(VariantID vid) +void POLYBENCH_ADI::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_U, m_n * m_n, checksum_scale_factor ); } -void POLYBENCH_ADI::tearDown(VariantID vid) +void POLYBENCH_ADI::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_U); diff --git a/src/polybench/POLYBENCH_ADI.hpp b/src/polybench/POLYBENCH_ADI.hpp index 4e75fd44a..ed676a7bd 100644 --- a/src/polybench/POLYBENCH_ADI.hpp +++ b/src/polybench/POLYBENCH_ADI.hpp @@ -186,17 +186,18 @@ class POLYBENCH_ADI : public KernelBase ~POLYBENCH_ADI(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/polybench/POLYBENCH_ATAX-Cuda.cpp b/src/polybench/POLYBENCH_ATAX-Cuda.cpp index c94e70484..7ae84e146 100644 --- a/src/polybench/POLYBENCH_ATAX-Cuda.cpp +++ b/src/polybench/POLYBENCH_ATAX-Cuda.cpp @@ -230,7 +230,7 @@ void POLYBENCH_ATAX::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_ATAX::runCudaVariant(VariantID vid) +void POLYBENCH_ATAX::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/polybench/POLYBENCH_ATAX-Hip.cpp b/src/polybench/POLYBENCH_ATAX-Hip.cpp index 37204b2d6..05ff793e3 100644 --- a/src/polybench/POLYBENCH_ATAX-Hip.cpp +++ b/src/polybench/POLYBENCH_ATAX-Hip.cpp @@ -237,7 +237,7 @@ void POLYBENCH_ATAX::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_ATAX::runHipVariant(VariantID vid) +void POLYBENCH_ATAX::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/polybench/POLYBENCH_ATAX-OMP.cpp b/src/polybench/POLYBENCH_ATAX-OMP.cpp index cb78dd1cc..f0ceff624 100644 --- a/src/polybench/POLYBENCH_ATAX-OMP.cpp +++ b/src/polybench/POLYBENCH_ATAX-OMP.cpp @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_ATAX::runOpenMPVariant(VariantID vid) +void POLYBENCH_ATAX::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp 
b/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp index 7f9b96a75..859d27699 100644 --- a/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp @@ -44,7 +44,7 @@ namespace polybench deallocOpenMPDeviceData(A, did); -void POLYBENCH_ATAX::runOpenMPTargetVariant(VariantID vid) +void POLYBENCH_ATAX::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_ATAX-Seq.cpp b/src/polybench/POLYBENCH_ATAX-Seq.cpp index 5f6d018b6..852af0f58 100644 --- a/src/polybench/POLYBENCH_ATAX-Seq.cpp +++ b/src/polybench/POLYBENCH_ATAX-Seq.cpp @@ -18,7 +18,7 @@ namespace rajaperf namespace polybench { -void POLYBENCH_ATAX::runSeqVariant(VariantID vid) +void POLYBENCH_ATAX::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps= getRunReps(); diff --git a/src/polybench/POLYBENCH_ATAX.cpp b/src/polybench/POLYBENCH_ATAX.cpp index 525457d5e..c5a244b3f 100644 --- a/src/polybench/POLYBENCH_ATAX.cpp +++ b/src/polybench/POLYBENCH_ATAX.cpp @@ -21,10 +21,6 @@ namespace polybench POLYBENCH_ATAX::POLYBENCH_ATAX(const RunParams& params) : KernelBase(rajaperf::Polybench_ATAX, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - Index_type N_default = 1000; setDefaultProblemSize( N_default * N_default ); @@ -75,7 +71,7 @@ POLYBENCH_ATAX::~POLYBENCH_ATAX() { } -void POLYBENCH_ATAX::setUp(VariantID vid) +void POLYBENCH_ATAX::setUp(VariantID vid, size_t /*tid*/) { (void) vid; allocAndInitData(m_tmp, m_N, vid); @@ -84,12 +80,12 @@ void POLYBENCH_ATAX::setUp(VariantID vid) allocAndInitDataConst(m_y, m_N, 0.0, vid); } -void POLYBENCH_ATAX::updateChecksum(VariantID vid) +void POLYBENCH_ATAX::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_y, m_N, checksum_scale_factor ); } -void POLYBENCH_ATAX::tearDown(VariantID vid) +void POLYBENCH_ATAX::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_tmp); diff --git a/src/polybench/POLYBENCH_ATAX.hpp b/src/polybench/POLYBENCH_ATAX.hpp index 3470746c4..0448ba805 100644 --- a/src/polybench/POLYBENCH_ATAX.hpp +++ b/src/polybench/POLYBENCH_ATAX.hpp @@ -106,17 +106,18 @@ class POLYBENCH_ATAX : public KernelBase ~POLYBENCH_ATAX(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp b/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp index 13b0b9aec..e6e07f5c8 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp +++ 
b/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp @@ -320,7 +320,7 @@ void POLYBENCH_FDTD_2D::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_FDTD_2D::runCudaVariant(VariantID vid) +void POLYBENCH_FDTD_2D::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp b/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp index e95b44126..bee3ea28f 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp @@ -332,7 +332,7 @@ void POLYBENCH_FDTD_2D::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_FDTD_2D::runHipVariant(VariantID vid) +void POLYBENCH_FDTD_2D::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/polybench/POLYBENCH_FDTD_2D-OMP.cpp b/src/polybench/POLYBENCH_FDTD_2D-OMP.cpp index ebd027d6f..1590a8f3e 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-OMP.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-OMP.cpp @@ -18,7 +18,7 @@ namespace polybench { -void POLYBENCH_FDTD_2D::runOpenMPVariant(VariantID vid) +void POLYBENCH_FDTD_2D::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp b/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp index d260754bf..6ee53a136 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp @@ -43,7 +43,7 @@ namespace polybench deallocOpenMPDeviceData(fict, did); -void POLYBENCH_FDTD_2D::runOpenMPTargetVariant(VariantID vid) +void POLYBENCH_FDTD_2D::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_FDTD_2D-Seq.cpp b/src/polybench/POLYBENCH_FDTD_2D-Seq.cpp index 58461dd8a..3873309c7 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Seq.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Seq.cpp @@ -18,7 +18,7 @@ namespace polybench { -void POLYBENCH_FDTD_2D::runSeqVariant(VariantID vid) +void POLYBENCH_FDTD_2D::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_FDTD_2D.cpp b/src/polybench/POLYBENCH_FDTD_2D.cpp index 0b0c86747..7fb9161db 100644 --- a/src/polybench/POLYBENCH_FDTD_2D.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D.cpp @@ -24,10 +24,6 @@ namespace polybench POLYBENCH_FDTD_2D::POLYBENCH_FDTD_2D(const RunParams& params) : KernelBase(rajaperf::Polybench_FDTD_2D, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - Index_type nx_default = 1000; Index_type ny_default = 1000; @@ -94,7 +90,7 @@ POLYBENCH_FDTD_2D::~POLYBENCH_FDTD_2D() { } -void POLYBENCH_FDTD_2D::setUp(VariantID vid) +void POLYBENCH_FDTD_2D::setUp(VariantID vid, size_t /*tid*/) { allocAndInitDataConst(m_hz, m_nx * m_ny, 0.0, vid); allocAndInitData(m_ex, m_nx * m_ny, vid); @@ -102,12 +98,12 @@ void POLYBENCH_FDTD_2D::setUp(VariantID vid) allocAndInitData(m_fict, m_tsteps, vid); } -void POLYBENCH_FDTD_2D::updateChecksum(VariantID vid) +void POLYBENCH_FDTD_2D::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_hz, m_nx * m_ny, checksum_scale_factor); } -void POLYBENCH_FDTD_2D::tearDown(VariantID vid) +void POLYBENCH_FDTD_2D::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_fict); diff --git a/src/polybench/POLYBENCH_FDTD_2D.hpp b/src/polybench/POLYBENCH_FDTD_2D.hpp index 10d637690..cb819d62e 100644 --- a/src/polybench/POLYBENCH_FDTD_2D.hpp +++ b/src/polybench/POLYBENCH_FDTD_2D.hpp @@ -104,17 +104,18 @@ class POLYBENCH_FDTD_2D : public KernelBase ~POLYBENCH_FDTD_2D(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp index 7fd5e2f7c..ab84e9d47 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp @@ -179,7 +179,7 @@ void POLYBENCH_FLOYD_WARSHALL::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_FLOYD_WARSHALL::runCudaVariant(VariantID vid) +void POLYBENCH_FLOYD_WARSHALL::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp index 60169ece9..aa6832bed 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp @@ -183,7 +183,7 @@ void POLYBENCH_FLOYD_WARSHALL::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_FLOYD_WARSHALL::runHipVariant(VariantID vid) +void POLYBENCH_FLOYD_WARSHALL::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp index f3cfc0466..2e433f9ab 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp @@ -24,7 +24,7 @@ namespace polybench { -void 
POLYBENCH_FLOYD_WARSHALL::runOpenMPVariant(VariantID vid) +void POLYBENCH_FLOYD_WARSHALL::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp index c2e864b93..bef376b4d 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp @@ -35,7 +35,7 @@ namespace polybench deallocOpenMPDeviceData(pout, did); -void POLYBENCH_FLOYD_WARSHALL::runOpenMPTargetVariant(VariantID vid) +void POLYBENCH_FLOYD_WARSHALL::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp index cfe5ef88e..c72b0cb17 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp @@ -18,7 +18,7 @@ namespace polybench { -void POLYBENCH_FLOYD_WARSHALL::runSeqVariant(VariantID vid) +void POLYBENCH_FLOYD_WARSHALL::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps= getRunReps(); diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp index e0cdd45bd..4148cbcae 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp @@ -21,10 +21,6 @@ namespace polybench POLYBENCH_FLOYD_WARSHALL::POLYBENCH_FLOYD_WARSHALL(const RunParams& params) : KernelBase(rajaperf::Polybench_FLOYD_WARSHALL, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - Index_type N_default = 1000; setDefaultProblemSize( N_default * N_default ); @@ -70,19 +66,19 @@ POLYBENCH_FLOYD_WARSHALL::~POLYBENCH_FLOYD_WARSHALL() { } -void POLYBENCH_FLOYD_WARSHALL::setUp(VariantID vid) +void POLYBENCH_FLOYD_WARSHALL::setUp(VariantID vid, size_t /*tid*/) { (void) vid; allocAndInitDataRandSign(m_pin, m_N*m_N, vid); allocAndInitDataConst(m_pout, m_N*m_N, 0.0, vid); } -void POLYBENCH_FLOYD_WARSHALL::updateChecksum(VariantID vid) +void POLYBENCH_FLOYD_WARSHALL::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_pout, m_N*m_N, checksum_scale_factor ); } -void POLYBENCH_FLOYD_WARSHALL::tearDown(VariantID vid) +void POLYBENCH_FLOYD_WARSHALL::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_pin); diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp index 7c4c06d81..c1ba36424 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp @@ -67,17 +67,18 @@ class POLYBENCH_FLOYD_WARSHALL : public KernelBase ~POLYBENCH_FLOYD_WARSHALL(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/polybench/POLYBENCH_GEMM-Cuda.cpp b/src/polybench/POLYBENCH_GEMM-Cuda.cpp index 91c415184..5e2955f3f 100644 --- a/src/polybench/POLYBENCH_GEMM-Cuda.cpp +++ b/src/polybench/POLYBENCH_GEMM-Cuda.cpp @@ -205,7 +205,7 @@ void POLYBENCH_GEMM::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_GEMM::runCudaVariant(VariantID vid) +void POLYBENCH_GEMM::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/polybench/POLYBENCH_GEMM-Hip.cpp b/src/polybench/POLYBENCH_GEMM-Hip.cpp index 6396856eb..de096b702 100644 --- a/src/polybench/POLYBENCH_GEMM-Hip.cpp +++ b/src/polybench/POLYBENCH_GEMM-Hip.cpp @@ -206,7 +206,7 @@ void POLYBENCH_GEMM::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_GEMM::runHipVariant(VariantID vid) +void POLYBENCH_GEMM::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/polybench/POLYBENCH_GEMM-OMP.cpp b/src/polybench/POLYBENCH_GEMM-OMP.cpp index 9195af832..4703c566b 100644 --- a/src/polybench/POLYBENCH_GEMM-OMP.cpp +++ b/src/polybench/POLYBENCH_GEMM-OMP.cpp @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_GEMM::runOpenMPVariant(VariantID vid) +void POLYBENCH_GEMM::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff 
--git a/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp b/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp index 0570e3ad3..caa2f8b9e 100644 --- a/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp @@ -37,7 +37,7 @@ namespace polybench deallocOpenMPDeviceData(C, did); -void POLYBENCH_GEMM::runOpenMPTargetVariant(VariantID vid) +void POLYBENCH_GEMM::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_GEMM-Seq.cpp b/src/polybench/POLYBENCH_GEMM-Seq.cpp index e28973b2b..1937743c8 100644 --- a/src/polybench/POLYBENCH_GEMM-Seq.cpp +++ b/src/polybench/POLYBENCH_GEMM-Seq.cpp @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_GEMM::runSeqVariant(VariantID vid) +void POLYBENCH_GEMM::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps= getRunReps(); diff --git a/src/polybench/POLYBENCH_GEMM.cpp b/src/polybench/POLYBENCH_GEMM.cpp index 8fc1ebc4c..b48addd93 100644 --- a/src/polybench/POLYBENCH_GEMM.cpp +++ b/src/polybench/POLYBENCH_GEMM.cpp @@ -21,10 +21,6 @@ namespace polybench POLYBENCH_GEMM::POLYBENCH_GEMM(const RunParams& params) : KernelBase(rajaperf::Polybench_GEMM, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - Index_type ni_default = 1000; Index_type nj_default = 1000; Index_type nk_default = 1200; @@ -80,7 +76,7 @@ POLYBENCH_GEMM::~POLYBENCH_GEMM() { } -void POLYBENCH_GEMM::setUp(VariantID vid) +void POLYBENCH_GEMM::setUp(VariantID vid, size_t /*tid*/) { (void) vid; allocAndInitData(m_A, m_ni * m_nk, vid); @@ -88,12 +84,12 @@ void POLYBENCH_GEMM::setUp(VariantID vid) allocAndInitDataConst(m_C, m_ni * m_nj, 0.0, vid); } -void POLYBENCH_GEMM::updateChecksum(VariantID vid) +void POLYBENCH_GEMM::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_C, m_ni * m_nj, checksum_scale_factor ); } -void POLYBENCH_GEMM::tearDown(VariantID vid) +void POLYBENCH_GEMM::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_A); diff --git a/src/polybench/POLYBENCH_GEMM.hpp b/src/polybench/POLYBENCH_GEMM.hpp index 0348a5db0..95b51087d 100644 --- a/src/polybench/POLYBENCH_GEMM.hpp +++ b/src/polybench/POLYBENCH_GEMM.hpp @@ -90,17 +90,18 @@ class POLYBENCH_GEMM : public KernelBase ~POLYBENCH_GEMM(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/polybench/POLYBENCH_GEMVER-Cuda.cpp b/src/polybench/POLYBENCH_GEMVER-Cuda.cpp index 01049b4ac..64005cecb 100644 --- 
a/src/polybench/POLYBENCH_GEMVER-Cuda.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Cuda.cpp @@ -342,7 +342,7 @@ void POLYBENCH_GEMVER::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_GEMVER::runCudaVariant(VariantID vid) +void POLYBENCH_GEMVER::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/polybench/POLYBENCH_GEMVER-Hip.cpp b/src/polybench/POLYBENCH_GEMVER-Hip.cpp index b8bfbab30..be98ed54e 100644 --- a/src/polybench/POLYBENCH_GEMVER-Hip.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Hip.cpp @@ -350,7 +350,7 @@ void POLYBENCH_GEMVER::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_GEMVER::runHipVariant(VariantID vid) +void POLYBENCH_GEMVER::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/polybench/POLYBENCH_GEMVER-OMP.cpp b/src/polybench/POLYBENCH_GEMVER-OMP.cpp index 3b5d911b9..b8b068035 100644 --- a/src/polybench/POLYBENCH_GEMVER-OMP.cpp +++ b/src/polybench/POLYBENCH_GEMVER-OMP.cpp @@ -20,7 +20,7 @@ namespace polybench { -void POLYBENCH_GEMVER::runOpenMPVariant(VariantID vid) +void POLYBENCH_GEMVER::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp b/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp index 5b256729e..9e33735b1 100644 --- a/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp @@ -54,7 +54,7 @@ namespace polybench -void POLYBENCH_GEMVER::runOpenMPTargetVariant(VariantID vid) +void POLYBENCH_GEMVER::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_GEMVER-Seq.cpp b/src/polybench/POLYBENCH_GEMVER-Seq.cpp index 096dd9d56..afc562923 100644 --- a/src/polybench/POLYBENCH_GEMVER-Seq.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Seq.cpp @@ -20,7 +20,7 @@ namespace polybench { -void POLYBENCH_GEMVER::runSeqVariant(VariantID vid) +void POLYBENCH_GEMVER::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_GEMVER.cpp b/src/polybench/POLYBENCH_GEMVER.cpp index cdcc003c8..dea7ab038 100644 --- a/src/polybench/POLYBENCH_GEMVER.cpp +++ b/src/polybench/POLYBENCH_GEMVER.cpp @@ -21,10 +21,6 @@ namespace polybench POLYBENCH_GEMVER::POLYBENCH_GEMVER(const RunParams& params) : KernelBase(rajaperf::Polybench_GEMVER, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - Index_type n_default = 1000; setDefaultProblemSize( n_default * n_default ); @@ -89,7 +85,7 @@ POLYBENCH_GEMVER::~POLYBENCH_GEMVER() { } -void POLYBENCH_GEMVER::setUp(VariantID vid) +void POLYBENCH_GEMVER::setUp(VariantID vid, size_t /*tid*/) { (void) vid; @@ -104,12 +100,12 @@ void POLYBENCH_GEMVER::setUp(VariantID vid) allocAndInitData(m_z, m_n, vid); } -void POLYBENCH_GEMVER::updateChecksum(VariantID vid) +void POLYBENCH_GEMVER::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_w, m_n, checksum_scale_factor ); } -void POLYBENCH_GEMVER::tearDown(VariantID vid) +void POLYBENCH_GEMVER::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_A); diff --git a/src/polybench/POLYBENCH_GEMVER.hpp b/src/polybench/POLYBENCH_GEMVER.hpp index 6388d7273..e967d1783 100644 --- a/src/polybench/POLYBENCH_GEMVER.hpp +++ b/src/polybench/POLYBENCH_GEMVER.hpp @@ -143,17 +143,18 @@ class POLYBENCH_GEMVER : public KernelBase ~POLYBENCH_GEMVER(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp b/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp index 29b530864..38093f14d 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp @@ -137,7 +137,7 @@ void POLYBENCH_GESUMMV::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_GESUMMV::runCudaVariant(VariantID vid) +void POLYBENCH_GESUMMV::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/polybench/POLYBENCH_GESUMMV-Hip.cpp b/src/polybench/POLYBENCH_GESUMMV-Hip.cpp index a234a5799..7627d1091 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Hip.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Hip.cpp @@ -139,7 +139,7 @@ void POLYBENCH_GESUMMV::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_GESUMMV::runHipVariant(VariantID vid) +void POLYBENCH_GESUMMV::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/polybench/POLYBENCH_GESUMMV-OMP.cpp b/src/polybench/POLYBENCH_GESUMMV-OMP.cpp index 4fc4896ed..342e831ac 100644 --- a/src/polybench/POLYBENCH_GESUMMV-OMP.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-OMP.cpp @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_GESUMMV::runOpenMPVariant(VariantID vid) +void POLYBENCH_GESUMMV::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp 
b/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp index 299d4b347..0405d8be4 100644 --- a/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp @@ -44,7 +44,7 @@ namespace polybench deallocOpenMPDeviceData(B, did); -void POLYBENCH_GESUMMV::runOpenMPTargetVariant(VariantID vid) +void POLYBENCH_GESUMMV::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_GESUMMV-Seq.cpp b/src/polybench/POLYBENCH_GESUMMV-Seq.cpp index 4a488029b..2323e396b 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Seq.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Seq.cpp @@ -18,7 +18,7 @@ namespace rajaperf namespace polybench { -void POLYBENCH_GESUMMV::runSeqVariant(VariantID vid) +void POLYBENCH_GESUMMV::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps= getRunReps(); diff --git a/src/polybench/POLYBENCH_GESUMMV.cpp b/src/polybench/POLYBENCH_GESUMMV.cpp index 5aace89ce..96d688afa 100644 --- a/src/polybench/POLYBENCH_GESUMMV.cpp +++ b/src/polybench/POLYBENCH_GESUMMV.cpp @@ -21,10 +21,6 @@ namespace polybench POLYBENCH_GESUMMV::POLYBENCH_GESUMMV(const RunParams& params) : KernelBase(rajaperf::Polybench_GESUMMV, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - Index_type N_default = 1000; setDefaultProblemSize( N_default * N_default ); @@ -69,7 +65,7 @@ POLYBENCH_GESUMMV::~POLYBENCH_GESUMMV() { } -void POLYBENCH_GESUMMV::setUp(VariantID vid) +void POLYBENCH_GESUMMV::setUp(VariantID vid, size_t /*tid*/) { (void) vid; allocAndInitData(m_x, m_N, vid); @@ -78,12 +74,12 @@ void POLYBENCH_GESUMMV::setUp(VariantID vid) allocAndInitData(m_B, m_N * m_N, vid); } -void POLYBENCH_GESUMMV::updateChecksum(VariantID vid) +void POLYBENCH_GESUMMV::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_y, m_N); } -void POLYBENCH_GESUMMV::tearDown(VariantID vid) +void POLYBENCH_GESUMMV::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_x); diff --git a/src/polybench/POLYBENCH_GESUMMV.hpp b/src/polybench/POLYBENCH_GESUMMV.hpp index 196572ff3..f87c41c94 100644 --- a/src/polybench/POLYBENCH_GESUMMV.hpp +++ b/src/polybench/POLYBENCH_GESUMMV.hpp @@ -89,17 +89,18 @@ class POLYBENCH_GESUMMV : public KernelBase ~POLYBENCH_GESUMMV(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp b/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp index ccca6dfbd..236738bba 100644 --- 
a/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp @@ -221,7 +221,7 @@ void POLYBENCH_HEAT_3D::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_HEAT_3D::runCudaVariant(VariantID vid) +void POLYBENCH_HEAT_3D::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp b/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp index d5a21e533..070aab354 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp @@ -227,7 +227,7 @@ void POLYBENCH_HEAT_3D::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_HEAT_3D::runHipVariant(VariantID vid) +void POLYBENCH_HEAT_3D::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp b/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp index 7bf354c65..e702a7e6b 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_HEAT_3D::runOpenMPVariant(VariantID vid) +void POLYBENCH_HEAT_3D::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp b/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp index bc6fe97aa..8150f13ae 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp @@ -36,7 +36,7 @@ namespace polybench deallocOpenMPDeviceData(B, did); -void POLYBENCH_HEAT_3D::runOpenMPTargetVariant(VariantID vid) +void POLYBENCH_HEAT_3D::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp b/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp index 2a93d39b8..ab920ea65 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_HEAT_3D::runSeqVariant(VariantID vid) +void POLYBENCH_HEAT_3D::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps= getRunReps(); diff --git a/src/polybench/POLYBENCH_HEAT_3D.cpp b/src/polybench/POLYBENCH_HEAT_3D.cpp index 4d74c5b5c..d5e4b579c 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D.cpp @@ -22,10 +22,6 @@ namespace polybench POLYBENCH_HEAT_3D::POLYBENCH_HEAT_3D(const RunParams& params) : KernelBase(rajaperf::Polybench_HEAT_3D, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - Index_type N_default = 100; setDefaultProblemSize( (N_default-2)*(N_default-2)*(N_default-2) ); @@ -80,7 +76,7 @@ POLYBENCH_HEAT_3D::~POLYBENCH_HEAT_3D() { } -void POLYBENCH_HEAT_3D::setUp(VariantID vid) +void POLYBENCH_HEAT_3D::setUp(VariantID vid, size_t /*tid*/) { (void) vid; allocAndInitData(m_Ainit, m_N*m_N*m_N, vid); @@ -89,13 +85,13 @@ void POLYBENCH_HEAT_3D::setUp(VariantID vid) allocAndInitDataConst(m_B, m_N*m_N*m_N, 0.0, vid); } -void POLYBENCH_HEAT_3D::updateChecksum(VariantID vid) +void POLYBENCH_HEAT_3D::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_A, m_N*m_N*m_N, checksum_scale_factor ); checksum[vid] += calcChecksum(m_B, m_N*m_N*m_N, checksum_scale_factor ); } -void POLYBENCH_HEAT_3D::tearDown(VariantID vid) +void POLYBENCH_HEAT_3D::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_A); diff --git a/src/polybench/POLYBENCH_HEAT_3D.hpp b/src/polybench/POLYBENCH_HEAT_3D.hpp index 06592b5a4..455f47bb0 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.hpp +++ b/src/polybench/POLYBENCH_HEAT_3D.hpp @@ -115,17 +115,18 @@ class POLYBENCH_HEAT_3D : public KernelBase ~POLYBENCH_HEAT_3D(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp index defabb3d6..cec48a1ec 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp @@ -120,7 +120,7 @@ void POLYBENCH_JACOBI_1D::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_JACOBI_1D::runCudaVariant(VariantID vid) +void POLYBENCH_JACOBI_1D::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp index e92192268..bb0e8aa9c 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp @@ -122,7 +122,7 @@ void POLYBENCH_JACOBI_1D::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_JACOBI_1D::runHipVariant(VariantID vid) +void POLYBENCH_JACOBI_1D::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp b/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp index 683cea5ba..fcb0931a2 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp @@ -19,7 +19,7 @@ namespace polybench { -void 
POLYBENCH_JACOBI_1D::runOpenMPVariant(VariantID vid) +void POLYBENCH_JACOBI_1D::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp b/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp index d281ff310..c48105884 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp @@ -41,7 +41,7 @@ namespace polybench deallocOpenMPDeviceData(B, did); -void POLYBENCH_JACOBI_1D::runOpenMPTargetVariant(VariantID vid) +void POLYBENCH_JACOBI_1D::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp index a022c2981..446abce00 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_JACOBI_1D::runSeqVariant(VariantID vid) +void POLYBENCH_JACOBI_1D::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps= getRunReps(); diff --git a/src/polybench/POLYBENCH_JACOBI_1D.cpp b/src/polybench/POLYBENCH_JACOBI_1D.cpp index 1a4b5fbe4..c181a7b20 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.cpp @@ -21,10 +21,6 @@ namespace polybench POLYBENCH_JACOBI_1D::POLYBENCH_JACOBI_1D(const RunParams& params) : KernelBase(rajaperf::Polybench_JACOBI_1D, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - Index_type N_default = 1000000; setDefaultProblemSize( N_default-2 ); @@ -77,7 +73,7 @@ POLYBENCH_JACOBI_1D::~POLYBENCH_JACOBI_1D() { } -void POLYBENCH_JACOBI_1D::setUp(VariantID vid) +void POLYBENCH_JACOBI_1D::setUp(VariantID vid, size_t /*tid*/) { (void) vid; allocAndInitData(m_Ainit, m_N, vid); @@ -86,13 +82,13 @@ void POLYBENCH_JACOBI_1D::setUp(VariantID vid) allocAndInitDataConst(m_B, m_N, 0.0, vid); } -void POLYBENCH_JACOBI_1D::updateChecksum(VariantID vid) +void POLYBENCH_JACOBI_1D::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_A, m_N, checksum_scale_factor ); checksum[vid] += calcChecksum(m_B, m_N, checksum_scale_factor ); } -void POLYBENCH_JACOBI_1D::tearDown(VariantID vid) +void POLYBENCH_JACOBI_1D::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_A); diff --git a/src/polybench/POLYBENCH_JACOBI_1D.hpp b/src/polybench/POLYBENCH_JACOBI_1D.hpp index 7ff522400..5d28383b9 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.hpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.hpp @@ -61,17 +61,18 @@ class POLYBENCH_JACOBI_1D : public KernelBase ~POLYBENCH_JACOBI_1D(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void 
runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp index 861deff2a..47697964f 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp @@ -211,7 +211,7 @@ void POLYBENCH_JACOBI_2D::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_JACOBI_2D::runCudaVariant(VariantID vid) +void POLYBENCH_JACOBI_2D::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp index 369494a30..fadd87e04 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp @@ -217,7 +217,7 @@ void POLYBENCH_JACOBI_2D::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_JACOBI_2D::runHipVariant(VariantID vid) +void POLYBENCH_JACOBI_2D::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp b/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp index 69e6e1d13..2d1f18eeb 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_JACOBI_2D::runOpenMPVariant(VariantID vid) +void POLYBENCH_JACOBI_2D::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp b/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp index 36afb8957..8b3969487 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp @@ -36,7 +36,7 @@ namespace polybench deallocOpenMPDeviceData(B, did); -void POLYBENCH_JACOBI_2D::runOpenMPTargetVariant(VariantID vid) +void POLYBENCH_JACOBI_2D::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp index 87e8e8e15..f61036cf9 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_JACOBI_2D::runSeqVariant(VariantID vid) +void POLYBENCH_JACOBI_2D::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps= getRunReps(); diff --git a/src/polybench/POLYBENCH_JACOBI_2D.cpp b/src/polybench/POLYBENCH_JACOBI_2D.cpp index 68c6a348c..97f910ab1 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.cpp @@ -21,10 +21,6 @@ namespace polybench POLYBENCH_JACOBI_2D::POLYBENCH_JACOBI_2D(const RunParams& params) : KernelBase(rajaperf::Polybench_JACOBI_2D, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - Index_type N_default = 1000; setDefaultProblemSize( N_default * N_default ); @@ -79,7 +75,7 @@ POLYBENCH_JACOBI_2D::~POLYBENCH_JACOBI_2D() { } -void POLYBENCH_JACOBI_2D::setUp(VariantID vid) +void POLYBENCH_JACOBI_2D::setUp(VariantID vid, size_t /*tid*/) { (void) vid; allocAndInitData(m_Ainit, m_N*m_N, vid); @@ -88,13 +84,13 @@ void POLYBENCH_JACOBI_2D::setUp(VariantID vid) allocAndInitDataConst(m_B, m_N*m_N, 0.0, vid); } -void POLYBENCH_JACOBI_2D::updateChecksum(VariantID vid) +void POLYBENCH_JACOBI_2D::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_A, m_N*m_N, checksum_scale_factor ); checksum[vid] += calcChecksum(m_B, m_N*m_N, checksum_scale_factor ); } -void POLYBENCH_JACOBI_2D::tearDown(VariantID vid) +void POLYBENCH_JACOBI_2D::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_A); diff --git a/src/polybench/POLYBENCH_JACOBI_2D.hpp b/src/polybench/POLYBENCH_JACOBI_2D.hpp index 7bbc2dbc6..db996beb2 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.hpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.hpp @@ -81,17 +81,18 @@ class POLYBENCH_JACOBI_2D : public KernelBase ~POLYBENCH_JACOBI_2D(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/polybench/POLYBENCH_MVT-Cuda.cpp b/src/polybench/POLYBENCH_MVT-Cuda.cpp index 53f4c771d..d22500cf2 100644 --- a/src/polybench/POLYBENCH_MVT-Cuda.cpp +++ b/src/polybench/POLYBENCH_MVT-Cuda.cpp @@ -178,7 +178,7 @@ void POLYBENCH_MVT::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_MVT::runCudaVariant(VariantID vid) +void POLYBENCH_MVT::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/polybench/POLYBENCH_MVT-Hip.cpp b/src/polybench/POLYBENCH_MVT-Hip.cpp index e28b67f81..2cb5135a1 100644 --- a/src/polybench/POLYBENCH_MVT-Hip.cpp +++ b/src/polybench/POLYBENCH_MVT-Hip.cpp @@ -176,7 +176,7 @@ void POLYBENCH_MVT::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_MVT::runHipVariant(VariantID vid) +void POLYBENCH_MVT::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/polybench/POLYBENCH_MVT-OMP.cpp b/src/polybench/POLYBENCH_MVT-OMP.cpp index 0023d0684..d030f3a84 100644 --- a/src/polybench/POLYBENCH_MVT-OMP.cpp +++ b/src/polybench/POLYBENCH_MVT-OMP.cpp @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_MVT::runOpenMPVariant(VariantID vid) +void POLYBENCH_MVT::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) 
&& defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_MVT-OMPTarget.cpp b/src/polybench/POLYBENCH_MVT-OMPTarget.cpp index 8eb198ea5..9e1f01988 100644 --- a/src/polybench/POLYBENCH_MVT-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_MVT-OMPTarget.cpp @@ -47,7 +47,7 @@ namespace polybench deallocOpenMPDeviceData(A, did); -void POLYBENCH_MVT::runOpenMPTargetVariant(VariantID vid) +void POLYBENCH_MVT::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_MVT-Seq.cpp b/src/polybench/POLYBENCH_MVT-Seq.cpp index 80847c383..fd9499c33 100644 --- a/src/polybench/POLYBENCH_MVT-Seq.cpp +++ b/src/polybench/POLYBENCH_MVT-Seq.cpp @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_MVT::runSeqVariant(VariantID vid) +void POLYBENCH_MVT::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps= getRunReps(); diff --git a/src/polybench/POLYBENCH_MVT.cpp b/src/polybench/POLYBENCH_MVT.cpp index de0b28a2b..245dfd028 100644 --- a/src/polybench/POLYBENCH_MVT.cpp +++ b/src/polybench/POLYBENCH_MVT.cpp @@ -21,10 +21,6 @@ namespace polybench POLYBENCH_MVT::POLYBENCH_MVT(const RunParams& params) : KernelBase(rajaperf::Polybench_MVT, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - Index_type N_default = 1000; setDefaultProblemSize( N_default * N_default ); @@ -72,7 +68,7 @@ POLYBENCH_MVT::~POLYBENCH_MVT() { } -void POLYBENCH_MVT::setUp(VariantID vid) +void POLYBENCH_MVT::setUp(VariantID vid, size_t /*tid*/) { (void) vid; allocAndInitData(m_y1, m_N, vid); @@ -82,13 +78,13 @@ void POLYBENCH_MVT::setUp(VariantID vid) allocAndInitDataConst(m_x2, m_N, 0.0, vid); } -void POLYBENCH_MVT::updateChecksum(VariantID vid) +void POLYBENCH_MVT::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_x1, m_N, checksum_scale_factor ); checksum[vid] += calcChecksum(m_x2, m_N, checksum_scale_factor ); } -void POLYBENCH_MVT::tearDown(VariantID vid) +void POLYBENCH_MVT::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_x1); diff --git a/src/polybench/POLYBENCH_MVT.hpp b/src/polybench/POLYBENCH_MVT.hpp index b72b9b9f4..b1aaaad1d 100644 --- a/src/polybench/POLYBENCH_MVT.hpp +++ b/src/polybench/POLYBENCH_MVT.hpp @@ -103,17 +103,18 @@ class POLYBENCH_MVT : public KernelBase ~POLYBENCH_MVT(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/stream/ADD-Cuda.cpp b/src/stream/ADD-Cuda.cpp index 31800d981..0875ee939 100644 --- 
a/src/stream/ADD-Cuda.cpp +++ b/src/stream/ADD-Cuda.cpp @@ -111,7 +111,7 @@ void ADD::runCudaVariantImpl(VariantID vid) } } -void ADD::runCudaVariant(VariantID vid) +void ADD::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/stream/ADD-Hip.cpp b/src/stream/ADD-Hip.cpp index 5b5d125ca..12ffe4863 100644 --- a/src/stream/ADD-Hip.cpp +++ b/src/stream/ADD-Hip.cpp @@ -113,7 +113,7 @@ void ADD::runHipVariantImpl(VariantID vid) } } -void ADD::runHipVariant(VariantID vid) +void ADD::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/stream/ADD-OMP.cpp b/src/stream/ADD-OMP.cpp index 137ce77a6..f20bc8948 100644 --- a/src/stream/ADD-OMP.cpp +++ b/src/stream/ADD-OMP.cpp @@ -18,7 +18,7 @@ namespace stream { -void ADD::runOpenMPVariant(VariantID vid) +void ADD::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/stream/ADD-OMPTarget.cpp b/src/stream/ADD-OMPTarget.cpp index 9c367b1b0..2df9e17a6 100644 --- a/src/stream/ADD-OMPTarget.cpp +++ b/src/stream/ADD-OMPTarget.cpp @@ -40,7 +40,7 @@ namespace stream deallocOpenMPDeviceData(b, did); \ deallocOpenMPDeviceData(c, did); -void ADD::runOpenMPTargetVariant(VariantID vid) +void ADD::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/stream/ADD-Seq.cpp b/src/stream/ADD-Seq.cpp index 89f989d95..83d71065f 100644 --- a/src/stream/ADD-Seq.cpp +++ b/src/stream/ADD-Seq.cpp @@ -18,7 +18,7 @@ namespace stream { -void ADD::runSeqVariant(VariantID vid) +void ADD::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/stream/ADD.cpp b/src/stream/ADD.cpp index da7e1b443..7450b0b4b 100644 --- a/src/stream/ADD.cpp +++ b/src/stream/ADD.cpp @@ -21,10 +21,6 @@ namespace stream ADD::ADD(const RunParams& params) : KernelBase(rajaperf::Stream_ADD, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - setDefaultProblemSize(1000000); setDefaultReps(1000); @@ -62,19 +58,19 @@ ADD::~ADD() { } -void ADD::setUp(VariantID vid) +void ADD::setUp(VariantID vid, size_t /*tid*/) { allocAndInitData(m_a, getActualProblemSize(), vid); allocAndInitData(m_b, getActualProblemSize(), vid); allocAndInitDataConst(m_c, getActualProblemSize(), 0.0, vid); } -void ADD::updateChecksum(VariantID vid) +void ADD::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_c, getActualProblemSize()); } -void ADD::tearDown(VariantID vid) +void ADD::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_a); diff --git a/src/stream/ADD.hpp b/src/stream/ADD.hpp index db621b309..69b8dfe64 100644 --- a/src/stream/ADD.hpp +++ b/src/stream/ADD.hpp @@ -43,17 +43,18 @@ class ADD : public KernelBase ~ADD(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/stream/COPY-Cuda.cpp b/src/stream/COPY-Cuda.cpp index 8278ba637..4f5168c81 100644 --- a/src/stream/COPY-Cuda.cpp +++ b/src/stream/COPY-Cuda.cpp @@ -109,7 +109,7 @@ void COPY::runCudaVariantImpl(VariantID vid) } } -void COPY::runCudaVariant(VariantID vid) +void COPY::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/stream/COPY-Hip.cpp b/src/stream/COPY-Hip.cpp index 97843f831..336a0f2f2 100644 --- a/src/stream/COPY-Hip.cpp +++ b/src/stream/COPY-Hip.cpp @@ -111,7 +111,7 @@ void COPY::runHipVariantImpl(VariantID vid) } } -void COPY::runHipVariant(VariantID vid) +void COPY::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/stream/COPY-OMP.cpp b/src/stream/COPY-OMP.cpp index fe35d5288..6e0cbb9e3 100644 --- a/src/stream/COPY-OMP.cpp +++ b/src/stream/COPY-OMP.cpp @@ -18,7 +18,7 @@ namespace stream { -void COPY::runOpenMPVariant(VariantID vid) +void COPY::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/stream/COPY-OMPTarget.cpp b/src/stream/COPY-OMPTarget.cpp index 010456eb0..95f1aca91 100644 --- a/src/stream/COPY-OMPTarget.cpp +++ b/src/stream/COPY-OMPTarget.cpp @@ -39,7 +39,7 @@ namespace stream deallocOpenMPDeviceData(c, did); -void COPY::runOpenMPTargetVariant(VariantID vid) +void COPY::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/stream/COPY-Seq.cpp b/src/stream/COPY-Seq.cpp index 89f9cae33..792ac30f1 
100644 --- a/src/stream/COPY-Seq.cpp +++ b/src/stream/COPY-Seq.cpp @@ -18,7 +18,7 @@ namespace stream { -void COPY::runSeqVariant(VariantID vid) +void COPY::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/stream/COPY.cpp b/src/stream/COPY.cpp index 737642ca4..04adf7397 100644 --- a/src/stream/COPY.cpp +++ b/src/stream/COPY.cpp @@ -21,10 +21,6 @@ namespace stream COPY::COPY(const RunParams& params) : KernelBase(rajaperf::Stream_COPY, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - setDefaultProblemSize(1000000); setDefaultReps(1800); @@ -62,18 +58,18 @@ COPY::~COPY() { } -void COPY::setUp(VariantID vid) +void COPY::setUp(VariantID vid, size_t /*tid*/) { allocAndInitData(m_a, getActualProblemSize(), vid); allocAndInitDataConst(m_c, getActualProblemSize(), 0.0, vid); } -void COPY::updateChecksum(VariantID vid) +void COPY::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_c, getActualProblemSize()); } -void COPY::tearDown(VariantID vid) +void COPY::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_a); diff --git a/src/stream/COPY.hpp b/src/stream/COPY.hpp index 1ae3f228c..891f4eca3 100644 --- a/src/stream/COPY.hpp +++ b/src/stream/COPY.hpp @@ -42,17 +42,18 @@ class COPY : public KernelBase ~COPY(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/stream/DOT-Cuda.cpp b/src/stream/DOT-Cuda.cpp index 243e0bead..17811d085 100644 --- a/src/stream/DOT-Cuda.cpp +++ b/src/stream/DOT-Cuda.cpp @@ -130,7 +130,7 @@ void DOT::runCudaVariantImpl(VariantID vid) } } -void DOT::runCudaVariant(VariantID vid) +void DOT::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/stream/DOT-Hip.cpp b/src/stream/DOT-Hip.cpp index 6d1b3f758..ee52661a9 100644 --- a/src/stream/DOT-Hip.cpp +++ b/src/stream/DOT-Hip.cpp @@ -132,7 +132,7 @@ void DOT::runHipVariantImpl(VariantID vid) } } -void DOT::runHipVariant(VariantID vid) +void DOT::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/stream/DOT-OMP.cpp b/src/stream/DOT-OMP.cpp index 24a29d9a0..ef2ee3453 100644 --- a/src/stream/DOT-OMP.cpp +++ b/src/stream/DOT-OMP.cpp @@ -18,7 +18,7 @@ namespace stream { -void DOT::runOpenMPVariant(VariantID vid) 
+void DOT::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/stream/DOT-OMPTarget.cpp b/src/stream/DOT-OMPTarget.cpp index 1b4cb85cf..35449a8c0 100644 --- a/src/stream/DOT-OMPTarget.cpp +++ b/src/stream/DOT-OMPTarget.cpp @@ -37,7 +37,7 @@ namespace stream deallocOpenMPDeviceData(a, did); \ deallocOpenMPDeviceData(b, did); -void DOT::runOpenMPTargetVariant(VariantID vid) +void DOT::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/stream/DOT-Seq.cpp b/src/stream/DOT-Seq.cpp index 81cff4c1b..1e11d2b6b 100644 --- a/src/stream/DOT-Seq.cpp +++ b/src/stream/DOT-Seq.cpp @@ -18,7 +18,7 @@ namespace stream { -void DOT::runSeqVariant(VariantID vid) +void DOT::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/stream/DOT.cpp b/src/stream/DOT.cpp index 2e0a29b1a..1e5ad74e2 100644 --- a/src/stream/DOT.cpp +++ b/src/stream/DOT.cpp @@ -21,10 +21,6 @@ namespace stream DOT::DOT(const RunParams& params) : KernelBase(rajaperf::Stream_DOT, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - setDefaultProblemSize(1000000); setDefaultReps(2000); @@ -62,7 +58,7 @@ DOT::~DOT() { } -void DOT::setUp(VariantID vid) +void DOT::setUp(VariantID vid, size_t /*tid*/) { allocAndInitData(m_a, getActualProblemSize(), vid); allocAndInitData(m_b, getActualProblemSize(), vid); @@ -71,12 +67,12 @@ void DOT::setUp(VariantID vid) m_dot_init = 0.0; } -void DOT::updateChecksum(VariantID vid) +void DOT::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += m_dot; } -void DOT::tearDown(VariantID vid) +void DOT::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_a); diff --git a/src/stream/DOT.hpp b/src/stream/DOT.hpp index 98a1f3c37..b6e698ca8 100644 --- a/src/stream/DOT.hpp +++ b/src/stream/DOT.hpp @@ -42,17 +42,18 @@ class DOT : public KernelBase ~DOT(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/stream/MUL-Cuda.cpp b/src/stream/MUL-Cuda.cpp index b8318e655..3cb59bd72 100644 --- a/src/stream/MUL-Cuda.cpp +++ b/src/stream/MUL-Cuda.cpp @@ -109,7 +109,7 @@ void MUL::runCudaVariantImpl(VariantID vid) } } -void MUL::runCudaVariant(VariantID vid) +void MUL::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), 
gpu_block_sizes_type()) ) { diff --git a/src/stream/MUL-Hip.cpp b/src/stream/MUL-Hip.cpp index 33cb70a3e..e2d261525 100644 --- a/src/stream/MUL-Hip.cpp +++ b/src/stream/MUL-Hip.cpp @@ -111,7 +111,7 @@ void MUL::runHipVariantImpl(VariantID vid) } } -void MUL::runHipVariant(VariantID vid) +void MUL::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/stream/MUL-OMP.cpp b/src/stream/MUL-OMP.cpp index 0b7f3cd85..758dae146 100644 --- a/src/stream/MUL-OMP.cpp +++ b/src/stream/MUL-OMP.cpp @@ -18,7 +18,7 @@ namespace stream { -void MUL::runOpenMPVariant(VariantID vid) +void MUL::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/stream/MUL-OMPTarget.cpp b/src/stream/MUL-OMPTarget.cpp index 8e5f52b35..5675e88b1 100644 --- a/src/stream/MUL-OMPTarget.cpp +++ b/src/stream/MUL-OMPTarget.cpp @@ -38,7 +38,7 @@ namespace stream deallocOpenMPDeviceData(b, did); \ deallocOpenMPDeviceData(c, did); -void MUL::runOpenMPTargetVariant(VariantID vid) +void MUL::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/stream/MUL-Seq.cpp b/src/stream/MUL-Seq.cpp index 69b548e69..5d2b9f9e0 100644 --- a/src/stream/MUL-Seq.cpp +++ b/src/stream/MUL-Seq.cpp @@ -18,7 +18,7 @@ namespace stream { -void MUL::runSeqVariant(VariantID vid) +void MUL::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/stream/MUL.cpp b/src/stream/MUL.cpp index e0c94dd8f..7e3ca9aad 100644 --- a/src/stream/MUL.cpp +++ b/src/stream/MUL.cpp @@ -21,10 +21,6 @@ namespace stream MUL::MUL(const RunParams& params) : KernelBase(rajaperf::Stream_MUL, params) { - setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) ); - setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? 
params.getGPUBlockSize() - : getDefaultGPUBlockSize() ); - setDefaultProblemSize(1000000); setDefaultReps(1800); @@ -62,19 +58,19 @@ MUL::~MUL() { } -void MUL::setUp(VariantID vid) +void MUL::setUp(VariantID vid, size_t /*tid*/) { allocAndInitDataConst(m_b, getActualProblemSize(), 0.0, vid); allocAndInitData(m_c, getActualProblemSize(), vid); initData(m_alpha, vid); } -void MUL::updateChecksum(VariantID vid) +void MUL::updateChecksum(VariantID vid, size_t tid) { checksum[vid] += calcChecksum(m_b, getActualProblemSize()); } -void MUL::tearDown(VariantID vid) +void MUL::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_b); diff --git a/src/stream/MUL.hpp b/src/stream/MUL.hpp index f97adc2ff..7e24edc8c 100644 --- a/src/stream/MUL.hpp +++ b/src/stream/MUL.hpp @@ -43,17 +43,18 @@ class MUL : public KernelBase ~MUL(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); - - bool isGPUBlockSizeSupported() const; + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/stream/TRIAD-Cuda.cpp b/src/stream/TRIAD-Cuda.cpp index 8ad87ae1b..42c161574 100644 --- a/src/stream/TRIAD-Cuda.cpp +++ b/src/stream/TRIAD-Cuda.cpp @@ -111,7 +111,7 @@ void TRIAD::runCudaVariantImpl(VariantID vid) } } -void TRIAD::runCudaVariant(VariantID vid) +void TRIAD::runCudaVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/stream/TRIAD-Hip.cpp b/src/stream/TRIAD-Hip.cpp index 4fefbbc85..11aaebd4d 100644 --- a/src/stream/TRIAD-Hip.cpp +++ b/src/stream/TRIAD-Hip.cpp @@ -113,7 +113,7 @@ void TRIAD::runHipVariantImpl(VariantID vid) } } -void TRIAD::runHipVariant(VariantID vid) +void TRIAD::runHipVariant(VariantID vid, size_t tid) { if ( !gpu_block_size::invoke_or( gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { diff --git a/src/stream/TRIAD-OMP.cpp b/src/stream/TRIAD-OMP.cpp index 9ce330c00..aa9831592 100644 --- a/src/stream/TRIAD-OMP.cpp +++ b/src/stream/TRIAD-OMP.cpp @@ -18,7 +18,7 @@ namespace stream { -void TRIAD::runOpenMPVariant(VariantID vid) +void TRIAD::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/stream/TRIAD-OMPTarget.cpp b/src/stream/TRIAD-OMPTarget.cpp index 404444366..0353aa31f 100644 --- a/src/stream/TRIAD-OMPTarget.cpp +++ b/src/stream/TRIAD-OMPTarget.cpp @@ -40,7 +40,7 @@ namespace stream deallocOpenMPDeviceData(b, did); \ deallocOpenMPDeviceData(c, did); -void TRIAD::runOpenMPTargetVariant(VariantID vid) +void TRIAD::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/stream/TRIAD-Seq.cpp 
b/src/stream/TRIAD-Seq.cpp
index 7d7800556..3c2eb6a5c 100644
--- a/src/stream/TRIAD-Seq.cpp
+++ b/src/stream/TRIAD-Seq.cpp
@@ -18,7 +18,7 @@ namespace stream
 {
-void TRIAD::runSeqVariant(VariantID vid)
+void TRIAD::runSeqVariant(VariantID vid, size_t /*tid*/)
 {
   const Index_type run_reps = getRunReps();
   const Index_type ibegin = 0;
diff --git a/src/stream/TRIAD.cpp b/src/stream/TRIAD.cpp
index dd0ae17fa..4e813fdc0 100644
--- a/src/stream/TRIAD.cpp
+++ b/src/stream/TRIAD.cpp
@@ -21,10 +21,6 @@ namespace stream
 TRIAD::TRIAD(const RunParams& params)
   : KernelBase(rajaperf::Stream_TRIAD, params)
 {
-  setDefaultGPUBlockSize( gpu_block_size::get_default_or_first(default_gpu_block_size, gpu_block_sizes_type()) );
-  setActualGPUBlockSize( (params.getGPUBlockSize() > 0) ? params.getGPUBlockSize()
-                          : getDefaultGPUBlockSize() );
-
   setDefaultProblemSize(1000000);
   setDefaultReps(1000);
@@ -66,7 +62,7 @@ TRIAD::~TRIAD()
 {
 }
-void TRIAD::setUp(VariantID vid)
+void TRIAD::setUp(VariantID vid, size_t /*tid*/)
 {
   allocAndInitDataConst(m_a, getActualProblemSize(), 0.0, vid);
   allocAndInitData(m_b, getActualProblemSize(), vid);
@@ -74,12 +70,12 @@ void TRIAD::setUp(VariantID vid)
   initData(m_alpha, vid);
 }
-void TRIAD::updateChecksum(VariantID vid)
+void TRIAD::updateChecksum(VariantID vid, size_t tid)
 {
   checksum[vid] += calcChecksum(m_a, getActualProblemSize(), checksum_scale_factor );
 }
-void TRIAD::tearDown(VariantID vid)
+void TRIAD::tearDown(VariantID vid, size_t /*tid*/)
 {
   (void) vid;
   deallocData(m_a);
diff --git a/src/stream/TRIAD.hpp b/src/stream/TRIAD.hpp
index 8c37b969c..c6789f2b0 100644
--- a/src/stream/TRIAD.hpp
+++ b/src/stream/TRIAD.hpp
@@ -44,17 +44,18 @@ class TRIAD : public KernelBase
   ~TRIAD();
-  void setUp(VariantID vid);
-  void updateChecksum(VariantID vid);
-  void tearDown(VariantID vid);
-
-  void runSeqVariant(VariantID vid);
-  void runOpenMPVariant(VariantID vid);
-  void runCudaVariant(VariantID vid);
-  void runHipVariant(VariantID vid);
-  void runOpenMPTargetVariant(VariantID vid);
-
-  bool isGPUBlockSizeSupported() const;
+  void setUp(VariantID vid, size_t tid);
+  void updateChecksum(VariantID vid, size_t tid);
+  void tearDown(VariantID vid, size_t tid);
+
+  void runSeqVariant(VariantID vid, size_t tid);
+  void runOpenMPVariant(VariantID vid, size_t tid);
+  void runCudaVariant(VariantID vid, size_t tid);
+  void runHipVariant(VariantID vid, size_t tid);
+  void runOpenMPTargetVariant(VariantID vid, size_t tid);
+
+  void setCudaTuningDefinitions(VariantID vid);
+  void setHipTuningDefinitions(VariantID vid);
   template < size_t block_size >
   void runCudaVariantImpl(VariantID vid);
   template < size_t block_size >

From aecba2b3f7cee5038fe48cb5f34fe714e51d0f98 Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Wed, 23 Feb 2022 11:02:34 -0800
Subject: [PATCH 213/392] Implement new runCudaVariant and setCudaTuningDefinitions

---
 src/apps/DEL_DOT_VEC_2D-Cuda.cpp | 25 +++++++++++++++----
 src/apps/DEL_DOT_VEC_2D-Hip.cpp | 25 +++++++++++++++----
 src/apps/ENERGY-Cuda.cpp | 25 +++++++++++++++----
 src/apps/ENERGY-Hip.cpp | 25 +++++++++++++++----
 src/apps/FIR-Cuda.cpp | 25 +++++++++++++++----
 src/apps/FIR-Hip.cpp | 25 +++++++++++++++----
 src/apps/HALOEXCHANGE-Cuda.cpp | 25 +++++++++++++++----
 src/apps/HALOEXCHANGE-Hip.cpp | 25 +++++++++++++++----
 src/apps/HALOEXCHANGE_FUSED-Cuda.cpp | 25 +++++++++++++++----
 src/apps/HALOEXCHANGE_FUSED-Hip.cpp | 25 +++++++++++++++----
 src/apps/LTIMES-Cuda.cpp | 25 +++++++++++++++----
 src/apps/LTIMES-Hip.cpp | 25 +++++++++++++++----
 src/apps/LTIMES_NOVIEW-Cuda.cpp | 25 +++++++++++++++----
 src/apps/LTIMES_NOVIEW-Hip.cpp | 25 +++++++++++++++----
 src/apps/PRESSURE-Cuda.cpp | 25 +++++++++++++++----
 src/apps/PRESSURE-Hip.cpp | 25 +++++++++++++++----
 src/apps/VOL3D-Cuda.cpp | 25 +++++++++++++++----
 src/apps/VOL3D-Hip.cpp | 25 +++++++++++++++----
 src/basic/DAXPY-Hip.cpp | 25 +++++++++++++++----
 src/basic/IF_QUAD-Cuda.cpp | 25 +++++++++++++++----
 src/basic/IF_QUAD-Hip.cpp | 25 +++++++++++++++----
 src/basic/INIT3-Cuda.cpp | 25 +++++++++++++++----
 src/basic/INIT3-Hip.cpp | 25 +++++++++++++++----
 src/basic/INIT_VIEW1D-Cuda.cpp | 25 +++++++++++++++----
 src/basic/INIT_VIEW1D-Hip.cpp | 25 +++++++++++++++----
 src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp | 25 +++++++++++++++----
 src/basic/INIT_VIEW1D_OFFSET-Hip.cpp | 25 +++++++++++++++----
 src/basic/MAT_MAT_SHARED-Cuda.cpp | 25 +++++++++++++++----
 src/basic/MAT_MAT_SHARED-Hip.cpp | 25 +++++++++++++++----
 src/basic/MULADDSUB-Cuda.cpp | 25 +++++++++++++++----
 src/basic/MULADDSUB-Hip.cpp | 25 +++++++++++++++----
 src/basic/NESTED_INIT-Cuda.cpp | 25 +++++++++++++++----
 src/basic/NESTED_INIT-Hip.cpp | 25 +++++++++++++++----
 src/basic/PI_ATOMIC-Cuda.cpp | 25 +++++++++++++++----
 src/basic/PI_ATOMIC-Hip.cpp | 25 +++++++++++++++----
 src/basic/PI_REDUCE-Cuda.cpp | 25 +++++++++++++++----
 src/basic/PI_REDUCE-Hip.cpp | 25 +++++++++++++++----
 src/basic/REDUCE3_INT-Cuda.cpp | 25 +++++++++++++++----
 src/basic/REDUCE3_INT-Hip.cpp | 25 +++++++++++++++----
 src/basic/TRAP_INT-Cuda.cpp | 25 +++++++++++++++----
 src/basic/TRAP_INT-Hip.cpp | 25 +++++++++++++++----
 src/lcals/DIFF_PREDICT-Cuda.cpp | 25 +++++++++++++++----
 src/lcals/DIFF_PREDICT-Hip.cpp | 25 +++++++++++++++----
 src/lcals/EOS-Cuda.cpp | 25 +++++++++++++++----
 src/lcals/EOS-Hip.cpp | 25 +++++++++++++++----
 src/lcals/FIRST_DIFF-Cuda.cpp | 25 +++++++++++++++----
 src/lcals/FIRST_DIFF-Hip.cpp | 25 +++++++++++++++----
 src/lcals/FIRST_MIN-Cuda.cpp | 25 +++++++++++++++----
 src/lcals/FIRST_MIN-Hip.cpp | 25 +++++++++++++++----
 src/lcals/FIRST_SUM-Cuda.cpp | 25 +++++++++++++++----
 src/lcals/FIRST_SUM-Hip.cpp | 25 +++++++++++++++----
 src/lcals/GEN_LIN_RECUR-Cuda.cpp | 25 +++++++++++++++----
 src/lcals/GEN_LIN_RECUR-Hip.cpp | 25 +++++++++++++++----
 src/lcals/HYDRO_1D-Cuda.cpp | 25 +++++++++++++++----
 src/lcals/HYDRO_1D-Hip.cpp | 25 +++++++++++++++----
 src/lcals/HYDRO_2D-Cuda.cpp | 25 +++++++++++++++----
 src/lcals/HYDRO_2D-Hip.cpp | 25 +++++++++++++++----
 src/lcals/INT_PREDICT-Cuda.cpp | 25 +++++++++++++++----
 src/lcals/INT_PREDICT-Hip.cpp | 25 +++++++++++++++----
 src/lcals/PLANCKIAN-Cuda.cpp | 25 +++++++++++++++----
 src/lcals/PLANCKIAN-Hip.cpp | 25 +++++++++++++++----
 src/lcals/TRIDIAG_ELIM-Cuda.cpp | 25 +++++++++++++++----
 src/lcals/TRIDIAG_ELIM-Hip.cpp | 25 +++++++++++++++----
 src/polybench/POLYBENCH_2MM-Cuda.cpp | 25 +++++++++++++++----
 src/polybench/POLYBENCH_2MM-Hip.cpp | 25 +++++++++++++++----
 src/polybench/POLYBENCH_3MM-Cuda.cpp | 25 +++++++++++++++----
 src/polybench/POLYBENCH_3MM-Hip.cpp | 25 +++++++++++++++----
 src/polybench/POLYBENCH_ADI-Cuda.cpp | 25 +++++++++++++++----
 src/polybench/POLYBENCH_ADI-Hip.cpp | 25 +++++++++++++++----
 src/polybench/POLYBENCH_ATAX-Cuda.cpp | 25 +++++++++++++++----
 src/polybench/POLYBENCH_ATAX-Hip.cpp | 25 +++++++++++++++----
 src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp | 25 +++++++++++++++----
 src/polybench/POLYBENCH_FDTD_2D-Hip.cpp | 25 +++++++++++++++----
 .../POLYBENCH_FLOYD_WARSHALL-Cuda.cpp | 25 +++++++++++++++----
 .../POLYBENCH_FLOYD_WARSHALL-Hip.cpp | 25 +++++++++++++++----
 src/polybench/POLYBENCH_GEMM-Cuda.cpp | 25 +++++++++++++++----
 src/polybench/POLYBENCH_GEMM-Hip.cpp | 25 +++++++++++++++----
 src/polybench/POLYBENCH_GEMVER-Cuda.cpp | 25 +++++++++++++++----
 src/polybench/POLYBENCH_GEMVER-Hip.cpp | 25 +++++++++++++++----
 src/polybench/POLYBENCH_GESUMMV-Cuda.cpp | 25 +++++++++++++++----
 src/polybench/POLYBENCH_GESUMMV-Hip.cpp | 25 +++++++++++++++----
 src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp | 25 +++++++++++++++----
 src/polybench/POLYBENCH_HEAT_3D-Hip.cpp | 25 +++++++++++++++----
 src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp | 25 +++++++++++++++----
 src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp | 25 +++++++++++++++----
 src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp | 25 +++++++++++++++----
 src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp | 25 +++++++++++++++----
 src/polybench/POLYBENCH_MVT-Cuda.cpp | 25 +++++++++++++++----
 src/polybench/POLYBENCH_MVT-Hip.cpp | 25 +++++++++++++++----
 src/stream/ADD-Cuda.cpp | 25 +++++++++++++++----
 src/stream/ADD-Hip.cpp | 25 +++++++++++++++----
 src/stream/COPY-Cuda.cpp | 25 +++++++++++++++----
 src/stream/COPY-Hip.cpp | 25 +++++++++++++++----
 src/stream/DOT-Cuda.cpp | 25 +++++++++++++++----
 src/stream/DOT-Hip.cpp | 25 +++++++++++++++----
 src/stream/MUL-Cuda.cpp | 25 +++++++++++++++----
 src/stream/MUL-Hip.cpp | 25 +++++++++++++++----
 src/stream/TRIAD-Cuda.cpp | 25 +++++++++++++++----
 src/stream/TRIAD-Hip.cpp | 25 +++++++++++++++----
 99 files changed, 1980 insertions(+), 495 deletions(-)

diff --git a/src/apps/DEL_DOT_VEC_2D-Cuda.cpp b/src/apps/DEL_DOT_VEC_2D-Cuda.cpp
index 2fb5a77f5..0321add45 100644
--- a/src/apps/DEL_DOT_VEC_2D-Cuda.cpp
+++ b/src/apps/DEL_DOT_VEC_2D-Cuda.cpp
@@ -164,11 +164,26 @@ void DEL_DOT_VEC_2D::runCudaVariantImpl(VariantID vid)
 void DEL_DOT_VEC_2D::runCudaVariant(VariantID vid, size_t tid)
 {
-  if ( !gpu_block_size::invoke_or(
-         gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) {
-    std::cout << "\n DEL_DOT_VEC_2D : Unsupported Cuda block_size " << getActualGPUBlockSize()
-              <<" for variant id = " << vid << std::endl;
-  }
+  size_t t = 0;
+  seq_for(gpu_block_sizes_type{}, [&](auto block_size) {
+    if (run_params.numValidGPUBlockSize() == 0u ||
+        run_params.validGPUBlockSize(block_size)) {
+      if (tid == t) {
+        runCudaVariantImpl(vid);
+      }
+      t += 1;
+    }
+  });
+}
+
+void DEL_DOT_VEC_2D::setCudaTuningDefinitions(VariantID vid)
+{
+  seq_for(gpu_block_sizes_type{}, [&](auto block_size) {
+    if (run_params.numValidGPUBlockSize() == 0u ||
+        run_params.validGPUBlockSize(block_size)) {
+      addVariantTuningName(vid, "block_"+std::to_string(block_size));
+    }
+  });
 }
 } // end namespace apps
diff --git a/src/apps/DEL_DOT_VEC_2D-Hip.cpp b/src/apps/DEL_DOT_VEC_2D-Hip.cpp
index 0b0d6bca5..9c726e382 100644
--- a/src/apps/DEL_DOT_VEC_2D-Hip.cpp
+++ b/src/apps/DEL_DOT_VEC_2D-Hip.cpp
@@ -166,11 +166,26 @@ void DEL_DOT_VEC_2D::runHipVariantImpl(VariantID vid)
 void DEL_DOT_VEC_2D::runHipVariant(VariantID vid, size_t tid)
 {
-  if ( !gpu_block_size::invoke_or(
-         gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) {
-    std::cout << "\n DEL_DOT_VEC_2D : Unsupported Hip block_size " << getActualGPUBlockSize()
-              <<" for variant id = " << vid << std::endl;
-  }
+  size_t t = 0;
+  seq_for(gpu_block_sizes_type{}, [&](auto block_size) {
+    if (run_params.numValidGPUBlockSize() == 0u ||
+        run_params.validGPUBlockSize(block_size)) {
+      if (tid == t) {
+        runHipVariantImpl(vid);
+      }
+      t += 1;
+    }
+  });
+}
+
+void DEL_DOT_VEC_2D::setHipTuningDefinitions(VariantID vid)
+{
+  seq_for(gpu_block_sizes_type{}, [&](auto block_size) {
+    if (run_params.numValidGPUBlockSize() == 0u ||
+
run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace apps diff --git a/src/apps/ENERGY-Cuda.cpp b/src/apps/ENERGY-Cuda.cpp index 3d4d98662..250fd0507 100644 --- a/src/apps/ENERGY-Cuda.cpp +++ b/src/apps/ENERGY-Cuda.cpp @@ -270,11 +270,26 @@ void ENERGY::runCudaVariantImpl(VariantID vid) void ENERGY::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n ENERGY : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void ENERGY::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace apps diff --git a/src/apps/ENERGY-Hip.cpp b/src/apps/ENERGY-Hip.cpp index 0551fbf49..259e32e45 100644 --- a/src/apps/ENERGY-Hip.cpp +++ b/src/apps/ENERGY-Hip.cpp @@ -264,11 +264,26 @@ void ENERGY::runHipVariantImpl(VariantID vid) void ENERGY::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n ENERGY : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void ENERGY::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace apps diff --git a/src/apps/FIR-Cuda.cpp b/src/apps/FIR-Cuda.cpp index 5e22b1ad1..18ca73049 100644 --- a/src/apps/FIR-Cuda.cpp +++ b/src/apps/FIR-Cuda.cpp @@ -149,11 +149,26 @@ void FIR::runCudaVariantImpl(VariantID vid) void FIR::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n FIR : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void FIR::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace apps diff --git a/src/apps/FIR-Hip.cpp b/src/apps/FIR-Hip.cpp index 066a22368..a7b39e97f 100644 --- a/src/apps/FIR-Hip.cpp +++ b/src/apps/FIR-Hip.cpp @@ -149,11 +149,26 @@ void 
FIR::runHipVariantImpl(VariantID vid) void FIR::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n FIR : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void FIR::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace apps diff --git a/src/apps/HALOEXCHANGE-Cuda.cpp b/src/apps/HALOEXCHANGE-Cuda.cpp index 36e76e5b9..b20410556 100644 --- a/src/apps/HALOEXCHANGE-Cuda.cpp +++ b/src/apps/HALOEXCHANGE-Cuda.cpp @@ -171,11 +171,26 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) void HALOEXCHANGE::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n HALOEXCHANGE : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void HALOEXCHANGE::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace apps diff --git a/src/apps/HALOEXCHANGE-Hip.cpp b/src/apps/HALOEXCHANGE-Hip.cpp index ec3386859..c52d5d7ad 100644 --- a/src/apps/HALOEXCHANGE-Hip.cpp +++ b/src/apps/HALOEXCHANGE-Hip.cpp @@ -173,11 +173,26 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid) void HALOEXCHANGE::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n HALOEXCHANGE : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void HALOEXCHANGE::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace apps diff --git a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp index a9dc12b6f..5fad53890 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp @@ -272,11 +272,26 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) void HALOEXCHANGE_FUSED::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - 
gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n HALOEXCHANGE_FUSED : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void HALOEXCHANGE_FUSED::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace apps diff --git a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp index 15c6fc19a..c36461c8c 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp @@ -275,11 +275,26 @@ void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) void HALOEXCHANGE_FUSED::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n HALOEXCHANGE_FUSED : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void HALOEXCHANGE_FUSED::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace apps diff --git a/src/apps/LTIMES-Cuda.cpp b/src/apps/LTIMES-Cuda.cpp index fc64211a1..16a0bc955 100644 --- a/src/apps/LTIMES-Cuda.cpp +++ b/src/apps/LTIMES-Cuda.cpp @@ -190,11 +190,26 @@ void LTIMES::runCudaVariantImpl(VariantID vid) void LTIMES::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n LTIMES : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void LTIMES::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace apps diff --git a/src/apps/LTIMES-Hip.cpp b/src/apps/LTIMES-Hip.cpp index ee2aadba6..90671e12e 100644 --- a/src/apps/LTIMES-Hip.cpp +++ b/src/apps/LTIMES-Hip.cpp @@ -192,11 +192,26 @@ void LTIMES::runHipVariantImpl(VariantID vid) void LTIMES::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n LTIMES : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << 
std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void LTIMES::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace apps diff --git a/src/apps/LTIMES_NOVIEW-Cuda.cpp b/src/apps/LTIMES_NOVIEW-Cuda.cpp index 9efef1235..03411d43e 100644 --- a/src/apps/LTIMES_NOVIEW-Cuda.cpp +++ b/src/apps/LTIMES_NOVIEW-Cuda.cpp @@ -188,11 +188,26 @@ void LTIMES_NOVIEW::runCudaVariantImpl(VariantID vid) void LTIMES_NOVIEW::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n LTIMES_NOVIEW : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void LTIMES_NOVIEW::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace apps diff --git a/src/apps/LTIMES_NOVIEW-Hip.cpp b/src/apps/LTIMES_NOVIEW-Hip.cpp index 6d62cd96d..ba66573df 100644 --- a/src/apps/LTIMES_NOVIEW-Hip.cpp +++ b/src/apps/LTIMES_NOVIEW-Hip.cpp @@ -191,11 +191,26 @@ void LTIMES_NOVIEW::runHipVariantImpl(VariantID vid) void LTIMES_NOVIEW::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n LTIMES_NOVIEW : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void LTIMES_NOVIEW::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace apps diff --git a/src/apps/PRESSURE-Cuda.cpp b/src/apps/PRESSURE-Cuda.cpp index 6c122e2bc..28f798eaf 100644 --- a/src/apps/PRESSURE-Cuda.cpp +++ b/src/apps/PRESSURE-Cuda.cpp @@ -138,11 +138,26 @@ void PRESSURE::runCudaVariantImpl(VariantID vid) void PRESSURE::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n PRESSURE : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { 
+ if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void PRESSURE::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace apps diff --git a/src/apps/PRESSURE-Hip.cpp b/src/apps/PRESSURE-Hip.cpp index 13247fb2a..fa2bb1774 100644 --- a/src/apps/PRESSURE-Hip.cpp +++ b/src/apps/PRESSURE-Hip.cpp @@ -131,11 +131,26 @@ void PRESSURE::runHipVariantImpl(VariantID vid) void PRESSURE::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n PRESSURE : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void PRESSURE::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace apps diff --git a/src/apps/VOL3D-Cuda.cpp b/src/apps/VOL3D-Cuda.cpp index 059988124..237b2a66c 100644 --- a/src/apps/VOL3D-Cuda.cpp +++ b/src/apps/VOL3D-Cuda.cpp @@ -125,11 +125,26 @@ void VOL3D::runCudaVariantImpl(VariantID vid) void VOL3D::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n VOL3D : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void VOL3D::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace apps diff --git a/src/apps/VOL3D-Hip.cpp b/src/apps/VOL3D-Hip.cpp index 00211a181..bea41cb8e 100644 --- a/src/apps/VOL3D-Hip.cpp +++ b/src/apps/VOL3D-Hip.cpp @@ -125,11 +125,26 @@ void VOL3D::runHipVariantImpl(VariantID vid) void VOL3D::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n VOL3D : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void VOL3D::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, 
"block_"+std::to_string(block_size)); + } + }); } } // end namespace apps diff --git a/src/basic/DAXPY-Hip.cpp b/src/basic/DAXPY-Hip.cpp index 386fc27eb..de629438d 100644 --- a/src/basic/DAXPY-Hip.cpp +++ b/src/basic/DAXPY-Hip.cpp @@ -115,11 +115,26 @@ void DAXPY::runHipVariantImpl(VariantID vid) void DAXPY::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n DAXPY : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void DAXPY::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace basic diff --git a/src/basic/IF_QUAD-Cuda.cpp b/src/basic/IF_QUAD-Cuda.cpp index adc13e08d..076bee112 100644 --- a/src/basic/IF_QUAD-Cuda.cpp +++ b/src/basic/IF_QUAD-Cuda.cpp @@ -119,11 +119,26 @@ void IF_QUAD::runCudaVariantImpl(VariantID vid) void IF_QUAD::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n IF_QUAD : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void IF_QUAD::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace basic diff --git a/src/basic/IF_QUAD-Hip.cpp b/src/basic/IF_QUAD-Hip.cpp index 9b0c8e49b..b20fde441 100644 --- a/src/basic/IF_QUAD-Hip.cpp +++ b/src/basic/IF_QUAD-Hip.cpp @@ -122,11 +122,26 @@ void IF_QUAD::runHipVariantImpl(VariantID vid) void IF_QUAD::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n IF_QUAD : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void IF_QUAD::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace basic diff --git a/src/basic/INIT3-Cuda.cpp b/src/basic/INIT3-Cuda.cpp index c2f3fa753..c33279aa3 100644 --- a/src/basic/INIT3-Cuda.cpp +++ b/src/basic/INIT3-Cuda.cpp @@ -121,11 +121,26 @@ void 
INIT3::runCudaVariantImpl(VariantID vid) void INIT3::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n INIT3 : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void INIT3::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace basic diff --git a/src/basic/INIT3-Hip.cpp b/src/basic/INIT3-Hip.cpp index c16b1e546..c62e04c65 100644 --- a/src/basic/INIT3-Hip.cpp +++ b/src/basic/INIT3-Hip.cpp @@ -123,11 +123,26 @@ void INIT3::runHipVariantImpl(VariantID vid) void INIT3::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n INIT3 : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void INIT3::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace basic diff --git a/src/basic/INIT_VIEW1D-Cuda.cpp b/src/basic/INIT_VIEW1D-Cuda.cpp index 30f964406..ef52b3fb9 100644 --- a/src/basic/INIT_VIEW1D-Cuda.cpp +++ b/src/basic/INIT_VIEW1D-Cuda.cpp @@ -112,11 +112,26 @@ void INIT_VIEW1D::runCudaVariantImpl(VariantID vid) void INIT_VIEW1D::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n INIT_VIEW1D : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void INIT_VIEW1D::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace basic diff --git a/src/basic/INIT_VIEW1D-Hip.cpp b/src/basic/INIT_VIEW1D-Hip.cpp index 415c66b5d..33a00315f 100644 --- a/src/basic/INIT_VIEW1D-Hip.cpp +++ b/src/basic/INIT_VIEW1D-Hip.cpp @@ -115,11 +115,26 @@ void INIT_VIEW1D::runHipVariantImpl(VariantID vid) void INIT_VIEW1D::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n INIT_VIEW1D 
: Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void INIT_VIEW1D::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace basic diff --git a/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp b/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp index fe331db1c..2db8c0472 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp @@ -115,11 +115,26 @@ void INIT_VIEW1D_OFFSET::runCudaVariantImpl(VariantID vid) void INIT_VIEW1D_OFFSET::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n INIT_VIEW1D_OFFSET : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void INIT_VIEW1D_OFFSET::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace basic diff --git a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp index 40d061b18..6ffc406b8 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp @@ -116,11 +116,26 @@ void INIT_VIEW1D_OFFSET::runHipVariantImpl(VariantID vid) void INIT_VIEW1D_OFFSET::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n INIT_VIEW1D_OFFSET : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void INIT_VIEW1D_OFFSET::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace basic diff --git a/src/basic/MAT_MAT_SHARED-Cuda.cpp b/src/basic/MAT_MAT_SHARED-Cuda.cpp index 7d143a928..221b3f400 100644 --- a/src/basic/MAT_MAT_SHARED-Cuda.cpp +++ b/src/basic/MAT_MAT_SHARED-Cuda.cpp @@ -307,11 +307,26 @@ void MAT_MAT_SHARED::runCudaVariantImpl(VariantID vid) void MAT_MAT_SHARED::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n MAT_MAT_SHARED : Unsupported Cuda block_size " << 
getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void MAT_MAT_SHARED::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace basic diff --git a/src/basic/MAT_MAT_SHARED-Hip.cpp b/src/basic/MAT_MAT_SHARED-Hip.cpp index 1e6bb9f24..4f82e3fb5 100644 --- a/src/basic/MAT_MAT_SHARED-Hip.cpp +++ b/src/basic/MAT_MAT_SHARED-Hip.cpp @@ -310,11 +310,26 @@ void MAT_MAT_SHARED::runHipVariantImpl(VariantID vid) void MAT_MAT_SHARED::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n MAT_MAT_SHARED : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void MAT_MAT_SHARED::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace basic diff --git a/src/basic/MULADDSUB-Cuda.cpp b/src/basic/MULADDSUB-Cuda.cpp index 402efdc84..e6c50f277 100644 --- a/src/basic/MULADDSUB-Cuda.cpp +++ b/src/basic/MULADDSUB-Cuda.cpp @@ -121,11 +121,26 @@ void MULADDSUB::runCudaVariantImpl(VariantID vid) void MULADDSUB::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n MULADDSUB : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void MULADDSUB::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace basic diff --git a/src/basic/MULADDSUB-Hip.cpp b/src/basic/MULADDSUB-Hip.cpp index dcf579575..144161429 100644 --- a/src/basic/MULADDSUB-Hip.cpp +++ b/src/basic/MULADDSUB-Hip.cpp @@ -123,11 +123,26 @@ void MULADDSUB::runHipVariantImpl(VariantID vid) void MULADDSUB::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n MULADDSUB : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if 
(run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void MULADDSUB::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace basic diff --git a/src/basic/NESTED_INIT-Cuda.cpp b/src/basic/NESTED_INIT-Cuda.cpp index 0abde5ee4..6dfc548f8 100644 --- a/src/basic/NESTED_INIT-Cuda.cpp +++ b/src/basic/NESTED_INIT-Cuda.cpp @@ -174,11 +174,26 @@ void NESTED_INIT::runCudaVariantImpl(VariantID vid) void NESTED_INIT::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n NESTED_INIT : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void NESTED_INIT::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace basic diff --git a/src/basic/NESTED_INIT-Hip.cpp b/src/basic/NESTED_INIT-Hip.cpp index 5d9409b82..743b344dc 100644 --- a/src/basic/NESTED_INIT-Hip.cpp +++ b/src/basic/NESTED_INIT-Hip.cpp @@ -176,11 +176,26 @@ void NESTED_INIT::runHipVariantImpl(VariantID vid) void NESTED_INIT::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n NESTED_INIT : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void NESTED_INIT::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace basic diff --git a/src/basic/PI_ATOMIC-Cuda.cpp b/src/basic/PI_ATOMIC-Cuda.cpp index 4032d59f0..888ce52c6 100644 --- a/src/basic/PI_ATOMIC-Cuda.cpp +++ b/src/basic/PI_ATOMIC-Cuda.cpp @@ -127,11 +127,26 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) void PI_ATOMIC::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n PI_ATOMIC : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void 
PI_ATOMIC::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace basic diff --git a/src/basic/PI_ATOMIC-Hip.cpp b/src/basic/PI_ATOMIC-Hip.cpp index 775972364..11b8e350d 100644 --- a/src/basic/PI_ATOMIC-Hip.cpp +++ b/src/basic/PI_ATOMIC-Hip.cpp @@ -129,11 +129,26 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) void PI_ATOMIC::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n PI_ATOMIC : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void PI_ATOMIC::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace basic diff --git a/src/basic/PI_REDUCE-Cuda.cpp b/src/basic/PI_REDUCE-Cuda.cpp index efabb5aba..147c50fa6 100644 --- a/src/basic/PI_REDUCE-Cuda.cpp +++ b/src/basic/PI_REDUCE-Cuda.cpp @@ -119,11 +119,26 @@ void PI_REDUCE::runCudaVariantImpl(VariantID vid) void PI_REDUCE::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n PI_REDUCE : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void PI_REDUCE::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace basic diff --git a/src/basic/PI_REDUCE-Hip.cpp b/src/basic/PI_REDUCE-Hip.cpp index fe88ef970..d9bbc2310 100644 --- a/src/basic/PI_REDUCE-Hip.cpp +++ b/src/basic/PI_REDUCE-Hip.cpp @@ -118,11 +118,26 @@ void PI_REDUCE::runHipVariantImpl(VariantID vid) void PI_REDUCE::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n PI_REDUCE : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void PI_REDUCE::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + 
addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace basic diff --git a/src/basic/REDUCE3_INT-Cuda.cpp b/src/basic/REDUCE3_INT-Cuda.cpp index a6e69c952..7538c2bcc 100644 --- a/src/basic/REDUCE3_INT-Cuda.cpp +++ b/src/basic/REDUCE3_INT-Cuda.cpp @@ -163,11 +163,26 @@ void REDUCE3_INT::runCudaVariantImpl(VariantID vid) void REDUCE3_INT::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n REDUCE3_INT : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void REDUCE3_INT::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace basic diff --git a/src/basic/REDUCE3_INT-Hip.cpp b/src/basic/REDUCE3_INT-Hip.cpp index ad0abdef9..dae55300b 100644 --- a/src/basic/REDUCE3_INT-Hip.cpp +++ b/src/basic/REDUCE3_INT-Hip.cpp @@ -163,11 +163,26 @@ void REDUCE3_INT::runHipVariantImpl(VariantID vid) void REDUCE3_INT::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n REDUCE3_INT : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void REDUCE3_INT::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace basic diff --git a/src/basic/TRAP_INT-Cuda.cpp b/src/basic/TRAP_INT-Cuda.cpp index 717038de5..36a25371b 100644 --- a/src/basic/TRAP_INT-Cuda.cpp +++ b/src/basic/TRAP_INT-Cuda.cpp @@ -153,11 +153,26 @@ void TRAP_INT::runCudaVariantImpl(VariantID vid) void TRAP_INT::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n TRAP_INT : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void TRAP_INT::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace basic diff --git a/src/basic/TRAP_INT-Hip.cpp b/src/basic/TRAP_INT-Hip.cpp index 
4b1c834d7..0ab55f25a 100644 --- a/src/basic/TRAP_INT-Hip.cpp +++ b/src/basic/TRAP_INT-Hip.cpp @@ -152,11 +152,26 @@ void TRAP_INT::runHipVariantImpl(VariantID vid) void TRAP_INT::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n TRAP_INT : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void TRAP_INT::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace basic diff --git a/src/lcals/DIFF_PREDICT-Cuda.cpp b/src/lcals/DIFF_PREDICT-Cuda.cpp index 68adea902..66ea063e9 100644 --- a/src/lcals/DIFF_PREDICT-Cuda.cpp +++ b/src/lcals/DIFF_PREDICT-Cuda.cpp @@ -94,11 +94,26 @@ void DIFF_PREDICT::runCudaVariantImpl(VariantID vid) void DIFF_PREDICT::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n DIFF_PREDICT : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void DIFF_PREDICT::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace lcals diff --git a/src/lcals/DIFF_PREDICT-Hip.cpp b/src/lcals/DIFF_PREDICT-Hip.cpp index 27bf9d348..7ad4cfe52 100644 --- a/src/lcals/DIFF_PREDICT-Hip.cpp +++ b/src/lcals/DIFF_PREDICT-Hip.cpp @@ -94,11 +94,26 @@ void DIFF_PREDICT::runHipVariantImpl(VariantID vid) void DIFF_PREDICT::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n DIFF_PREDICT : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void DIFF_PREDICT::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace lcals diff --git a/src/lcals/EOS-Cuda.cpp b/src/lcals/EOS-Cuda.cpp index 0de00b2d5..4d5417274 100644 --- a/src/lcals/EOS-Cuda.cpp +++ b/src/lcals/EOS-Cuda.cpp @@ -98,11 +98,26 @@ void EOS::runCudaVariantImpl(VariantID vid) void EOS::runCudaVariant(VariantID vid, size_t tid) 
{ - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n EOS : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void EOS::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace lcals diff --git a/src/lcals/EOS-Hip.cpp b/src/lcals/EOS-Hip.cpp index add8f5e22..6ed0f263c 100644 --- a/src/lcals/EOS-Hip.cpp +++ b/src/lcals/EOS-Hip.cpp @@ -98,11 +98,26 @@ void EOS::runHipVariantImpl(VariantID vid) void EOS::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n EOS : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void EOS::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace lcals diff --git a/src/lcals/FIRST_DIFF-Cuda.cpp b/src/lcals/FIRST_DIFF-Cuda.cpp index a8e92b781..413882606 100644 --- a/src/lcals/FIRST_DIFF-Cuda.cpp +++ b/src/lcals/FIRST_DIFF-Cuda.cpp @@ -92,11 +92,26 @@ void FIRST_DIFF::runCudaVariantImpl(VariantID vid) void FIRST_DIFF::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n FIRST_DIFF : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void FIRST_DIFF::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace lcals diff --git a/src/lcals/FIRST_DIFF-Hip.cpp b/src/lcals/FIRST_DIFF-Hip.cpp index f38333511..8c617c805 100644 --- a/src/lcals/FIRST_DIFF-Hip.cpp +++ b/src/lcals/FIRST_DIFF-Hip.cpp @@ -92,11 +92,26 @@ void FIRST_DIFF::runHipVariantImpl(VariantID vid) void FIRST_DIFF::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n FIRST_DIFF : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + 
seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void FIRST_DIFF::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace lcals diff --git a/src/lcals/FIRST_MIN-Cuda.cpp b/src/lcals/FIRST_MIN-Cuda.cpp index 0d16e24d6..9befa1b5e 100644 --- a/src/lcals/FIRST_MIN-Cuda.cpp +++ b/src/lcals/FIRST_MIN-Cuda.cpp @@ -132,11 +132,26 @@ void FIRST_MIN::runCudaVariantImpl(VariantID vid) void FIRST_MIN::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n FIRST_MIN : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void FIRST_MIN::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace lcals diff --git a/src/lcals/FIRST_MIN-Hip.cpp b/src/lcals/FIRST_MIN-Hip.cpp index 7aaf2a144..45198c57e 100644 --- a/src/lcals/FIRST_MIN-Hip.cpp +++ b/src/lcals/FIRST_MIN-Hip.cpp @@ -132,11 +132,26 @@ void FIRST_MIN::runHipVariantImpl(VariantID vid) void FIRST_MIN::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n FIRST_MIN : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void FIRST_MIN::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace lcals diff --git a/src/lcals/FIRST_SUM-Cuda.cpp b/src/lcals/FIRST_SUM-Cuda.cpp index 1422977f7..800581050 100644 --- a/src/lcals/FIRST_SUM-Cuda.cpp +++ b/src/lcals/FIRST_SUM-Cuda.cpp @@ -92,11 +92,26 @@ void FIRST_SUM::runCudaVariantImpl(VariantID vid) void FIRST_SUM::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n FIRST_SUM : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} 
+ +void FIRST_SUM::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace lcals diff --git a/src/lcals/FIRST_SUM-Hip.cpp b/src/lcals/FIRST_SUM-Hip.cpp index 04e761a33..54bd2938e 100644 --- a/src/lcals/FIRST_SUM-Hip.cpp +++ b/src/lcals/FIRST_SUM-Hip.cpp @@ -92,11 +92,26 @@ void FIRST_SUM::runHipVariantImpl(VariantID vid) void FIRST_SUM::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n FIRST_SUM : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void FIRST_SUM::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace lcals diff --git a/src/lcals/GEN_LIN_RECUR-Cuda.cpp b/src/lcals/GEN_LIN_RECUR-Cuda.cpp index 31e30a042..05866434e 100644 --- a/src/lcals/GEN_LIN_RECUR-Cuda.cpp +++ b/src/lcals/GEN_LIN_RECUR-Cuda.cpp @@ -121,11 +121,26 @@ void GEN_LIN_RECUR::runCudaVariantImpl(VariantID vid) void GEN_LIN_RECUR::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n GEN_LIN_RECUR : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void GEN_LIN_RECUR::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace lcals diff --git a/src/lcals/GEN_LIN_RECUR-Hip.cpp b/src/lcals/GEN_LIN_RECUR-Hip.cpp index 6d6a2843c..429a09f19 100644 --- a/src/lcals/GEN_LIN_RECUR-Hip.cpp +++ b/src/lcals/GEN_LIN_RECUR-Hip.cpp @@ -123,11 +123,26 @@ void GEN_LIN_RECUR::runHipVariantImpl(VariantID vid) void GEN_LIN_RECUR::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n GEN_LIN_RECUR : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void GEN_LIN_RECUR::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || 
+ run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace lcals diff --git a/src/lcals/HYDRO_1D-Cuda.cpp b/src/lcals/HYDRO_1D-Cuda.cpp index ea2874d56..2b955f481 100644 --- a/src/lcals/HYDRO_1D-Cuda.cpp +++ b/src/lcals/HYDRO_1D-Cuda.cpp @@ -96,11 +96,26 @@ void HYDRO_1D::runCudaVariantImpl(VariantID vid) void HYDRO_1D::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n HYDRO_1D : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void HYDRO_1D::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace lcals diff --git a/src/lcals/HYDRO_1D-Hip.cpp b/src/lcals/HYDRO_1D-Hip.cpp index 4947f3a18..ba94370d6 100644 --- a/src/lcals/HYDRO_1D-Hip.cpp +++ b/src/lcals/HYDRO_1D-Hip.cpp @@ -96,11 +96,26 @@ void HYDRO_1D::runHipVariantImpl(VariantID vid) void HYDRO_1D::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n HYDRO_1D : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void HYDRO_1D::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace lcals diff --git a/src/lcals/HYDRO_2D-Cuda.cpp b/src/lcals/HYDRO_2D-Cuda.cpp index a4c39eb77..440e1fec1 100644 --- a/src/lcals/HYDRO_2D-Cuda.cpp +++ b/src/lcals/HYDRO_2D-Cuda.cpp @@ -223,11 +223,26 @@ void HYDRO_2D::runCudaVariantImpl(VariantID vid) void HYDRO_2D::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n HYDRO_2D : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void HYDRO_2D::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace lcals diff --git a/src/lcals/HYDRO_2D-Hip.cpp b/src/lcals/HYDRO_2D-Hip.cpp index b7e8589f7..3c2666463 
100644 --- a/src/lcals/HYDRO_2D-Hip.cpp +++ b/src/lcals/HYDRO_2D-Hip.cpp @@ -225,11 +225,26 @@ void HYDRO_2D::runHipVariantImpl(VariantID vid) void HYDRO_2D::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n HYDRO_2D : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void HYDRO_2D::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace lcals diff --git a/src/lcals/INT_PREDICT-Cuda.cpp b/src/lcals/INT_PREDICT-Cuda.cpp index b9b00f969..514d9a8e4 100644 --- a/src/lcals/INT_PREDICT-Cuda.cpp +++ b/src/lcals/INT_PREDICT-Cuda.cpp @@ -97,11 +97,26 @@ void INT_PREDICT::runCudaVariantImpl(VariantID vid) void INT_PREDICT::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n INT_PREDICT : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void INT_PREDICT::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace lcals diff --git a/src/lcals/INT_PREDICT-Hip.cpp b/src/lcals/INT_PREDICT-Hip.cpp index 7e194659e..b28c95222 100644 --- a/src/lcals/INT_PREDICT-Hip.cpp +++ b/src/lcals/INT_PREDICT-Hip.cpp @@ -97,11 +97,26 @@ void INT_PREDICT::runHipVariantImpl(VariantID vid) void INT_PREDICT::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n INT_PREDICT : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void INT_PREDICT::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace lcals diff --git a/src/lcals/PLANCKIAN-Cuda.cpp b/src/lcals/PLANCKIAN-Cuda.cpp index bbafd6b91..95a7b64da 100644 --- a/src/lcals/PLANCKIAN-Cuda.cpp +++ b/src/lcals/PLANCKIAN-Cuda.cpp @@ -101,11 +101,26 @@ void PLANCKIAN::runCudaVariantImpl(VariantID vid) void PLANCKIAN::runCudaVariant(VariantID vid, size_t 
tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n PLANCKIAN : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void PLANCKIAN::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace lcals diff --git a/src/lcals/PLANCKIAN-Hip.cpp b/src/lcals/PLANCKIAN-Hip.cpp index b4369610f..547d415ef 100644 --- a/src/lcals/PLANCKIAN-Hip.cpp +++ b/src/lcals/PLANCKIAN-Hip.cpp @@ -101,11 +101,26 @@ void PLANCKIAN::runHipVariantImpl(VariantID vid) void PLANCKIAN::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n PLANCKIAN : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void PLANCKIAN::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace lcals diff --git a/src/lcals/TRIDIAG_ELIM-Cuda.cpp b/src/lcals/TRIDIAG_ELIM-Cuda.cpp index ed7000d3f..4241ec936 100644 --- a/src/lcals/TRIDIAG_ELIM-Cuda.cpp +++ b/src/lcals/TRIDIAG_ELIM-Cuda.cpp @@ -97,11 +97,26 @@ void TRIDIAG_ELIM::runCudaVariantImpl(VariantID vid) void TRIDIAG_ELIM::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n TRIDIAG_ELIM : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void TRIDIAG_ELIM::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace lcals diff --git a/src/lcals/TRIDIAG_ELIM-Hip.cpp b/src/lcals/TRIDIAG_ELIM-Hip.cpp index caeb51d8b..fa63674ea 100644 --- a/src/lcals/TRIDIAG_ELIM-Hip.cpp +++ b/src/lcals/TRIDIAG_ELIM-Hip.cpp @@ -96,11 +96,26 @@ void TRIDIAG_ELIM::runHipVariantImpl(VariantID vid) void TRIDIAG_ELIM::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n TRIDIAG_ELIM : Unsupported Hip block_size " << 
getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void TRIDIAG_ELIM::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace lcals diff --git a/src/polybench/POLYBENCH_2MM-Cuda.cpp b/src/polybench/POLYBENCH_2MM-Cuda.cpp index 000292232..fc03446c6 100644 --- a/src/polybench/POLYBENCH_2MM-Cuda.cpp +++ b/src/polybench/POLYBENCH_2MM-Cuda.cpp @@ -277,11 +277,26 @@ void POLYBENCH_2MM::runCudaVariantImpl(VariantID vid) void POLYBENCH_2MM::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n POLYBENCH_2MM : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void POLYBENCH_2MM::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_2MM-Hip.cpp b/src/polybench/POLYBENCH_2MM-Hip.cpp index eddc74251..353336865 100644 --- a/src/polybench/POLYBENCH_2MM-Hip.cpp +++ b/src/polybench/POLYBENCH_2MM-Hip.cpp @@ -280,11 +280,26 @@ void POLYBENCH_2MM::runHipVariantImpl(VariantID vid) void POLYBENCH_2MM::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n POLYBENCH_2MM : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void POLYBENCH_2MM::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_3MM-Cuda.cpp b/src/polybench/POLYBENCH_3MM-Cuda.cpp index 24fc49484..2c434e3a5 100644 --- a/src/polybench/POLYBENCH_3MM-Cuda.cpp +++ b/src/polybench/POLYBENCH_3MM-Cuda.cpp @@ -355,11 +355,26 @@ void POLYBENCH_3MM::runCudaVariantImpl(VariantID vid) void POLYBENCH_3MM::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n POLYBENCH_3MM : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; 
- } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void POLYBENCH_3MM::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_3MM-Hip.cpp b/src/polybench/POLYBENCH_3MM-Hip.cpp index e8b6884aa..04068680c 100644 --- a/src/polybench/POLYBENCH_3MM-Hip.cpp +++ b/src/polybench/POLYBENCH_3MM-Hip.cpp @@ -360,11 +360,26 @@ void POLYBENCH_3MM::runHipVariantImpl(VariantID vid) void POLYBENCH_3MM::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n POLYBENCH_3MM : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void POLYBENCH_3MM::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_ADI-Cuda.cpp b/src/polybench/POLYBENCH_ADI-Cuda.cpp index 3a76e6429..baf9949a7 100644 --- a/src/polybench/POLYBENCH_ADI-Cuda.cpp +++ b/src/polybench/POLYBENCH_ADI-Cuda.cpp @@ -250,11 +250,26 @@ void POLYBENCH_ADI::runCudaVariantImpl(VariantID vid) void POLYBENCH_ADI::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n POLYBENCH_ADI : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void POLYBENCH_ADI::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_ADI-Hip.cpp b/src/polybench/POLYBENCH_ADI-Hip.cpp index d656dc93c..4d95ddd71 100644 --- a/src/polybench/POLYBENCH_ADI-Hip.cpp +++ b/src/polybench/POLYBENCH_ADI-Hip.cpp @@ -259,11 +259,26 @@ void POLYBENCH_ADI::runHipVariantImpl(VariantID vid) void POLYBENCH_ADI::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n POLYBENCH_ADI : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto 
block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void POLYBENCH_ADI::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_ATAX-Cuda.cpp b/src/polybench/POLYBENCH_ATAX-Cuda.cpp index 7ae84e146..fc5e98e89 100644 --- a/src/polybench/POLYBENCH_ATAX-Cuda.cpp +++ b/src/polybench/POLYBENCH_ATAX-Cuda.cpp @@ -232,11 +232,26 @@ void POLYBENCH_ATAX::runCudaVariantImpl(VariantID vid) void POLYBENCH_ATAX::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n POLYBENCH_ATAX : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void POLYBENCH_ATAX::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_ATAX-Hip.cpp b/src/polybench/POLYBENCH_ATAX-Hip.cpp index 05ff793e3..1176296d7 100644 --- a/src/polybench/POLYBENCH_ATAX-Hip.cpp +++ b/src/polybench/POLYBENCH_ATAX-Hip.cpp @@ -239,11 +239,26 @@ void POLYBENCH_ATAX::runHipVariantImpl(VariantID vid) void POLYBENCH_ATAX::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n POLYBENCH_ATAX : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void POLYBENCH_ATAX::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp b/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp index e6e07f5c8..1be284204 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp @@ -322,11 +322,26 @@ void POLYBENCH_FDTD_2D::runCudaVariantImpl(VariantID vid) void POLYBENCH_FDTD_2D::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n POLYBENCH_FDTD_2D : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if 
(run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void POLYBENCH_FDTD_2D::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp b/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp index bee3ea28f..8c4689a41 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp @@ -334,11 +334,26 @@ void POLYBENCH_FDTD_2D::runHipVariantImpl(VariantID vid) void POLYBENCH_FDTD_2D::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n POLYBENCH_FDTD_2D : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void POLYBENCH_FDTD_2D::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp index ab84e9d47..9aa453ee1 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp @@ -181,11 +181,26 @@ void POLYBENCH_FLOYD_WARSHALL::runCudaVariantImpl(VariantID vid) void POLYBENCH_FLOYD_WARSHALL::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n POLYBENCH_FLOYD_WARSHALL : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void POLYBENCH_FLOYD_WARSHALL::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp index aa6832bed..cc1214774 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp @@ -185,11 +185,26 @@ void POLYBENCH_FLOYD_WARSHALL::runHipVariantImpl(VariantID vid) void POLYBENCH_FLOYD_WARSHALL::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n POLYBENCH_FLOYD_WARSHALL : Unsupported Hip block_size " << 
getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void POLYBENCH_FLOYD_WARSHALL::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_GEMM-Cuda.cpp b/src/polybench/POLYBENCH_GEMM-Cuda.cpp index 5e2955f3f..53b8c049c 100644 --- a/src/polybench/POLYBENCH_GEMM-Cuda.cpp +++ b/src/polybench/POLYBENCH_GEMM-Cuda.cpp @@ -207,11 +207,26 @@ void POLYBENCH_GEMM::runCudaVariantImpl(VariantID vid) void POLYBENCH_GEMM::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n POLYBENCH_GEMM : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void POLYBENCH_GEMM::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_GEMM-Hip.cpp b/src/polybench/POLYBENCH_GEMM-Hip.cpp index de096b702..70c3dd87d 100644 --- a/src/polybench/POLYBENCH_GEMM-Hip.cpp +++ b/src/polybench/POLYBENCH_GEMM-Hip.cpp @@ -208,11 +208,26 @@ void POLYBENCH_GEMM::runHipVariantImpl(VariantID vid) void POLYBENCH_GEMM::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n POLYBENCH_GEMM : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void POLYBENCH_GEMM::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_GEMVER-Cuda.cpp b/src/polybench/POLYBENCH_GEMVER-Cuda.cpp index 64005cecb..3da0fca96 100644 --- a/src/polybench/POLYBENCH_GEMVER-Cuda.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Cuda.cpp @@ -344,11 +344,26 @@ void POLYBENCH_GEMVER::runCudaVariantImpl(VariantID vid) void POLYBENCH_GEMVER::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n POLYBENCH_GEMVER : Unsupported Cuda block_size " << 
getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void POLYBENCH_GEMVER::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_GEMVER-Hip.cpp b/src/polybench/POLYBENCH_GEMVER-Hip.cpp index be98ed54e..705b4b477 100644 --- a/src/polybench/POLYBENCH_GEMVER-Hip.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Hip.cpp @@ -352,11 +352,26 @@ void POLYBENCH_GEMVER::runHipVariantImpl(VariantID vid) void POLYBENCH_GEMVER::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n POLYBENCH_GEMVER : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void POLYBENCH_GEMVER::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp b/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp index 38093f14d..c33dd709f 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp @@ -139,11 +139,26 @@ void POLYBENCH_GESUMMV::runCudaVariantImpl(VariantID vid) void POLYBENCH_GESUMMV::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n POLYBENCH_GESUMMV : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void POLYBENCH_GESUMMV::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_GESUMMV-Hip.cpp b/src/polybench/POLYBENCH_GESUMMV-Hip.cpp index 7627d1091..394d4ea3a 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Hip.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Hip.cpp @@ -141,11 +141,26 @@ void POLYBENCH_GESUMMV::runHipVariantImpl(VariantID vid) void POLYBENCH_GESUMMV::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n POLYBENCH_GESUMMV : Unsupported Hip 
block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void POLYBENCH_GESUMMV::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp b/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp index 236738bba..0b580a582 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp @@ -223,11 +223,26 @@ void POLYBENCH_HEAT_3D::runCudaVariantImpl(VariantID vid) void POLYBENCH_HEAT_3D::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n POLYBENCH_HEAT_3D : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void POLYBENCH_HEAT_3D::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp b/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp index 070aab354..3bbdbeaf0 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp @@ -229,11 +229,26 @@ void POLYBENCH_HEAT_3D::runHipVariantImpl(VariantID vid) void POLYBENCH_HEAT_3D::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n POLYBENCH_HEAT_3D : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void POLYBENCH_HEAT_3D::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp index cec48a1ec..5dc98167d 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp @@ -122,11 +122,26 @@ void POLYBENCH_JACOBI_1D::runCudaVariantImpl(VariantID vid) void POLYBENCH_JACOBI_1D::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout 
<< "\n POLYBENCH_JACOBI_1D : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void POLYBENCH_JACOBI_1D::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp index bb0e8aa9c..356e3f572 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp @@ -124,11 +124,26 @@ void POLYBENCH_JACOBI_1D::runHipVariantImpl(VariantID vid) void POLYBENCH_JACOBI_1D::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n POLYBENCH_JACOBI_1D : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void POLYBENCH_JACOBI_1D::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp index 47697964f..e1069de88 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp @@ -213,11 +213,26 @@ void POLYBENCH_JACOBI_2D::runCudaVariantImpl(VariantID vid) void POLYBENCH_JACOBI_2D::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n POLYBENCH_JACOBI_2D : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void POLYBENCH_JACOBI_2D::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp index fadd87e04..875646316 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp @@ -219,11 +219,26 @@ void POLYBENCH_JACOBI_2D::runHipVariantImpl(VariantID vid) void POLYBENCH_JACOBI_2D::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - 
gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n POLYBENCH_JACOBI_2D : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void POLYBENCH_JACOBI_2D::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_MVT-Cuda.cpp b/src/polybench/POLYBENCH_MVT-Cuda.cpp index d22500cf2..2d6b471ad 100644 --- a/src/polybench/POLYBENCH_MVT-Cuda.cpp +++ b/src/polybench/POLYBENCH_MVT-Cuda.cpp @@ -180,11 +180,26 @@ void POLYBENCH_MVT::runCudaVariantImpl(VariantID vid) void POLYBENCH_MVT::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n POLYBENCH_MVT : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void POLYBENCH_MVT::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace polybench diff --git a/src/polybench/POLYBENCH_MVT-Hip.cpp b/src/polybench/POLYBENCH_MVT-Hip.cpp index 2cb5135a1..3d7b1f53b 100644 --- a/src/polybench/POLYBENCH_MVT-Hip.cpp +++ b/src/polybench/POLYBENCH_MVT-Hip.cpp @@ -178,11 +178,26 @@ void POLYBENCH_MVT::runHipVariantImpl(VariantID vid) void POLYBENCH_MVT::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n POLYBENCH_MVT : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void POLYBENCH_MVT::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace polybench diff --git a/src/stream/ADD-Cuda.cpp b/src/stream/ADD-Cuda.cpp index 0875ee939..437b6f584 100644 --- a/src/stream/ADD-Cuda.cpp +++ b/src/stream/ADD-Cuda.cpp @@ -113,11 +113,26 @@ void ADD::runCudaVariantImpl(VariantID vid) void ADD::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n ADD : Unsupported Cuda block_size 
" << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void ADD::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace stream diff --git a/src/stream/ADD-Hip.cpp b/src/stream/ADD-Hip.cpp index 12ffe4863..92db32f11 100644 --- a/src/stream/ADD-Hip.cpp +++ b/src/stream/ADD-Hip.cpp @@ -115,11 +115,26 @@ void ADD::runHipVariantImpl(VariantID vid) void ADD::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n ADD : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void ADD::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace stream diff --git a/src/stream/COPY-Cuda.cpp b/src/stream/COPY-Cuda.cpp index 4f5168c81..9bb4cc899 100644 --- a/src/stream/COPY-Cuda.cpp +++ b/src/stream/COPY-Cuda.cpp @@ -111,11 +111,26 @@ void COPY::runCudaVariantImpl(VariantID vid) void COPY::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n COPY : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void COPY::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace stream diff --git a/src/stream/COPY-Hip.cpp b/src/stream/COPY-Hip.cpp index 336a0f2f2..72f6647de 100644 --- a/src/stream/COPY-Hip.cpp +++ b/src/stream/COPY-Hip.cpp @@ -113,11 +113,26 @@ void COPY::runHipVariantImpl(VariantID vid) void COPY::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n COPY : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void 
COPY::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace stream diff --git a/src/stream/DOT-Cuda.cpp b/src/stream/DOT-Cuda.cpp index 17811d085..252de4b11 100644 --- a/src/stream/DOT-Cuda.cpp +++ b/src/stream/DOT-Cuda.cpp @@ -132,11 +132,26 @@ void DOT::runCudaVariantImpl(VariantID vid) void DOT::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n DOT : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void DOT::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace stream diff --git a/src/stream/DOT-Hip.cpp b/src/stream/DOT-Hip.cpp index ee52661a9..005d5465d 100644 --- a/src/stream/DOT-Hip.cpp +++ b/src/stream/DOT-Hip.cpp @@ -134,11 +134,26 @@ void DOT::runHipVariantImpl(VariantID vid) void DOT::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n DOT : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void DOT::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace stream diff --git a/src/stream/MUL-Cuda.cpp b/src/stream/MUL-Cuda.cpp index 3cb59bd72..3761c4761 100644 --- a/src/stream/MUL-Cuda.cpp +++ b/src/stream/MUL-Cuda.cpp @@ -111,11 +111,26 @@ void MUL::runCudaVariantImpl(VariantID vid) void MUL::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n MUL : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void MUL::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace stream diff --git a/src/stream/MUL-Hip.cpp 
b/src/stream/MUL-Hip.cpp index e2d261525..8f6e20636 100644 --- a/src/stream/MUL-Hip.cpp +++ b/src/stream/MUL-Hip.cpp @@ -113,11 +113,26 @@ void MUL::runHipVariantImpl(VariantID vid) void MUL::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n MUL : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void MUL::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace stream diff --git a/src/stream/TRIAD-Cuda.cpp b/src/stream/TRIAD-Cuda.cpp index 42c161574..f9578f1b8 100644 --- a/src/stream/TRIAD-Cuda.cpp +++ b/src/stream/TRIAD-Cuda.cpp @@ -113,11 +113,26 @@ void TRIAD::runCudaVariantImpl(VariantID vid) void TRIAD::runCudaVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunCudaBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n TRIAD : Unsupported Cuda block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void TRIAD::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace stream diff --git a/src/stream/TRIAD-Hip.cpp b/src/stream/TRIAD-Hip.cpp index 11aaebd4d..a2b191a18 100644 --- a/src/stream/TRIAD-Hip.cpp +++ b/src/stream/TRIAD-Hip.cpp @@ -115,11 +115,26 @@ void TRIAD::runHipVariantImpl(VariantID vid) void TRIAD::runHipVariant(VariantID vid, size_t tid) { - if ( !gpu_block_size::invoke_or( - gpu_block_size::RunHipBlockSize(*this, vid), gpu_block_sizes_type()) ) { - std::cout << "\n TRIAD : Unsupported Hip block_size " << getActualGPUBlockSize() - <<" for variant id = " << vid << std::endl; - } + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void TRIAD::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } } // end namespace stream From 719c25ba901b20961aef67ee5443b91b2ad55485 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 23 Feb 2022 11:15:19 -0800 Subject: [PATCH 214/392] Remove isGPUBlockSizeSupported impls --- src/apps/DEL_DOT_VEC_2D.cpp | 6 ------ src/apps/ENERGY.cpp | 6 ------ src/apps/FIR.cpp | 6 ------ src/apps/HALOEXCHANGE.cpp | 6 ------ src/apps/HALOEXCHANGE_FUSED.cpp 
| 6 ------ src/apps/LTIMES.cpp | 6 ------ src/apps/LTIMES_NOVIEW.cpp | 6 ------ src/apps/PRESSURE.cpp | 6 ------ src/apps/VOL3D.cpp | 6 ------ src/basic/IF_QUAD.cpp | 6 ------ src/basic/INIT3.cpp | 6 ------ src/basic/INIT_VIEW1D.cpp | 6 ------ src/basic/INIT_VIEW1D_OFFSET.cpp | 6 ------ src/basic/MAT_MAT_SHARED.cpp | 6 ------ src/basic/MULADDSUB.cpp | 6 ------ src/basic/NESTED_INIT.cpp | 6 ------ src/basic/PI_ATOMIC.cpp | 6 ------ src/basic/PI_REDUCE.cpp | 6 ------ src/basic/REDUCE3_INT.cpp | 6 ------ src/basic/TRAP_INT.cpp | 6 ------ src/lcals/DIFF_PREDICT.cpp | 6 ------ src/lcals/EOS.cpp | 6 ------ src/lcals/FIRST_DIFF.cpp | 6 ------ src/lcals/FIRST_MIN.cpp | 6 ------ src/lcals/FIRST_SUM.cpp | 6 ------ src/lcals/GEN_LIN_RECUR.cpp | 6 ------ src/lcals/HYDRO_1D.cpp | 6 ------ src/lcals/HYDRO_2D.cpp | 6 ------ src/lcals/INT_PREDICT.cpp | 6 ------ src/lcals/PLANCKIAN.cpp | 6 ------ src/lcals/TRIDIAG_ELIM.cpp | 6 ------ src/polybench/POLYBENCH_2MM.cpp | 6 ------ src/polybench/POLYBENCH_3MM.cpp | 6 ------ src/polybench/POLYBENCH_ADI.cpp | 6 ------ src/polybench/POLYBENCH_ATAX.cpp | 6 ------ src/polybench/POLYBENCH_FDTD_2D.cpp | 6 ------ src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp | 6 ------ src/polybench/POLYBENCH_GEMM.cpp | 6 ------ src/polybench/POLYBENCH_GEMVER.cpp | 6 ------ src/polybench/POLYBENCH_GESUMMV.cpp | 6 ------ src/polybench/POLYBENCH_HEAT_3D.cpp | 6 ------ src/polybench/POLYBENCH_JACOBI_1D.cpp | 6 ------ src/polybench/POLYBENCH_JACOBI_2D.cpp | 6 ------ src/polybench/POLYBENCH_MVT.cpp | 6 ------ src/stream/ADD.cpp | 6 ------ src/stream/COPY.cpp | 6 ------ src/stream/DOT.cpp | 6 ------ src/stream/MUL.cpp | 6 ------ src/stream/TRIAD.cpp | 6 ------ 49 files changed, 294 deletions(-) diff --git a/src/apps/DEL_DOT_VEC_2D.cpp b/src/apps/DEL_DOT_VEC_2D.cpp index f75e44d4e..4d1853c1f 100644 --- a/src/apps/DEL_DOT_VEC_2D.cpp +++ b/src/apps/DEL_DOT_VEC_2D.cpp @@ -103,11 +103,5 @@ void DEL_DOT_VEC_2D::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_div); } -bool DEL_DOT_VEC_2D::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/ENERGY.cpp b/src/apps/ENERGY.cpp index 7c3b9b0f4..5cbe89e3e 100644 --- a/src/apps/ENERGY.cpp +++ b/src/apps/ENERGY.cpp @@ -119,11 +119,5 @@ void ENERGY::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_vnewc); } -bool ENERGY::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/FIR.cpp b/src/apps/FIR.cpp index 022175d63..95b50bce5 100644 --- a/src/apps/FIR.cpp +++ b/src/apps/FIR.cpp @@ -81,11 +81,5 @@ void FIR::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_out); } -bool FIR::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE.cpp b/src/apps/HALOEXCHANGE.cpp index b3ce4a434..650d59c74 100644 --- a/src/apps/HALOEXCHANGE.cpp +++ b/src/apps/HALOEXCHANGE.cpp @@ -160,12 +160,6 @@ void HALOEXCHANGE::tearDown(VariantID vid, size_t /*tid*/) m_vars.clear(); } -bool HALOEXCHANGE::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - 
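The per-kernel isGPUBlockSizeSupported() overrides removed throughout this patch all had the identical body shown above and are superseded by the block-size tuning support added in the preceding patch: each runCudaVariant/runHipVariant now walks gpu_block_sizes_type with seq_for and runs the implementation whose position matches the tuning index, while setCudaTuningDefinitions/setHipTuningDefinitions register a matching "block_" + size tuning name for every size allowed by run_params.validGPUBlockSize(). What follows is a minimal stand-alone sketch of that dispatch idea only; block_sizes, seq_for, and run_impl are hypothetical stand-ins for the suite's gpu_block_size/camp machinery, and the RunParams filtering is reduced to a comment.

    // Sketch (hypothetical names): map a runtime tuning index onto a
    // compile-time block size, the way runCudaVariant/setCudaTuningDefinitions
    // do above. Requires C++17.
    #include <cstddef>
    #include <iostream>
    #include <string>
    #include <utility>
    #include <vector>

    // Candidate block sizes as a compile-time list.
    using block_sizes = std::integer_sequence<std::size_t, 128, 256, 512>;

    // Call f once per size, passing each size as an integral_constant.
    template <typename F, std::size_t... Is>
    void seq_for(std::integer_sequence<std::size_t, Is...>, F&& f)
    {
      (f(std::integral_constant<std::size_t, Is>{}), ...);
    }

    // Stand-in for runCudaVariantImpl<block_size>(vid).
    template <std::size_t block_size>
    void run_impl()
    {
      std::cout << "launch with block_size = " << block_size << "\n";
    }

    int main()
    {
      // setCudaTuningDefinitions: one tuning name per candidate size
      // (the real code first filters sizes via run_params.validGPUBlockSize()).
      std::vector<std::string> tuning_names;
      seq_for(block_sizes{}, [&](auto bs) {
        tuning_names.push_back("block_" + std::to_string(bs()));
      });

      // runCudaVariant(vid, tid): the tuning index picks the matching size,
      // so tid must enumerate sizes in the same order as the names above.
      const std::size_t tid = 1;  // e.g. selects "block_256"
      std::size_t t = 0;
      seq_for(block_sizes{}, [&](auto bs) {
        if (t == tid) {
          run_impl<decltype(bs)::value>();
        }
        t += 1;
      });

      std::cout << "selected tuning: " << tuning_names[tid] << "\n";
    }

Because the valid-size filter is applied identically when names are registered and when the variant is run, a tuning name's position always lines up with the block size the dispatch selects, which is what makes the removed per-kernel support checks redundant.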
namespace { struct Extent diff --git a/src/apps/HALOEXCHANGE_FUSED.cpp b/src/apps/HALOEXCHANGE_FUSED.cpp index feb472728..3ec1002f6 100644 --- a/src/apps/HALOEXCHANGE_FUSED.cpp +++ b/src/apps/HALOEXCHANGE_FUSED.cpp @@ -160,12 +160,6 @@ void HALOEXCHANGE_FUSED::tearDown(VariantID vid, size_t /*tid*/) m_vars.clear(); } -bool HALOEXCHANGE_FUSED::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - namespace { struct Extent diff --git a/src/apps/LTIMES.cpp b/src/apps/LTIMES.cpp index 9b67a0aaf..11fbe67b9 100644 --- a/src/apps/LTIMES.cpp +++ b/src/apps/LTIMES.cpp @@ -104,11 +104,5 @@ void LTIMES::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_psidat); } -bool LTIMES::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/LTIMES_NOVIEW.cpp b/src/apps/LTIMES_NOVIEW.cpp index 1b5c827fc..1812a3f89 100644 --- a/src/apps/LTIMES_NOVIEW.cpp +++ b/src/apps/LTIMES_NOVIEW.cpp @@ -103,11 +103,5 @@ void LTIMES_NOVIEW::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_psidat); } -bool LTIMES_NOVIEW::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/PRESSURE.cpp b/src/apps/PRESSURE.cpp index ab5721111..f4a792a1b 100644 --- a/src/apps/PRESSURE.cpp +++ b/src/apps/PRESSURE.cpp @@ -88,11 +88,5 @@ void PRESSURE::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_vnewc); } -bool PRESSURE::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/VOL3D.cpp b/src/apps/VOL3D.cpp index 494c5f0da..585c9950e 100644 --- a/src/apps/VOL3D.cpp +++ b/src/apps/VOL3D.cpp @@ -102,11 +102,5 @@ void VOL3D::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_vol); } -bool VOL3D::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace apps } // end namespace rajaperf diff --git a/src/basic/IF_QUAD.cpp b/src/basic/IF_QUAD.cpp index 1a599931f..e3e960e83 100644 --- a/src/basic/IF_QUAD.cpp +++ b/src/basic/IF_QUAD.cpp @@ -86,11 +86,5 @@ void IF_QUAD::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_x2); } -bool IF_QUAD::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INIT3.cpp b/src/basic/INIT3.cpp index 623942044..229a63623 100644 --- a/src/basic/INIT3.cpp +++ b/src/basic/INIT3.cpp @@ -83,11 +83,5 @@ void INIT3::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_in2); } -bool INIT3::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INIT_VIEW1D.cpp b/src/basic/INIT_VIEW1D.cpp index fddbeeddf..d1a247989 100644 --- a/src/basic/INIT_VIEW1D.cpp +++ b/src/basic/INIT_VIEW1D.cpp @@ -75,11 +75,5 @@ void INIT_VIEW1D::tearDown(VariantID vid, size_t /*tid*/) 
deallocData(m_a); } -bool INIT_VIEW1D::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INIT_VIEW1D_OFFSET.cpp b/src/basic/INIT_VIEW1D_OFFSET.cpp index 9ebc35193..e31f00406 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET.cpp @@ -75,11 +75,5 @@ void INIT_VIEW1D_OFFSET::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_a); } -bool INIT_VIEW1D_OFFSET::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/MAT_MAT_SHARED.cpp b/src/basic/MAT_MAT_SHARED.cpp index 82a4a07ed..9ad5fdad8 100644 --- a/src/basic/MAT_MAT_SHARED.cpp +++ b/src/basic/MAT_MAT_SHARED.cpp @@ -83,11 +83,5 @@ void MAT_MAT_SHARED::tearDown(VariantID vid, size_t /*tid*/) { deallocData(m_C); } -bool MAT_MAT_SHARED::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/MULADDSUB.cpp b/src/basic/MULADDSUB.cpp index 91b8bd558..c26de726a 100644 --- a/src/basic/MULADDSUB.cpp +++ b/src/basic/MULADDSUB.cpp @@ -83,11 +83,5 @@ void MULADDSUB::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_in2); } -bool MULADDSUB::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/NESTED_INIT.cpp b/src/basic/NESTED_INIT.cpp index bd7efd16b..ce02805f9 100644 --- a/src/basic/NESTED_INIT.cpp +++ b/src/basic/NESTED_INIT.cpp @@ -85,11 +85,5 @@ void NESTED_INIT::tearDown(VariantID vid, size_t /*tid*/) m_array = 0; } -bool NESTED_INIT::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/PI_ATOMIC.cpp b/src/basic/PI_ATOMIC.cpp index 05db0af1c..30eb11384 100644 --- a/src/basic/PI_ATOMIC.cpp +++ b/src/basic/PI_ATOMIC.cpp @@ -77,11 +77,5 @@ void PI_ATOMIC::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_pi); } -bool PI_ATOMIC::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/PI_REDUCE.cpp b/src/basic/PI_REDUCE.cpp index 9c6e6d7e0..5b1666e8d 100644 --- a/src/basic/PI_REDUCE.cpp +++ b/src/basic/PI_REDUCE.cpp @@ -75,11 +75,5 @@ void PI_REDUCE::tearDown(VariantID vid, size_t /*tid*/) (void) vid; } -bool PI_REDUCE::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE3_INT.cpp b/src/basic/REDUCE3_INT.cpp index 49fcb8562..c690184e3 100644 --- a/src/basic/REDUCE3_INT.cpp +++ b/src/basic/REDUCE3_INT.cpp @@ -87,11 +87,5 @@ void REDUCE3_INT::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_vec); } -bool REDUCE3_INT::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - 
gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/TRAP_INT.cpp b/src/basic/TRAP_INT.cpp index c1ec43ff4..4c5eaaf14 100644 --- a/src/basic/TRAP_INT.cpp +++ b/src/basic/TRAP_INT.cpp @@ -84,11 +84,5 @@ void TRAP_INT::tearDown(VariantID vid, size_t /*tid*/) (void) vid; } -bool TRAP_INT::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace basic } // end namespace rajaperf diff --git a/src/lcals/DIFF_PREDICT.cpp b/src/lcals/DIFF_PREDICT.cpp index 0cce0a80d..1ec58f0a9 100644 --- a/src/lcals/DIFF_PREDICT.cpp +++ b/src/lcals/DIFF_PREDICT.cpp @@ -76,11 +76,5 @@ void DIFF_PREDICT::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_cx); } -bool DIFF_PREDICT::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/EOS.cpp b/src/lcals/EOS.cpp index 87d47e605..52a20b23f 100644 --- a/src/lcals/EOS.cpp +++ b/src/lcals/EOS.cpp @@ -89,11 +89,5 @@ void EOS::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_u); } -bool EOS::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_DIFF.cpp b/src/lcals/FIRST_DIFF.cpp index 0fd301c18..43f22a656 100644 --- a/src/lcals/FIRST_DIFF.cpp +++ b/src/lcals/FIRST_DIFF.cpp @@ -77,11 +77,5 @@ void FIRST_DIFF::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_y); } -bool FIRST_DIFF::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_MIN.cpp b/src/lcals/FIRST_MIN.cpp index 7c1409b4d..5987de250 100644 --- a/src/lcals/FIRST_MIN.cpp +++ b/src/lcals/FIRST_MIN.cpp @@ -83,11 +83,5 @@ void FIRST_MIN::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_x); } -bool FIRST_MIN::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_SUM.cpp b/src/lcals/FIRST_SUM.cpp index a028c1c0f..9e6011f70 100644 --- a/src/lcals/FIRST_SUM.cpp +++ b/src/lcals/FIRST_SUM.cpp @@ -76,11 +76,5 @@ void FIRST_SUM::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_y); } -bool FIRST_SUM::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/GEN_LIN_RECUR.cpp b/src/lcals/GEN_LIN_RECUR.cpp index 1cae28e86..c8822e5d8 100644 --- a/src/lcals/GEN_LIN_RECUR.cpp +++ b/src/lcals/GEN_LIN_RECUR.cpp @@ -87,11 +87,5 @@ void GEN_LIN_RECUR::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_sb); } -bool GEN_LIN_RECUR::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/HYDRO_1D.cpp b/src/lcals/HYDRO_1D.cpp index 137f70fad..85cafd327 100644 --- 
a/src/lcals/HYDRO_1D.cpp +++ b/src/lcals/HYDRO_1D.cpp @@ -86,11 +86,5 @@ void HYDRO_1D::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_z); } -bool HYDRO_1D::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/HYDRO_2D.cpp b/src/lcals/HYDRO_2D.cpp index 8e4f0c8c3..a03ff2cf4 100644 --- a/src/lcals/HYDRO_2D.cpp +++ b/src/lcals/HYDRO_2D.cpp @@ -114,11 +114,5 @@ void HYDRO_2D::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_zz); } -bool HYDRO_2D::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/INT_PREDICT.cpp b/src/lcals/INT_PREDICT.cpp index a8aa25e1f..a58537eed 100644 --- a/src/lcals/INT_PREDICT.cpp +++ b/src/lcals/INT_PREDICT.cpp @@ -88,11 +88,5 @@ void INT_PREDICT::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_px); } -bool INT_PREDICT::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/PLANCKIAN.cpp b/src/lcals/PLANCKIAN.cpp index a41ff07bf..193f7d456 100644 --- a/src/lcals/PLANCKIAN.cpp +++ b/src/lcals/PLANCKIAN.cpp @@ -79,11 +79,5 @@ void PLANCKIAN::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_w); } -bool PLANCKIAN::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/TRIDIAG_ELIM.cpp b/src/lcals/TRIDIAG_ELIM.cpp index 71def13b2..57302d426 100644 --- a/src/lcals/TRIDIAG_ELIM.cpp +++ b/src/lcals/TRIDIAG_ELIM.cpp @@ -79,11 +79,5 @@ void TRIDIAG_ELIM::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_z); } -bool TRIDIAG_ELIM::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace lcals } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_2MM.cpp b/src/polybench/POLYBENCH_2MM.cpp index 59229fe80..d9aaa2d09 100644 --- a/src/polybench/POLYBENCH_2MM.cpp +++ b/src/polybench/POLYBENCH_2MM.cpp @@ -109,11 +109,5 @@ void POLYBENCH_2MM::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_D); } -bool POLYBENCH_2MM::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_3MM.cpp b/src/polybench/POLYBENCH_3MM.cpp index d7de76e38..f2ae2577e 100644 --- a/src/polybench/POLYBENCH_3MM.cpp +++ b/src/polybench/POLYBENCH_3MM.cpp @@ -121,11 +121,5 @@ void POLYBENCH_3MM::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_G); } -bool POLYBENCH_3MM::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace basic } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_ADI.cpp b/src/polybench/POLYBENCH_ADI.cpp index 28bf02f83..14802ebf8 100644 --- a/src/polybench/POLYBENCH_ADI.cpp +++ b/src/polybench/POLYBENCH_ADI.cpp @@ -91,11 +91,5 @@ void 
POLYBENCH_ADI::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_Q); } -bool POLYBENCH_ADI::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_ATAX.cpp b/src/polybench/POLYBENCH_ATAX.cpp index c5a244b3f..c2656a96f 100644 --- a/src/polybench/POLYBENCH_ATAX.cpp +++ b/src/polybench/POLYBENCH_ATAX.cpp @@ -94,11 +94,5 @@ void POLYBENCH_ATAX::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_A); } -bool POLYBENCH_ATAX::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_FDTD_2D.cpp b/src/polybench/POLYBENCH_FDTD_2D.cpp index 7fb9161db..29e3d1d66 100644 --- a/src/polybench/POLYBENCH_FDTD_2D.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D.cpp @@ -112,11 +112,5 @@ void POLYBENCH_FDTD_2D::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_hz); } -bool POLYBENCH_FDTD_2D::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp index 4148cbcae..08ddd798c 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp @@ -85,11 +85,5 @@ void POLYBENCH_FLOYD_WARSHALL::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_pout); } -bool POLYBENCH_FLOYD_WARSHALL::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_GEMM.cpp b/src/polybench/POLYBENCH_GEMM.cpp index b48addd93..bf89f83ce 100644 --- a/src/polybench/POLYBENCH_GEMM.cpp +++ b/src/polybench/POLYBENCH_GEMM.cpp @@ -97,11 +97,5 @@ void POLYBENCH_GEMM::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_C); } -bool POLYBENCH_GEMM::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_GEMVER.cpp b/src/polybench/POLYBENCH_GEMVER.cpp index dea7ab038..48eec4ae7 100644 --- a/src/polybench/POLYBENCH_GEMVER.cpp +++ b/src/polybench/POLYBENCH_GEMVER.cpp @@ -119,11 +119,5 @@ void POLYBENCH_GEMVER::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_z); } -bool POLYBENCH_GEMVER::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace basic } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_GESUMMV.cpp b/src/polybench/POLYBENCH_GESUMMV.cpp index 96d688afa..cb0cf3389 100644 --- a/src/polybench/POLYBENCH_GESUMMV.cpp +++ b/src/polybench/POLYBENCH_GESUMMV.cpp @@ -88,11 +88,5 @@ void POLYBENCH_GESUMMV::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_B); } -bool POLYBENCH_GESUMMV::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace polybench } // end namespace 
rajaperf diff --git a/src/polybench/POLYBENCH_HEAT_3D.cpp b/src/polybench/POLYBENCH_HEAT_3D.cpp index d5e4b579c..fe603915b 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D.cpp @@ -100,11 +100,5 @@ void POLYBENCH_HEAT_3D::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_Binit); } -bool POLYBENCH_HEAT_3D::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_JACOBI_1D.cpp b/src/polybench/POLYBENCH_JACOBI_1D.cpp index c181a7b20..102d42cee 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.cpp @@ -97,11 +97,5 @@ void POLYBENCH_JACOBI_1D::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_Binit); } -bool POLYBENCH_JACOBI_1D::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_JACOBI_2D.cpp b/src/polybench/POLYBENCH_JACOBI_2D.cpp index 97f910ab1..a3b924c46 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.cpp @@ -99,11 +99,5 @@ void POLYBENCH_JACOBI_2D::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_Binit); } -bool POLYBENCH_JACOBI_2D::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_MVT.cpp b/src/polybench/POLYBENCH_MVT.cpp index 245dfd028..c27ec078a 100644 --- a/src/polybench/POLYBENCH_MVT.cpp +++ b/src/polybench/POLYBENCH_MVT.cpp @@ -94,11 +94,5 @@ void POLYBENCH_MVT::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_A); } -bool POLYBENCH_MVT::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace polybench } // end namespace rajaperf diff --git a/src/stream/ADD.cpp b/src/stream/ADD.cpp index 7450b0b4b..11dfa7886 100644 --- a/src/stream/ADD.cpp +++ b/src/stream/ADD.cpp @@ -78,11 +78,5 @@ void ADD::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_c); } -bool ADD::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/COPY.cpp b/src/stream/COPY.cpp index 04adf7397..bb337dc68 100644 --- a/src/stream/COPY.cpp +++ b/src/stream/COPY.cpp @@ -76,11 +76,5 @@ void COPY::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_c); } -bool COPY::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/DOT.cpp b/src/stream/DOT.cpp index 1e5ad74e2..7352f7ebf 100644 --- a/src/stream/DOT.cpp +++ b/src/stream/DOT.cpp @@ -79,11 +79,5 @@ void DOT::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_b); } -bool DOT::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace stream } // end namespace rajaperf diff --git 
a/src/stream/MUL.cpp b/src/stream/MUL.cpp index 7e3ca9aad..8e216d647 100644 --- a/src/stream/MUL.cpp +++ b/src/stream/MUL.cpp @@ -77,11 +77,5 @@ void MUL::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_c); } -bool MUL::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/TRIAD.cpp b/src/stream/TRIAD.cpp index 4e813fdc0..da43f1c6a 100644 --- a/src/stream/TRIAD.cpp +++ b/src/stream/TRIAD.cpp @@ -83,11 +83,5 @@ void TRIAD::tearDown(VariantID vid, size_t /*tid*/) deallocData(m_c); } -bool TRIAD::isGPUBlockSizeSupported() const -{ - return gpu_block_size::invoke_or( - gpu_block_size::Equals(getActualGPUBlockSize()), gpu_block_sizes_type()); -} - } // end namespace stream } // end namespace rajaperf From c5ec026bf35659c8a7a870f3b3d793490fb595f8 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 23 Feb 2022 11:16:16 -0800 Subject: [PATCH 215/392] Fixup checksum use --- src/algorithm/SORT.cpp | 2 +- src/algorithm/SORTPAIRS.cpp | 4 ++-- src/apps/DEL_DOT_VEC_2D.cpp | 2 +- src/apps/DIFFUSION3DPA.cpp | 2 +- src/apps/ENERGY.cpp | 4 ++-- src/apps/FIR.cpp | 2 +- src/apps/HALOEXCHANGE.cpp | 2 +- src/apps/HALOEXCHANGE_FUSED.cpp | 2 +- src/apps/LTIMES.cpp | 2 +- src/apps/LTIMES_NOVIEW.cpp | 2 +- src/apps/MASS3DPA.cpp | 2 +- src/apps/PRESSURE.cpp | 2 +- src/apps/VOL3D.cpp | 2 +- src/apps/WIP-COUPLE.cpp | 6 +++--- src/basic/IF_QUAD.cpp | 4 ++-- src/basic/INIT3.cpp | 6 +++--- src/basic/INIT_VIEW1D.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET.cpp | 2 +- src/basic/MAT_MAT_SHARED.cpp | 2 +- src/basic/MULADDSUB.cpp | 6 +++--- src/basic/NESTED_INIT.cpp | 2 +- src/basic/PI_ATOMIC.cpp | 2 +- src/basic/PI_REDUCE.cpp | 2 +- src/basic/REDUCE3_INT.cpp | 6 +++--- src/basic/TRAP_INT.cpp | 2 +- src/lcals/DIFF_PREDICT.cpp | 2 +- src/lcals/EOS.cpp | 2 +- src/lcals/FIRST_DIFF.cpp | 2 +- src/lcals/FIRST_MIN.cpp | 2 +- src/lcals/FIRST_SUM.cpp | 2 +- src/lcals/GEN_LIN_RECUR.cpp | 2 +- src/lcals/HYDRO_1D.cpp | 2 +- src/lcals/HYDRO_2D.cpp | 4 ++-- src/lcals/INT_PREDICT.cpp | 2 +- src/lcals/PLANCKIAN.cpp | 2 +- src/lcals/TRIDIAG_ELIM.cpp | 2 +- src/polybench/POLYBENCH_2MM.cpp | 2 +- src/polybench/POLYBENCH_3MM.cpp | 2 +- src/polybench/POLYBENCH_ADI.cpp | 2 +- src/polybench/POLYBENCH_ATAX.cpp | 2 +- src/polybench/POLYBENCH_FDTD_2D.cpp | 2 +- src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp | 2 +- src/polybench/POLYBENCH_GEMM.cpp | 2 +- src/polybench/POLYBENCH_GEMVER.cpp | 2 +- src/polybench/POLYBENCH_GESUMMV.cpp | 2 +- src/polybench/POLYBENCH_HEAT_3D.cpp | 4 ++-- src/polybench/POLYBENCH_JACOBI_1D.cpp | 4 ++-- src/polybench/POLYBENCH_JACOBI_2D.cpp | 4 ++-- src/polybench/POLYBENCH_MVT.cpp | 4 ++-- src/stream/ADD.cpp | 2 +- src/stream/COPY.cpp | 2 +- src/stream/DOT.cpp | 2 +- src/stream/MUL.cpp | 2 +- src/stream/TRIAD.cpp | 2 +- 54 files changed, 70 insertions(+), 70 deletions(-) diff --git a/src/algorithm/SORT.cpp b/src/algorithm/SORT.cpp index 8c2b5dbce..450671e4f 100644 --- a/src/algorithm/SORT.cpp +++ b/src/algorithm/SORT.cpp @@ -54,7 +54,7 @@ void SORT::setUp(VariantID vid, size_t /*tid*/) void SORT::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_x, getActualProblemSize()*getRunReps()); + checksum[vid][tid] += calcChecksum(m_x, getActualProblemSize()*getRunReps()); } void SORT::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/algorithm/SORTPAIRS.cpp b/src/algorithm/SORTPAIRS.cpp index 6a291bb5c..31aa5ff32 100644 --- 
a/src/algorithm/SORTPAIRS.cpp +++ b/src/algorithm/SORTPAIRS.cpp @@ -55,8 +55,8 @@ void SORTPAIRS::setUp(VariantID vid, size_t /*tid*/) void SORTPAIRS::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_x, getActualProblemSize()*getRunReps()); - checksum[vid] += calcChecksum(m_i, getActualProblemSize()*getRunReps()); + checksum[vid][tid] += calcChecksum(m_x, getActualProblemSize()*getRunReps()); + checksum[vid][tid] += calcChecksum(m_i, getActualProblemSize()*getRunReps()); } void SORTPAIRS::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/apps/DEL_DOT_VEC_2D.cpp b/src/apps/DEL_DOT_VEC_2D.cpp index 4d1853c1f..7e8e70525 100644 --- a/src/apps/DEL_DOT_VEC_2D.cpp +++ b/src/apps/DEL_DOT_VEC_2D.cpp @@ -89,7 +89,7 @@ void DEL_DOT_VEC_2D::setUp(VariantID vid, size_t /*tid*/) void DEL_DOT_VEC_2D::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_div, m_array_length); + checksum[vid][tid] += calcChecksum(m_div, m_array_length); } void DEL_DOT_VEC_2D::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/apps/DIFFUSION3DPA.cpp b/src/apps/DIFFUSION3DPA.cpp index 643d7f987..086d9bc6f 100644 --- a/src/apps/DIFFUSION3DPA.cpp +++ b/src/apps/DIFFUSION3DPA.cpp @@ -83,7 +83,7 @@ void DIFFUSION3DPA::setUp(VariantID vid, size_t /*tid*/) void DIFFUSION3DPA::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_Y, DPA_D1D*DPA_D1D*DPA_D1D*m_NE); + checksum[vid][tid] += calcChecksum(m_Y, DPA_D1D*DPA_D1D*DPA_D1D*m_NE); } void DIFFUSION3DPA::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/apps/ENERGY.cpp b/src/apps/ENERGY.cpp index 5cbe89e3e..e36343319 100644 --- a/src/apps/ENERGY.cpp +++ b/src/apps/ENERGY.cpp @@ -94,8 +94,8 @@ void ENERGY::setUp(VariantID vid, size_t /*tid*/) void ENERGY::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_e_new, getActualProblemSize()); - checksum[vid] += calcChecksum(m_q_new, getActualProblemSize()); + checksum[vid][tid] += calcChecksum(m_e_new, getActualProblemSize()); + checksum[vid][tid] += calcChecksum(m_q_new, getActualProblemSize()); } void ENERGY::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/apps/FIR.cpp b/src/apps/FIR.cpp index 95b50bce5..e9a7280b3 100644 --- a/src/apps/FIR.cpp +++ b/src/apps/FIR.cpp @@ -70,7 +70,7 @@ void FIR::setUp(VariantID vid, size_t /*tid*/) void FIR::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_out, getActualProblemSize(), checksum_scale_factor ); + checksum[vid][tid] += calcChecksum(m_out, getActualProblemSize(), checksum_scale_factor ); } void FIR::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/apps/HALOEXCHANGE.cpp b/src/apps/HALOEXCHANGE.cpp index 650d59c74..6e63ed982 100644 --- a/src/apps/HALOEXCHANGE.cpp +++ b/src/apps/HALOEXCHANGE.cpp @@ -135,7 +135,7 @@ void HALOEXCHANGE::setUp(VariantID vid, size_t /*tid*/) void HALOEXCHANGE::updateChecksum(VariantID vid, size_t tid) { for (Real_ptr var : m_vars) { - checksum[vid] += calcChecksum(var, m_var_size); + checksum[vid][tid] += calcChecksum(var, m_var_size); } } diff --git a/src/apps/HALOEXCHANGE_FUSED.cpp b/src/apps/HALOEXCHANGE_FUSED.cpp index 3ec1002f6..d1f479181 100644 --- a/src/apps/HALOEXCHANGE_FUSED.cpp +++ b/src/apps/HALOEXCHANGE_FUSED.cpp @@ -135,7 +135,7 @@ void HALOEXCHANGE_FUSED::setUp(VariantID vid, size_t /*tid*/) void HALOEXCHANGE_FUSED::updateChecksum(VariantID vid, size_t tid) { for (Real_ptr var : m_vars) { - checksum[vid] += calcChecksum(var, m_var_size); + checksum[vid][tid] += calcChecksum(var, 
m_var_size); } } diff --git a/src/apps/LTIMES.cpp b/src/apps/LTIMES.cpp index 11fbe67b9..5fd7f4663 100644 --- a/src/apps/LTIMES.cpp +++ b/src/apps/LTIMES.cpp @@ -92,7 +92,7 @@ void LTIMES::setUp(VariantID vid, size_t /*tid*/) void LTIMES::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_phidat, m_philen, checksum_scale_factor ); + checksum[vid][tid] += calcChecksum(m_phidat, m_philen, checksum_scale_factor ); } void LTIMES::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/apps/LTIMES_NOVIEW.cpp b/src/apps/LTIMES_NOVIEW.cpp index 1812a3f89..77ea8f6ba 100644 --- a/src/apps/LTIMES_NOVIEW.cpp +++ b/src/apps/LTIMES_NOVIEW.cpp @@ -91,7 +91,7 @@ void LTIMES_NOVIEW::setUp(VariantID vid, size_t /*tid*/) void LTIMES_NOVIEW::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_phidat, m_philen, checksum_scale_factor ); + checksum[vid][tid] += calcChecksum(m_phidat, m_philen, checksum_scale_factor ); } void LTIMES_NOVIEW::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/apps/MASS3DPA.cpp b/src/apps/MASS3DPA.cpp index 6614d64c8..e21c4ecc5 100644 --- a/src/apps/MASS3DPA.cpp +++ b/src/apps/MASS3DPA.cpp @@ -79,7 +79,7 @@ void MASS3DPA::setUp(VariantID vid, size_t /*tid*/) void MASS3DPA::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_Y, MPA_D1D*MPA_D1D*MPA_D1D*m_NE); + checksum[vid][tid] += calcChecksum(m_Y, MPA_D1D*MPA_D1D*MPA_D1D*m_NE); } void MASS3DPA::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/apps/PRESSURE.cpp b/src/apps/PRESSURE.cpp index f4a792a1b..6a77e71c1 100644 --- a/src/apps/PRESSURE.cpp +++ b/src/apps/PRESSURE.cpp @@ -74,7 +74,7 @@ void PRESSURE::setUp(VariantID vid, size_t /*tid*/) void PRESSURE::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_p_new, getActualProblemSize()); + checksum[vid][tid] += calcChecksum(m_p_new, getActualProblemSize()); } void PRESSURE::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/apps/VOL3D.cpp b/src/apps/VOL3D.cpp index 585c9950e..52a17e108 100644 --- a/src/apps/VOL3D.cpp +++ b/src/apps/VOL3D.cpp @@ -89,7 +89,7 @@ void VOL3D::setUp(VariantID vid, size_t /*tid*/) void VOL3D::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_vol, m_array_length, checksum_scale_factor ); + checksum[vid][tid] += calcChecksum(m_vol, m_array_length, checksum_scale_factor ); } void VOL3D::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/apps/WIP-COUPLE.cpp b/src/apps/WIP-COUPLE.cpp index a53057d63..ec1958299 100644 --- a/src/apps/WIP-COUPLE.cpp +++ b/src/apps/WIP-COUPLE.cpp @@ -183,9 +183,9 @@ void COUPLE::updateChecksum(VariantID vid, size_t tid) { Index_type max_loop_index = m_domain->lrn; - checksum[vid] += calcChecksum(m_t0, max_loop_index); - checksum[vid] += calcChecksum(m_t1, max_loop_index); - checksum[vid] += calcChecksum(m_t2, max_loop_index); + checksum[vid][tid] += calcChecksum(m_t0, max_loop_index); + checksum[vid][tid] += calcChecksum(m_t1, max_loop_index); + checksum[vid][tid] += calcChecksum(m_t2, max_loop_index); } void COUPLE::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/basic/IF_QUAD.cpp b/src/basic/IF_QUAD.cpp index e3e960e83..5e3a20a19 100644 --- a/src/basic/IF_QUAD.cpp +++ b/src/basic/IF_QUAD.cpp @@ -72,8 +72,8 @@ void IF_QUAD::setUp(VariantID vid, size_t /*tid*/) void IF_QUAD::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_x1, getActualProblemSize(), checksum_scale_factor ); - checksum[vid] += calcChecksum(m_x2, getActualProblemSize(), 
checksum_scale_factor ); + checksum[vid][tid] += calcChecksum(m_x1, getActualProblemSize(), checksum_scale_factor ); + checksum[vid][tid] += calcChecksum(m_x2, getActualProblemSize(), checksum_scale_factor ); } void IF_QUAD::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/basic/INIT3.cpp b/src/basic/INIT3.cpp index 229a63623..68b5f1755 100644 --- a/src/basic/INIT3.cpp +++ b/src/basic/INIT3.cpp @@ -68,9 +68,9 @@ void INIT3::setUp(VariantID vid, size_t /*tid*/) void INIT3::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_out1, getActualProblemSize()); - checksum[vid] += calcChecksum(m_out2, getActualProblemSize()); - checksum[vid] += calcChecksum(m_out3, getActualProblemSize()); + checksum[vid][tid] += calcChecksum(m_out1, getActualProblemSize()); + checksum[vid][tid] += calcChecksum(m_out2, getActualProblemSize()); + checksum[vid][tid] += calcChecksum(m_out3, getActualProblemSize()); } void INIT3::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/basic/INIT_VIEW1D.cpp b/src/basic/INIT_VIEW1D.cpp index d1a247989..eb6cdb448 100644 --- a/src/basic/INIT_VIEW1D.cpp +++ b/src/basic/INIT_VIEW1D.cpp @@ -66,7 +66,7 @@ void INIT_VIEW1D::setUp(VariantID vid, size_t /*tid*/) void INIT_VIEW1D::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_a, getActualProblemSize()); + checksum[vid][tid] += calcChecksum(m_a, getActualProblemSize()); } void INIT_VIEW1D::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/basic/INIT_VIEW1D_OFFSET.cpp b/src/basic/INIT_VIEW1D_OFFSET.cpp index e31f00406..cb7706d88 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET.cpp @@ -66,7 +66,7 @@ void INIT_VIEW1D_OFFSET::setUp(VariantID vid, size_t /*tid*/) void INIT_VIEW1D_OFFSET::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_a, getActualProblemSize()); + checksum[vid][tid] += calcChecksum(m_a, getActualProblemSize()); } void INIT_VIEW1D_OFFSET::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/basic/MAT_MAT_SHARED.cpp b/src/basic/MAT_MAT_SHARED.cpp index 9ad5fdad8..fa92bdb13 100644 --- a/src/basic/MAT_MAT_SHARED.cpp +++ b/src/basic/MAT_MAT_SHARED.cpp @@ -73,7 +73,7 @@ void MAT_MAT_SHARED::setUp(VariantID vid, size_t /*tid*/) { } void MAT_MAT_SHARED::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_C, m_N*m_N, checksum_scale_factor ); + checksum[vid][tid] += calcChecksum(m_C, m_N*m_N, checksum_scale_factor ); } void MAT_MAT_SHARED::tearDown(VariantID vid, size_t /*tid*/) { diff --git a/src/basic/MULADDSUB.cpp b/src/basic/MULADDSUB.cpp index c26de726a..8758b5da4 100644 --- a/src/basic/MULADDSUB.cpp +++ b/src/basic/MULADDSUB.cpp @@ -68,9 +68,9 @@ void MULADDSUB::setUp(VariantID vid, size_t /*tid*/) void MULADDSUB::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_out1, getActualProblemSize()); - checksum[vid] += calcChecksum(m_out2, getActualProblemSize()); - checksum[vid] += calcChecksum(m_out3, getActualProblemSize()); + checksum[vid][tid] += calcChecksum(m_out1, getActualProblemSize()); + checksum[vid][tid] += calcChecksum(m_out2, getActualProblemSize()); + checksum[vid][tid] += calcChecksum(m_out3, getActualProblemSize()); } void MULADDSUB::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/basic/NESTED_INIT.cpp b/src/basic/NESTED_INIT.cpp index ce02805f9..9fa24d792 100644 --- a/src/basic/NESTED_INIT.cpp +++ b/src/basic/NESTED_INIT.cpp @@ -75,7 +75,7 @@ void NESTED_INIT::setUp(VariantID vid, size_t /*tid*/) void 
NESTED_INIT::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_array, m_array_length); + checksum[vid][tid] += calcChecksum(m_array, m_array_length); } void NESTED_INIT::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/basic/PI_ATOMIC.cpp b/src/basic/PI_ATOMIC.cpp index 30eb11384..2f875d701 100644 --- a/src/basic/PI_ATOMIC.cpp +++ b/src/basic/PI_ATOMIC.cpp @@ -68,7 +68,7 @@ void PI_ATOMIC::setUp(VariantID vid, size_t /*tid*/) void PI_ATOMIC::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += Checksum_type(*m_pi); + checksum[vid][tid] += Checksum_type(*m_pi); } void PI_ATOMIC::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/basic/PI_REDUCE.cpp b/src/basic/PI_REDUCE.cpp index 5b1666e8d..dbc9a6725 100644 --- a/src/basic/PI_REDUCE.cpp +++ b/src/basic/PI_REDUCE.cpp @@ -67,7 +67,7 @@ void PI_REDUCE::setUp(VariantID vid, size_t /*tid*/) void PI_REDUCE::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += Checksum_type(m_pi); + checksum[vid][tid] += Checksum_type(m_pi); } void PI_REDUCE::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/basic/REDUCE3_INT.cpp b/src/basic/REDUCE3_INT.cpp index c690184e3..63bc54d5f 100644 --- a/src/basic/REDUCE3_INT.cpp +++ b/src/basic/REDUCE3_INT.cpp @@ -76,9 +76,9 @@ void REDUCE3_INT::setUp(VariantID vid, size_t /*tid*/) void REDUCE3_INT::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += m_vsum; - checksum[vid] += m_vmin; - checksum[vid] += m_vmax; + checksum[vid][tid] += m_vsum; + checksum[vid][tid] += m_vmin; + checksum[vid][tid] += m_vmax; } void REDUCE3_INT::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/basic/TRAP_INT.cpp b/src/basic/TRAP_INT.cpp index 4c5eaaf14..8602716c4 100644 --- a/src/basic/TRAP_INT.cpp +++ b/src/basic/TRAP_INT.cpp @@ -76,7 +76,7 @@ void TRAP_INT::setUp(VariantID vid, size_t /*tid*/) void TRAP_INT::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += m_sumx; + checksum[vid][tid] += m_sumx; } void TRAP_INT::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/lcals/DIFF_PREDICT.cpp b/src/lcals/DIFF_PREDICT.cpp index 1ec58f0a9..a22d93726 100644 --- a/src/lcals/DIFF_PREDICT.cpp +++ b/src/lcals/DIFF_PREDICT.cpp @@ -66,7 +66,7 @@ void DIFF_PREDICT::setUp(VariantID vid, size_t /*tid*/) void DIFF_PREDICT::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_px, m_array_length); + checksum[vid][tid] += calcChecksum(m_px, m_array_length); } void DIFF_PREDICT::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/lcals/EOS.cpp b/src/lcals/EOS.cpp index 52a20b23f..e8065cad6 100644 --- a/src/lcals/EOS.cpp +++ b/src/lcals/EOS.cpp @@ -77,7 +77,7 @@ void EOS::setUp(VariantID vid, size_t /*tid*/) void EOS::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_x, getActualProblemSize(), checksum_scale_factor ); + checksum[vid][tid] += calcChecksum(m_x, getActualProblemSize(), checksum_scale_factor ); } void EOS::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/lcals/FIRST_DIFF.cpp b/src/lcals/FIRST_DIFF.cpp index 43f22a656..92343f10a 100644 --- a/src/lcals/FIRST_DIFF.cpp +++ b/src/lcals/FIRST_DIFF.cpp @@ -67,7 +67,7 @@ void FIRST_DIFF::setUp(VariantID vid, size_t /*tid*/) void FIRST_DIFF::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_x, getActualProblemSize()); + checksum[vid][tid] += calcChecksum(m_x, getActualProblemSize()); } void FIRST_DIFF::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/lcals/FIRST_MIN.cpp b/src/lcals/FIRST_MIN.cpp index 
5987de250..196084041 100644 --- a/src/lcals/FIRST_MIN.cpp +++ b/src/lcals/FIRST_MIN.cpp @@ -74,7 +74,7 @@ void FIRST_MIN::setUp(VariantID vid, size_t /*tid*/) void FIRST_MIN::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += static_cast<Checksum_type>(m_minloc); + checksum[vid][tid] += static_cast<Checksum_type>(m_minloc); } void FIRST_MIN::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/lcals/FIRST_SUM.cpp b/src/lcals/FIRST_SUM.cpp index 9e6011f70..8e9a61676 100644 --- a/src/lcals/FIRST_SUM.cpp +++ b/src/lcals/FIRST_SUM.cpp @@ -66,7 +66,7 @@ void FIRST_SUM::setUp(VariantID vid, size_t /*tid*/) void FIRST_SUM::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_x, getActualProblemSize()); + checksum[vid][tid] += calcChecksum(m_x, getActualProblemSize()); } void FIRST_SUM::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/lcals/GEN_LIN_RECUR.cpp b/src/lcals/GEN_LIN_RECUR.cpp index c8822e5d8..32c83e18d 100644 --- a/src/lcals/GEN_LIN_RECUR.cpp +++ b/src/lcals/GEN_LIN_RECUR.cpp @@ -75,7 +75,7 @@ void GEN_LIN_RECUR::setUp(VariantID vid, size_t /*tid*/) void GEN_LIN_RECUR::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_b5, getActualProblemSize(), checksum_scale_factor ); + checksum[vid][tid] += calcChecksum(m_b5, getActualProblemSize(), checksum_scale_factor ); } void GEN_LIN_RECUR::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/lcals/HYDRO_1D.cpp b/src/lcals/HYDRO_1D.cpp index 85cafd327..a28911a20 100644 --- a/src/lcals/HYDRO_1D.cpp +++ b/src/lcals/HYDRO_1D.cpp @@ -75,7 +75,7 @@ void HYDRO_1D::setUp(VariantID vid, size_t /*tid*/) void HYDRO_1D::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_x, getActualProblemSize(), checksum_scale_factor ); + checksum[vid][tid] += calcChecksum(m_x, getActualProblemSize(), checksum_scale_factor ); } void HYDRO_1D::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/lcals/HYDRO_2D.cpp b/src/lcals/HYDRO_2D.cpp index a03ff2cf4..0badd9d55 100644 --- a/src/lcals/HYDRO_2D.cpp +++ b/src/lcals/HYDRO_2D.cpp @@ -94,8 +94,8 @@ void HYDRO_2D::setUp(VariantID vid, size_t /*tid*/) void HYDRO_2D::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_zzout, m_array_length, checksum_scale_factor ); - checksum[vid] += calcChecksum(m_zrout, m_array_length, checksum_scale_factor ); + checksum[vid][tid] += calcChecksum(m_zzout, m_array_length, checksum_scale_factor ); + checksum[vid][tid] += calcChecksum(m_zrout, m_array_length, checksum_scale_factor ); } void HYDRO_2D::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/lcals/INT_PREDICT.cpp b/src/lcals/INT_PREDICT.cpp index a58537eed..b83844442 100644 --- a/src/lcals/INT_PREDICT.cpp +++ b/src/lcals/INT_PREDICT.cpp @@ -79,7 +79,7 @@ void INT_PREDICT::updateChecksum(VariantID vid, size_t tid) m_px[i] -= m_px_initval; } - checksum[vid] += calcChecksum(m_px, getActualProblemSize()); + checksum[vid][tid] += calcChecksum(m_px, getActualProblemSize()); } void INT_PREDICT::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/lcals/PLANCKIAN.cpp b/src/lcals/PLANCKIAN.cpp index 193f7d456..eda01cf2d 100644 --- a/src/lcals/PLANCKIAN.cpp +++ b/src/lcals/PLANCKIAN.cpp @@ -66,7 +66,7 @@ void PLANCKIAN::setUp(VariantID vid, size_t /*tid*/) void PLANCKIAN::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_w, getActualProblemSize()); + checksum[vid][tid] += calcChecksum(m_w, getActualProblemSize()); } void PLANCKIAN::tearDown(VariantID vid, size_t /*tid*/) diff --git 
a/src/lcals/TRIDIAG_ELIM.cpp b/src/lcals/TRIDIAG_ELIM.cpp index 57302d426..b097049a4 100644 --- a/src/lcals/TRIDIAG_ELIM.cpp +++ b/src/lcals/TRIDIAG_ELIM.cpp @@ -67,7 +67,7 @@ void TRIDIAG_ELIM::setUp(VariantID vid, size_t /*tid*/) void TRIDIAG_ELIM::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_xout, getActualProblemSize()); + checksum[vid][tid] += calcChecksum(m_xout, getActualProblemSize()); } void TRIDIAG_ELIM::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/polybench/POLYBENCH_2MM.cpp b/src/polybench/POLYBENCH_2MM.cpp index d9aaa2d09..80d1a9153 100644 --- a/src/polybench/POLYBENCH_2MM.cpp +++ b/src/polybench/POLYBENCH_2MM.cpp @@ -96,7 +96,7 @@ void POLYBENCH_2MM::setUp(VariantID vid, size_t /*tid*/) void POLYBENCH_2MM::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_D, m_ni * m_nl, checksum_scale_factor ); + checksum[vid][tid] += calcChecksum(m_D, m_ni * m_nl, checksum_scale_factor ); } void POLYBENCH_2MM::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/polybench/POLYBENCH_3MM.cpp b/src/polybench/POLYBENCH_3MM.cpp index f2ae2577e..dc012e81f 100644 --- a/src/polybench/POLYBENCH_3MM.cpp +++ b/src/polybench/POLYBENCH_3MM.cpp @@ -106,7 +106,7 @@ void POLYBENCH_3MM::setUp(VariantID vid, size_t /*tid*/) void POLYBENCH_3MM::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_G, m_ni * m_nl, checksum_scale_factor ); + checksum[vid][tid] += calcChecksum(m_G, m_ni * m_nl, checksum_scale_factor ); } void POLYBENCH_3MM::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/polybench/POLYBENCH_ADI.cpp b/src/polybench/POLYBENCH_ADI.cpp index 14802ebf8..226839431 100644 --- a/src/polybench/POLYBENCH_ADI.cpp +++ b/src/polybench/POLYBENCH_ADI.cpp @@ -79,7 +79,7 @@ void POLYBENCH_ADI::setUp(VariantID vid, size_t /*tid*/) void POLYBENCH_ADI::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_U, m_n * m_n, checksum_scale_factor ); + checksum[vid][tid] += calcChecksum(m_U, m_n * m_n, checksum_scale_factor ); } void POLYBENCH_ADI::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/polybench/POLYBENCH_ATAX.cpp b/src/polybench/POLYBENCH_ATAX.cpp index c2656a96f..d33c2fc83 100644 --- a/src/polybench/POLYBENCH_ATAX.cpp +++ b/src/polybench/POLYBENCH_ATAX.cpp @@ -82,7 +82,7 @@ void POLYBENCH_ATAX::setUp(VariantID vid, size_t /*tid*/) void POLYBENCH_ATAX::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_y, m_N, checksum_scale_factor ); + checksum[vid][tid] += calcChecksum(m_y, m_N, checksum_scale_factor ); } void POLYBENCH_ATAX::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/polybench/POLYBENCH_FDTD_2D.cpp b/src/polybench/POLYBENCH_FDTD_2D.cpp index 29e3d1d66..46c031b6a 100644 --- a/src/polybench/POLYBENCH_FDTD_2D.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D.cpp @@ -100,7 +100,7 @@ void POLYBENCH_FDTD_2D::setUp(VariantID vid, size_t /*tid*/) void POLYBENCH_FDTD_2D::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_hz, m_nx * m_ny, checksum_scale_factor); + checksum[vid][tid] += calcChecksum(m_hz, m_nx * m_ny, checksum_scale_factor); } void POLYBENCH_FDTD_2D::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp index 08ddd798c..0ff7c02c9 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp @@ -75,7 +75,7 @@ void POLYBENCH_FLOYD_WARSHALL::setUp(VariantID vid, size_t /*tid*/) 
void POLYBENCH_FLOYD_WARSHALL::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_pout, m_N*m_N, checksum_scale_factor ); + checksum[vid][tid] += calcChecksum(m_pout, m_N*m_N, checksum_scale_factor ); } void POLYBENCH_FLOYD_WARSHALL::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/polybench/POLYBENCH_GEMM.cpp b/src/polybench/POLYBENCH_GEMM.cpp index bf89f83ce..7f7d359d2 100644 --- a/src/polybench/POLYBENCH_GEMM.cpp +++ b/src/polybench/POLYBENCH_GEMM.cpp @@ -86,7 +86,7 @@ void POLYBENCH_GEMM::setUp(VariantID vid, size_t /*tid*/) void POLYBENCH_GEMM::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_C, m_ni * m_nj, checksum_scale_factor ); + checksum[vid][tid] += calcChecksum(m_C, m_ni * m_nj, checksum_scale_factor ); } void POLYBENCH_GEMM::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/polybench/POLYBENCH_GEMVER.cpp b/src/polybench/POLYBENCH_GEMVER.cpp index 48eec4ae7..32561df20 100644 --- a/src/polybench/POLYBENCH_GEMVER.cpp +++ b/src/polybench/POLYBENCH_GEMVER.cpp @@ -102,7 +102,7 @@ void POLYBENCH_GEMVER::setUp(VariantID vid, size_t /*tid*/) void POLYBENCH_GEMVER::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_w, m_n, checksum_scale_factor ); + checksum[vid][tid] += calcChecksum(m_w, m_n, checksum_scale_factor ); } void POLYBENCH_GEMVER::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/polybench/POLYBENCH_GESUMMV.cpp b/src/polybench/POLYBENCH_GESUMMV.cpp index cb0cf3389..ade819de7 100644 --- a/src/polybench/POLYBENCH_GESUMMV.cpp +++ b/src/polybench/POLYBENCH_GESUMMV.cpp @@ -76,7 +76,7 @@ void POLYBENCH_GESUMMV::setUp(VariantID vid, size_t /*tid*/) void POLYBENCH_GESUMMV::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_y, m_N); + checksum[vid][tid] += calcChecksum(m_y, m_N); } void POLYBENCH_GESUMMV::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/polybench/POLYBENCH_HEAT_3D.cpp b/src/polybench/POLYBENCH_HEAT_3D.cpp index fe603915b..d40de317b 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D.cpp @@ -87,8 +87,8 @@ void POLYBENCH_HEAT_3D::setUp(VariantID vid, size_t /*tid*/) void POLYBENCH_HEAT_3D::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_A, m_N*m_N*m_N, checksum_scale_factor ); - checksum[vid] += calcChecksum(m_B, m_N*m_N*m_N, checksum_scale_factor ); + checksum[vid][tid] += calcChecksum(m_A, m_N*m_N*m_N, checksum_scale_factor ); + checksum[vid][tid] += calcChecksum(m_B, m_N*m_N*m_N, checksum_scale_factor ); } void POLYBENCH_HEAT_3D::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/polybench/POLYBENCH_JACOBI_1D.cpp b/src/polybench/POLYBENCH_JACOBI_1D.cpp index 102d42cee..5743ca6e7 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.cpp @@ -84,8 +84,8 @@ void POLYBENCH_JACOBI_1D::setUp(VariantID vid, size_t /*tid*/) void POLYBENCH_JACOBI_1D::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_A, m_N, checksum_scale_factor ); - checksum[vid] += calcChecksum(m_B, m_N, checksum_scale_factor ); + checksum[vid][tid] += calcChecksum(m_A, m_N, checksum_scale_factor ); + checksum[vid][tid] += calcChecksum(m_B, m_N, checksum_scale_factor ); } void POLYBENCH_JACOBI_1D::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/polybench/POLYBENCH_JACOBI_2D.cpp b/src/polybench/POLYBENCH_JACOBI_2D.cpp index a3b924c46..c4c14edb8 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.cpp +++ 
b/src/polybench/POLYBENCH_JACOBI_2D.cpp @@ -86,8 +86,8 @@ void POLYBENCH_JACOBI_2D::setUp(VariantID vid, size_t /*tid*/) void POLYBENCH_JACOBI_2D::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_A, m_N*m_N, checksum_scale_factor ); - checksum[vid] += calcChecksum(m_B, m_N*m_N, checksum_scale_factor ); + checksum[vid][tid] += calcChecksum(m_A, m_N*m_N, checksum_scale_factor ); + checksum[vid][tid] += calcChecksum(m_B, m_N*m_N, checksum_scale_factor ); } void POLYBENCH_JACOBI_2D::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/polybench/POLYBENCH_MVT.cpp b/src/polybench/POLYBENCH_MVT.cpp index c27ec078a..498ca6c12 100644 --- a/src/polybench/POLYBENCH_MVT.cpp +++ b/src/polybench/POLYBENCH_MVT.cpp @@ -80,8 +80,8 @@ void POLYBENCH_MVT::setUp(VariantID vid, size_t /*tid*/) void POLYBENCH_MVT::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_x1, m_N, checksum_scale_factor ); - checksum[vid] += calcChecksum(m_x2, m_N, checksum_scale_factor ); + checksum[vid][tid] += calcChecksum(m_x1, m_N, checksum_scale_factor ); + checksum[vid][tid] += calcChecksum(m_x2, m_N, checksum_scale_factor ); } void POLYBENCH_MVT::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/stream/ADD.cpp b/src/stream/ADD.cpp index 11dfa7886..4a77a264b 100644 --- a/src/stream/ADD.cpp +++ b/src/stream/ADD.cpp @@ -67,7 +67,7 @@ void ADD::setUp(VariantID vid, size_t /*tid*/) void ADD::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_c, getActualProblemSize()); + checksum[vid][tid] += calcChecksum(m_c, getActualProblemSize()); } void ADD::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/stream/COPY.cpp b/src/stream/COPY.cpp index bb337dc68..f6802ccba 100644 --- a/src/stream/COPY.cpp +++ b/src/stream/COPY.cpp @@ -66,7 +66,7 @@ void COPY::setUp(VariantID vid, size_t /*tid*/) void COPY::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_c, getActualProblemSize()); + checksum[vid][tid] += calcChecksum(m_c, getActualProblemSize()); } void COPY::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/stream/DOT.cpp b/src/stream/DOT.cpp index 7352f7ebf..1b44ab70e 100644 --- a/src/stream/DOT.cpp +++ b/src/stream/DOT.cpp @@ -69,7 +69,7 @@ void DOT::setUp(VariantID vid, size_t /*tid*/) void DOT::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += m_dot; + checksum[vid][tid] += m_dot; } void DOT::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/stream/MUL.cpp b/src/stream/MUL.cpp index 8e216d647..042d3bb8a 100644 --- a/src/stream/MUL.cpp +++ b/src/stream/MUL.cpp @@ -67,7 +67,7 @@ void MUL::setUp(VariantID vid, size_t /*tid*/) void MUL::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_b, getActualProblemSize()); + checksum[vid][tid] += calcChecksum(m_b, getActualProblemSize()); } void MUL::tearDown(VariantID vid, size_t /*tid*/) diff --git a/src/stream/TRIAD.cpp b/src/stream/TRIAD.cpp index da43f1c6a..076104bcd 100644 --- a/src/stream/TRIAD.cpp +++ b/src/stream/TRIAD.cpp @@ -72,7 +72,7 @@ void TRIAD::setUp(VariantID vid, size_t /*tid*/) void TRIAD::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_a, getActualProblemSize(), checksum_scale_factor ); + checksum[vid][tid] += calcChecksum(m_a, getActualProblemSize(), checksum_scale_factor ); } void TRIAD::tearDown(VariantID vid, size_t /*tid*/) From 9a0ed48c8e6bba00f7658102589fd4ccb79c6ad8 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 23 Feb 2022 11:16:31 -0800 Subject: [PATCH 
216/392] fixup opt parsing --- src/common/RunParams.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 2cdeafc06..94f76da8d 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -328,7 +328,7 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) done = true; } else { got_someting = true; - int gpu_block_size = ::atoi( opt ); + int gpu_block_size = ::atoi( opt.c_str() ); if ( gpu_block_size <= 0 ) { std::cout << "\nBad input:" << " must give --gpu_block_size POSITIVE values (int)" From 00e75c87f6a91ad3fc9ec17cefd738d155b98fc2 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 23 Feb 2022 11:16:52 -0800 Subject: [PATCH 217/392] fixup tuning name print --- src/common/KernelBase.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index 3afcd8bf5..ccc1bb3a7 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -266,7 +266,7 @@ void KernelBase::print(std::ostream& os) const os << "\t\t\t\t" << getVariantName(static_cast<VariantID>(j)) << " :" << std::endl; for (size_t t = 0; t < variant_tuning_names[j].size(); ++t) { - os << "\t\t\t\t\t" << getTuningName(static_cast<VariantID>(j), t) + os << "\t\t\t\t\t" << getVariantTuningName(static_cast<VariantID>(j), t) << std::endl; } } From 08b6eb907f6ec236779a35c8f8464bee28f0db72 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 23 Feb 2022 11:17:09 -0800 Subject: [PATCH 218/392] fixup COUPLE runKernel --- src/apps/WIP-COUPLE.cpp | 7 ++++--- src/apps/WIP-COUPLE.hpp | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/apps/WIP-COUPLE.cpp b/src/apps/WIP-COUPLE.cpp index ec1958299..32a7151a4 100644 --- a/src/apps/WIP-COUPLE.cpp +++ b/src/apps/WIP-COUPLE.cpp @@ -80,8 +80,9 @@ void COUPLE::setUp(VariantID vid, size_t /*tid*/) m_ireal = Complex_type(0.0, 1.0); } -void COUPLE::runKernel(VariantID vid) +void COUPLE::runKernel(VariantID vid, size_t tid) { + RAJA_UNUSED_VAR(tid); const Index_type run_reps = getRunReps(); COUPLE_DATA_SETUP; @@ -158,7 +159,7 @@ void COUPLE::runKernel(VariantID vid) case Base_OpenMPTarget : case RAJA_OpenMPTarget : { - runOpenMPTargetVariant(vid); + runOpenMPTargetVariant(vid, tid); break; } #endif @@ -167,7 +168,7 @@ void COUPLE::runKernel(VariantID vid) case Base_CUDA : case RAJA_CUDA : { - runCudaVariant(vid); + runCudaVariant(vid, tid); break; } #endif diff --git a/src/apps/WIP-COUPLE.hpp b/src/apps/WIP-COUPLE.hpp index c0aa77963..3caa1bbb6 100644 --- a/src/apps/WIP-COUPLE.hpp +++ b/src/apps/WIP-COUPLE.hpp @@ -162,7 +162,7 @@ class COUPLE : public KernelBase ~COUPLE(); void setUp(VariantID vid, size_t tid); - void runKernel(VariantID vid); + void runKernel(VariantID vid, size_t tid); void updateChecksum(VariantID vid, size_t tid); void tearDown(VariantID vid, size_t tid); From f9e8090f1a1db96f9d27462fde1468f91fc75b79 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 23 Feb 2022 11:28:04 -0800 Subject: [PATCH 219/392] Fixup missing resize in writeChecksumReport --- src/common/Executor.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index 447af3511..b749ba36b 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -1400,6 +1400,7 @@ void Executor::writeChecksumReport(ostream& file) size_t num_tunings = kernels[ik]->getNumVariantTunings(variant_ids[iv]); checksums[iv].resize(num_tunings, 0.0); + checksums_diff[iv].resize(num_tunings, 0.0); for (size_t tid = 0; tid < 
num_tunings; ++tid) { if ( kern->wasVariantTuningRun(vid, tid) ) { checksums[iv][tid] = kern->getChecksum(vid, tid); From 8db2466ee36caf862dac22099c25d636bbde9bd6 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 23 Feb 2022 11:33:09 -0800 Subject: [PATCH 220/392] Reduce number of lines printed --- src/common/Executor.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index b749ba36b..ce2f31bd1 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -643,14 +643,11 @@ void Executor::reportRunSummary(ostream& str) const str << "\nThe following kernels and variants (when available for a kernel) will be run:" << endl; - str << "\nVariants" + str << "\nVariants and Tunings" << "\n--------\n"; for (size_t iv = 0; iv < variant_ids.size(); ++iv) { - str << getVariantName(variant_ids[iv]) << endl; - - str << "\n\tTunings\n"; for (std::string const& tuning_name : tuning_names[variant_ids[iv]]) { - str << "\t" << tuning_name << endl; + str << getVariantName(variant_ids[iv]) << "-" << tuning_name<< endl; } } From dc2a77188adfd6a1d86944a01542e8592bdc2a1d Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 23 Feb 2022 12:00:43 -0800 Subject: [PATCH 221/392] Simplify GPUUtils for c++14 --- src/common/GPUUtils.hpp | 104 ---------------------------------------- 1 file changed, 104 deletions(-) diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp index d4f011bd7..3dadda4b8 100644 --- a/src/common/GPUUtils.hpp +++ b/src/common/GPUUtils.hpp @@ -45,19 +45,6 @@ constexpr size_t lesser_of_squarest_factor_pair_helper(size_t n, size_t guess) : lesser_of_squarest_factor_pair_helper(n, guess - 1); // continue searching } -// helpers to invoke f with each integer in the param pack -template < typename F > -bool invoke_or_helper(F) -{ - return false; -} -/// -template < typename F, size_t I, size_t... Is> -bool invoke_or_helper(F f) -{ - return f(camp::int_seq()) || invoke_or_helper(f); -} - // class to get the size of a camp::int_seq template < typename IntSeq > struct SizeOfIntSeq; @@ -105,7 +92,6 @@ struct remove_invalid> >::type; }; - } // namespace detail // constexpr integer sqrt @@ -131,96 +117,6 @@ constexpr size_t greater_of_squarest_factor_pair(size_t n) : n / detail::lesser_of_squarest_factor_pair_helper(n, sqrt(n)); } -// call f's call operator with each integer as the template param in turn -// stopping at the first integer that returns true. -// return true if any f() returns true, otherwise return false -template < typename F, size_t... 
Is > -bool invoke_or(F f, camp::int_seq) -{ - return detail::invoke_or_helper(f); -} - -// if the given integer is the same as the template param block_size -// returns true otherwise returns false -struct Equals -{ - Equals(size_t actual_gpu_block_size) - : m_actual_gpu_block_size(actual_gpu_block_size) - {} - - template < size_t block_size > - bool operator()(camp::int_seq) const - { return m_actual_gpu_block_size == block_size; } - -private: - size_t m_actual_gpu_block_size; -}; - -// if the kernel's actual block size is the same as the template param -// runs the cuda variant with the template param block_size and returns true -// otherwise returns false -template < typename Kernel > -struct RunCudaBlockSize -{ - RunCudaBlockSize(Kernel& kernel, VariantID vid) - : m_kernel(kernel), m_vid(vid) - {} - - template < size_t block_size > - bool operator()(camp::int_seq) const - { - if (block_size == m_kernel.getActualGPUBlockSize()) { - m_kernel.template runCudaVariantImpl(m_vid); - return true; - } - return false; - } - -private: - Kernel& m_kernel; - VariantID m_vid; -}; - -// if the kernel's actual block size is the same as the template param -// runs the hip variant with the template param block_size and returns true -// otherwise returns false -template < typename Kernel > -struct RunHipBlockSize -{ - RunHipBlockSize(Kernel& kernel, VariantID vid) - : m_kernel(kernel), m_vid(vid) - {} - - template < size_t block_size > - bool operator()(camp::int_seq) const - { - if (block_size == m_kernel.getActualGPUBlockSize()) { - m_kernel.template runHipVariantImpl(m_vid); - return true; - } - return false; - } - -private: - Kernel& m_kernel; - VariantID m_vid; -}; - -// return default_I if it is in sizes or the first integer in sizes otherwise -template < size_t I, size_t... 
Is > -inline size_t get_default_or_first(size_t default_I, camp::int_seq sizes) -{ - if (invoke_or(Equals(default_I), sizes)) { - return default_I; - } - return I; -} -/// base case when sizes is empty -inline size_t get_default_or_first(size_t, camp::int_seq) -{ - return 0; -} - // always true struct AllowAny { From 53cb65c654d3c6fb522997e63f3fa80985704bd9 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 23 Feb 2022 12:35:53 -0800 Subject: [PATCH 222/392] Fix speedup output --- src/common/Executor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index ce2f31bd1..527c52aa2 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -992,7 +992,7 @@ void Executor::writeCSVReport(ostream& file, CSVRepMode mode, std::string const& tuning_name = tuning_names[variant_ids[iv]][it]; file << sepchr <hasVariantTuningDefined(reference_vid, tuning_name) || + (!kern->hasVariantTuningDefined(reference_vid, reference_tid) || !kern->hasVariantTuningDefined(vid, tuning_name)) ) { file << "Not run"; } else if ( (mode == CSVRepMode::Timing) && From 684ab79708bf1a2d3a4566a1b01165382059bea0 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 23 Feb 2022 13:13:21 -0800 Subject: [PATCH 223/392] Set reference tid --- src/common/Executor.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index 527c52aa2..496360037 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -449,6 +449,7 @@ void Executor::setupSuite() run_var.insert( vid ); if ( getVariantName(vid) == run_params.getReferenceVariant() ) { reference_vid = vid; + reference_tid = 0; } } } @@ -458,6 +459,7 @@ void Executor::setupSuite() // if ( run_params.getReferenceVariant().empty() && !run_var.empty() ) { reference_vid = *run_var.begin(); + reference_tid = 0; } } else { @@ -485,6 +487,7 @@ void Executor::setupSuite() run_var.insert(vid); if ( getVariantName(vid) == run_params.getReferenceVariant() ) { reference_vid = vid; + reference_tid = 0; } } found_it = true; @@ -499,6 +502,7 @@ void Executor::setupSuite() // if ( run_params.getReferenceVariant().empty() && !run_var.empty() ) { reference_vid = *run_var.begin(); + reference_tid = 0; } run_params.setInvalidVariantInput(invalid); From f18fbd1a26460d373754ee0abea2dca3deb76daf Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 23 Feb 2022 13:14:11 -0800 Subject: [PATCH 224/392] Define static unknown tid --- src/common/Executor.cpp | 2 +- src/common/KernelBase.cpp | 4 ++-- src/common/KernelBase.hpp | 6 ++++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index 496360037..fa5adf5e5 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -43,7 +43,7 @@ using namespace std; Executor::Executor(int argc, char** argv) : run_params(argc, argv), reference_vid(NumVariants), - reference_tid(std::numeric_limits::max()) + reference_tid(KernelBase::getUnknownTuningIdx()) { } diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index ccc1bb3a7..a64604e39 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -36,7 +36,7 @@ KernelBase::KernelBase(KernelID kid, const RunParams& params) : FLOPs_per_rep = -1; running_variant = NumVariants; - running_tuning = std::numeric_limits::max(); + running_tuning = getUnknownTuningIdx(); checksum_scale_factor = 1.0; } @@ -162,7 +162,7 @@ void KernelBase::execute(VariantID vid, size_t tid) this->tearDown(vid, 
tid); running_variant = NumVariants; - running_tuning = std::numeric_limits<size_t>::max(); + running_tuning = getUnknownTuningIdx(); } void KernelBase::recordExecTime() diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 5465c5aa5..0226c6ebb 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -43,6 +43,8 @@ namespace rajaperf { class KernelBase { public: + static constexpr size_t getUnknownTuningIdx() + { return std::numeric_limits<size_t>::max(); } KernelBase(KernelID kid, const RunParams& params); virtual ~KernelBase(); @@ -129,7 +131,7 @@ class KernelBase for (std::string const& a_tuning_name : getVariantTuningNames(vid)) { if (tuning_name == a_tuning_name) { return t; } } - return std::numeric_limits<size_t>::max(); + return getUnknownTuningIdx(); } size_t getNumVariantTunings(VariantID vid) const { return variant_tuning_names[vid].size(); } @@ -144,7 +146,7 @@ class KernelBase // bool wasVariantTuningRun(VariantID vid, size_t tid) const { - if (tid != std::numeric_limits<size_t>::max()) { + if (tid != getUnknownTuningIdx()) { return num_exec[vid].at(tid) > 0; } return false; From a2360247a0e7b321a59e6861692b66a515ffbe69 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 23 Feb 2022 13:14:32 -0800 Subject: [PATCH 225/392] Always put default tuning first --- src/common/Executor.cpp | 8 ++++++++ src/common/KernelBase.hpp | 12 +++++++----- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index fa5adf5e5..221952596 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -572,6 +572,14 @@ void Executor::setupSuite() for (auto const& tuning_name_idx_pair : tuning_names_order_map) { tuning_names[vid][tuning_name_idx_pair.second] = tuning_name_idx_pair.first; } + // reorder to put "default" first + auto default_order_iter = tuning_names_order_map.find(KernelBase::getDefaultTuningName()); + if (default_order_iter != tuning_names_order_map.end()) { + size_t default_idx = default_order_iter->second; + std::string default_name = std::move(tuning_names[vid][default_idx]); + tuning_names[vid].erase(tuning_names[vid].begin()+default_idx); + tuning_names[vid].emplace(tuning_names[vid].begin(), std::move(default_name)); + } } // diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 0226c6ebb..d23d1a6fb 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -45,6 +45,8 @@ class KernelBase public: static constexpr size_t getUnknownTuningIdx() { return std::numeric_limits<size_t>::max(); } + static std::string getDefaultTuningName() { return "default"; } + KernelBase(KernelID kid, const RunParams& params); virtual ~KernelBase(); @@ -71,22 +73,22 @@ class KernelBase { variant_tuning_names[vid].emplace_back(std::move(name)); } virtual void setSeqTuningDefinitions(VariantID vid) - { addVariantTuningName(vid, "default"); } + { addVariantTuningName(vid, getDefaultTuningName()); } #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) virtual void setOpenMPTuningDefinitions(VariantID vid) - { addVariantTuningName(vid, "default"); } + { addVariantTuningName(vid, getDefaultTuningName()); } #endif #if defined(RAJA_ENABLE_CUDA) virtual void setCudaTuningDefinitions(VariantID vid) - { addVariantTuningName(vid, "default"); } + { addVariantTuningName(vid, getDefaultTuningName()); } #endif #if defined(RAJA_ENABLE_HIP) virtual void setHipTuningDefinitions(VariantID vid) - { addVariantTuningName(vid, "default"); } + { addVariantTuningName(vid, getDefaultTuningName()); } #endif #if 
defined(RAJA_ENABLE_TARGET_OPENMP) virtual void setOpenMPTargetTuningDefinitions(VariantID vid) - { addVariantTuningName(vid, "default"); } + { addVariantTuningName(vid, getDefaultTuningName()); } #endif // From f25b4d227dd4a8cab15cb5dea6a24c8823563fc6 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 23 Feb 2022 13:41:22 -0800 Subject: [PATCH 226/392] Fix timing output The timing output was printing the same times for different tunings. Fix getVariantTuningIndex to return the correct index instead of always returning 0. --- src/common/KernelBase.hpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index d23d1a6fb..d751366d4 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -129,16 +129,17 @@ class KernelBase } size_t getVariantTuningIndex(VariantID vid, std::string const& tuning_name) const { - size_t t = 0; - for (std::string const& a_tuning_name : getVariantTuningNames(vid)) { + std::vector const& tuning_names = getVariantTuningNames(vid); + for (size_t t = 0; t < tuning_names.size(); ++t) { + std::string const& a_tuning_name = tuning_names[t]; if (tuning_name == a_tuning_name) { return t; } } return getUnknownTuningIdx(); } size_t getNumVariantTunings(VariantID vid) const - { return variant_tuning_names[vid].size(); } + { return getVariantTuningNames(vid).size(); } std::string const& getVariantTuningName(VariantID vid, size_t tid) const - { return variant_tuning_names[vid].at(tid); } + { return getVariantTuningNames(vid).at(tid); } std::vector const& getVariantTuningNames(VariantID vid) const { return variant_tuning_names[vid]; } From 97ce953cd856624181d9a50d6fbad224903d0941 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 23 Feb 2022 14:34:56 -0800 Subject: [PATCH 227/392] Update RAJA and CI stuff --- .gitlab-ci.yml | 55 ++++++++++---- .gitlab/corona-jobs.yml | 27 ++++--- .gitlab/corona-templates.yml | 39 +++------- .gitlab/lassen-jobs.yml | 62 +++++++-------- .gitlab/lassen-templates.yml | 15 ++-- .gitlab/ruby-jobs.yml | 31 +++++--- .gitlab/ruby-templates.yml | 21 +++--- Dockerfile | 141 ++++++++++++++++++++--------------- azure-pipelines.yml | 112 +++++++++++++++++----------- tpl/RAJA | 2 +- 10 files changed, 289 insertions(+), 216 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a03a04416..77393bc9a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,4 +1,4 @@ -############################################################################## +############################################################################### # Copyright (c) 2016-2020, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJA/COPYRIGHT file for details. # @@ -10,23 +10,23 @@ # at Lawrence Livermore National Laboratory (LLNL). # # This entire pipeline is LLNL-specific -############################################################################## +# ############################################################################# # We define the following GitLab pipeline variables: # # GIT_SUBMODULE_STRATEGY: # Tells Gitlab to recursively update the submodules when cloning umpire - +# # ALLOC_NAME: # On LLNL's ruby, this pipeline creates only one allocation shared among jobs # in order to save time and resources. This allocation has to be uniquely named # so that we are sure to retrieve it. - +# # BUILD_ROOT: # The path to the shared resources between all jobs. 
The BUILD_ROOT is unique to # the pipeline, preventing any form of concurrency with other pipelines. This # also means that the BUILD_ROOT directory will never be cleaned. - +# # DEFAULT_TIME: # Default time to let the Lassen jobs run will be 30 minutes. However, if it is # a job that requires more time, it will be overwritten in the lassen template @@ -38,6 +38,7 @@ variables: ALLOC_NAME: ${CI_PROJECT_NAME}_ci_${CI_PIPELINE_ID} BUILD_ROOT: ${CI_PROJECT_DIR} DEFAULT_TIME: 30 + MP_BRANCH: "develop" # Normally, stages are blocking in Gitlab. However, using the keyword "needs" we # can express dependencies between job that break the ordering of stages, in @@ -49,9 +50,9 @@ stages: - r_build_and_test - r_release_resources - l_build_and_test - - c_allocate_resources - - c_build_and_test - - c_release_resources + - b_build_and_test +# - c_build_and_test + - multi_project # This is the rules that drives the activation of "advanced" jobs. All advanced # jobs will share this through a template mechanism. @@ -68,25 +69,53 @@ stages: - export JOBID=$(squeue -h --name=${ALLOC_NAME} --format=%A) - echo ${JOBID} - srun $( [[ -n "${JOBID}" ]] && echo "--jobid=${JOBID}" ) -t ${DEFAULT_TIME} -N 1 scripts/gitlab/build_and_test.sh + artifacts: + reports: + junit: junit.xml -.build_toss_3_x86_64_ib_corona_script: - script: - - srun -p mi60 -t 30 -N 1 scripts/gitlab/build_and_test.sh +#.build_toss_3_x86_64_ib_corona_script: +# script: +# - srun -p mi60 -t 30 -N 1 scripts/gitlab/build_and_test.sh # Lassen and Butte use a different job scheduler (spectrum lsf) that does not # allow pre-allocation the same way slurm does. .build_blueos_3_ppc64le_ib_script: script: - lalloc 1 -W ${DEFAULT_TIME} scripts/gitlab/build_and_test.sh + artifacts: + reports: + junit: junit.xml + +.build_blueos_3_ppc64le_ib_ats_disabled_script: + script: + - lalloc 1 --atsdisable -W ${DEFAULT_TIME} scripts/gitlab/build_and_test.sh + artifacts: + reports: + junit: junit.xml .build_blueos_3_ppc64le_ib_p9_script: extends: .build_blueos_3_ppc64le_ib_script +# If testing develop branch, trigger CHAI pipeline with this version of RAJA. +# TODO: Once spack allows to clone a specific commit on demand, then point to the exact commit. +# This will prevent from sticking to a branch (here develop). +# MP_BRANCH is short for "Multi-Project Branch" and will usually be develop. +trigger-rajaperf: + stage: multi_project + rules: + - if: '$CI_COMMIT_BRANCH == "${MP_BRANCH}" || $MULTI_PROJECT == "ON"' #run only if ... + variables: + UPDATE_RAJA: ${MP_BRANCH} + trigger: + project: radiuss/rajaperf + branch: develop + strategy: depend + # This is where jobs are included. include: - local: .gitlab/ruby-templates.yml - local: .gitlab/ruby-jobs.yml - local: .gitlab/lassen-templates.yml - local: .gitlab/lassen-jobs.yml - - local: .gitlab/corona-templates.yml - - local: .gitlab/corona-jobs.yml +# - local: .gitlab/corona-templates.yml +# - local: .gitlab/corona-jobs.yml diff --git a/.gitlab/corona-jobs.yml b/.gitlab/corona-jobs.yml index 17b5b2348..2a60c9c64 100644 --- a/.gitlab/corona-jobs.yml +++ b/.gitlab/corona-jobs.yml @@ -1,21 +1,26 @@ ############################################################################# -## Copyright (c) 2016-21, Lawrence Livermore National Security, LLC -## and RAJA project contributors. See the RAJA/COPYRIGHT file for details. 
-## -## SPDX-License-Identifier: (BSD-3-Clause) -############################################################################## +# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################# + +hip_4_1_gcc_8_1_0 (build and test on corona): + variables: + SPEC: "+rocm~openmp amdgpu_target=gfx906 %gcc@8.1.0 cxxflags='-finline-functions -finline-limit=20000' cflags='-finline-functions -finline-limit=20000' ^hip@4.1.0" + extends: .build_and_test_on_corona -hip_4_1_gcc_8_1_0: +hip_4_2_gcc_8_1_0 (build and test on corona): variables: - SPEC: "+hip~openmp %gcc@8.1.0 cxxflags='-finline-functions -finline-limit=20000' cflags='-finline-functions -finline-limit=20000' ^hip@4.1.0" + SPEC: "+rocm~openmp amdgpu_target=gfx906 %gcc@8.1.0 cxxflags='-finline-functions -finline-limit=20000' cflags='-finline-functions -finline-limit=20000' ^hip@4.2.0" extends: .build_and_test_on_corona -hip_4_1_clang_9_0_0: +hip_4_1_clang_9_0_0 (build and test on corona): variables: - SPEC: "+hip~openmp %clang@9.0.0 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 ^hip@4.1.0" + SPEC: "+rocm~openmp amdgpu_target=gfx906 %clang@9.0.0 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 ^hip@4.1.0" extends: .build_and_test_on_corona -hip_4_2_gcc_8_1_0: +hip_4_1_gcc_8_1_0_desul_atomics (build and test on corona): variables: - SPEC: "+hip~openmp %gcc@8.1.0 cxxflags='-finline-functions -finline-limit=20000' cflags='-finline-functions -finline-limit=20000' ^hip@4.2.0" + SPEC: "+rocm~openmp +desul amdgpu_target=gfx906 %gcc@8.1.0 cxxflags='-finline-functions -finline-limit=20000' cflags='-finline-functions -finline-limit=20000' ^hip@4.1.0" extends: .build_and_test_on_corona diff --git a/.gitlab/corona-templates.yml b/.gitlab/corona-templates.yml index abcc06fbf..4c4ce2883 100644 --- a/.gitlab/corona-templates.yml +++ b/.gitlab/corona-templates.yml @@ -1,11 +1,11 @@ ############################################################################# -## Copyright (c) 2016-21, Lawrence Livermore National Security, LLC -## and RAJA project contributors. See the RAJA/COPYRIGHT file for details. -## -## SPDX-License-Identifier: (BSD-3-Clause) -############################################################################## +# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################# -### +#### # This is the share configuration of jobs for corona #### @@ -15,34 +15,13 @@ - shell - corona rules: - - if: '$CI_COMMIT_BRANCH =~ /_cnone/ || $ON_CORONA == "OFF"' #run except if ... + - if: '$ON_CORONA == "OFF"' #run except if ... 
when: never - if: '$CI_JOB_NAME =~ /release_resources/' when: always - when: on_success -### -## In pre-build phase, allocate a node for builds -## NOTE: Not specifying 'salloc -c 56' should allocate the max number of CPU cores -allocate_resources (on corona): - variables: - GIT_STRATEGY: none - extends: .on_corona - stage: c_allocate_resources - script: - - salloc -N 1 -pmi60 -t 45 --no-shell --job-name=${ALLOC_NAME} - -### -# In post-build phase, deallocate resources -# Note : make sure this is run even on build phase failure -release_resources (on corona): - variables: - GIT_STRATEGY: none - extends: .on_corona - stage: c_release_resources - script: - - export JOBID=$(squeue -h --name=${ALLOC_NAME} --format=%A) - - ([[ -n "${JOBID}" ]] && scancel ${JOBID}) +#### # Generic corona build job, extending build script .build_and_test_on_corona: stage: c_build_and_test @@ -51,4 +30,4 @@ release_resources (on corona): .build_and_test_on_corona_advanced: extends: [.build_and_test_on_corona, .advanced_pipeline] - stage: c_build_and_test + diff --git a/.gitlab/lassen-jobs.yml b/.gitlab/lassen-jobs.yml index 8554ff1ad..ccdf55c85 100644 --- a/.gitlab/lassen-jobs.yml +++ b/.gitlab/lassen-jobs.yml @@ -1,22 +1,22 @@ ############################################################################## -## Copyright (c) 2016-21, Lawrence Livermore National Security, LLC -## and RAJA project contributors. See the RAJA/COPYRIGHT file for details. -## -## SPDX-License-Identifier: (BSD-3-Clause) -############################################################################### +# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################## -########### -## CPU ONLY -########### +########## +# CPU ONLY +########## -clang_11_0_0: +ibm_clang_9: variables: - SPEC: "%clang@11.0.0" + SPEC: "%clang@ibm.9.0.0" extends: .build_and_test_on_lassen -clang_11_gcc_8: +ibm_clang_9_gcc_8: variables: - SPEC: "%clang@11.0.0 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1" + SPEC: "%clang@ibm.9.0.0 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1" extends: .build_and_test_on_lassen gcc_8_3_1: @@ -37,17 +37,17 @@ xl_16_1_1_7_gcc_8_3_1: extends: .build_and_test_on_lassen ########## -## CUDA -########### +# CUDA +########## -clang_11_cuda: +ibm_clang_9_cuda: variables: - SPEC: "+cuda cuda_arch=70 %clang@11.0.0 ^cuda@10.1.168" + SPEC: "+cuda cuda_arch=70 %clang@ibm.9.0.0 ^cuda@10.1.168" extends: .build_and_test_on_lassen -clang_11_gcc_8_cuda: +ibm_clang_10_cuda: variables: - SPEC: "+cuda %clang@11.0.0 cuda_arch=70 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 ^cuda@10.1.168" + SPEC: "+cuda cuda_arch=70 %clang@ibm.10.0.1 ^cuda@10.1.168" extends: .build_and_test_on_lassen gcc_8_3_1_cuda: @@ -55,6 +55,11 @@ gcc_8_3_1_cuda: SPEC: "+cuda %gcc@8.3.1 cuda_arch=70 ^cuda@10.1.168" extends: .build_and_test_on_lassen +gcc_8_3_1_cuda_ats_disabled: + variables: + SPEC: "+cuda %gcc@8.3.1 cuda_arch=70 ^cuda@10.1.168" + extends: .build_and_test_on_lassen_ats_disabled + xl_16_1_1_7_cuda: variables: SPEC: "+cuda %xl@16.1.1.7 cuda_arch=70 ^cuda@10.1.168 ^cmake@3.14.5" @@ -70,26 +75,21 @@ xl_16_1_1_7_gcc_8_3_1_cuda_11: extends: .build_and_test_on_lassen ########## -## 
EXTRAS +# EXTRAS ########## -xl_16_1_1_7_omp_target (build and test on lassen): - variables: - SPEC: "%xl@16.1.1.7+openmp+openmp_target ^cmake@3.14.5" - extends: .build_and_test_on_lassen - -clang_11_0_0_omp_target (build and test on lassen): +clang_9_0_0_libcpp (build and test on lassen): variables: - SPEC: "%clang@11.0.0+openmp+openmp_target ^cmake@3.14.5" + SPEC: "%clang@9.0.0+libcpp" extends: .build_and_test_on_lassen -clang_11_0_0_libcpp (build and test on lassen): +clang_9_0_0_memleak (build and test on lassen): variables: - SPEC: "%clang@11.0.0+libcpp" + SPEC: "%clang@9.0.0 cxxflags=-fsanitize=address" + ASAN_OPTIONS: "detect_leaks=1" extends: .build_and_test_on_lassen -clang_11_0_0_memleak (build and test on lassen): +gcc_8_3_1_cuda_desul_atomics: variables: - SPEC: "%clang@11.0.0 cxxflags=-fsanitize=address" - ASAN_OPTIONS: "detect_leaks=1" + SPEC: "+cuda +desul %gcc@8.3.1 cuda_arch=70 ^cuda@10.1.168" extends: .build_and_test_on_lassen diff --git a/.gitlab/lassen-templates.yml b/.gitlab/lassen-templates.yml index aa3027b48..dbc340f22 100644 --- a/.gitlab/lassen-templates.yml +++ b/.gitlab/lassen-templates.yml @@ -1,9 +1,9 @@ ############################################################################## -## Copyright (c) 2016-21, Lawrence Livermore National Security, LLC -## and RAJA project contributors. See the RAJA/COPYRIGHT file for details. -## -## SPDX-License-Identifier: (BSD-3-Clause) -############################################################################### +# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################## #### # Shared configuration of jobs for lassen @@ -22,6 +22,11 @@ extends: [.build_blueos_3_ppc64le_ib_p9_script, .on_lassen] needs: [] +.build_and_test_on_lassen_ats_disabled: + stage: l_build_and_test + extends: [.build_blueos_3_ppc64le_ib_ats_disabled_script, .on_lassen] + needs: [] + # Note: .build_and_test_on_lassen_advanced inherits from # .build_and_test_on_lassen and .advanced_pileline. # In particular, the rules section will be merged. Careful when changing rules. diff --git a/.gitlab/ruby-jobs.yml b/.gitlab/ruby-jobs.yml index d8e8e95c8..2b6cceb5c 100644 --- a/.gitlab/ruby-jobs.yml +++ b/.gitlab/ruby-jobs.yml @@ -1,9 +1,9 @@ ############################################################################## -## Copyright (c) 2016-21, Lawrence Livermore National Security, LLC -## and RAJA project contributors. See the RAJA/COPYRIGHT file for details. -## -## SPDX-License-Identifier: (BSD-3-Clause) -############################################################################### +# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################## clang_10: variables: @@ -21,9 +21,15 @@ gcc_8_1_0: DEFAULT_TIME: 60 extends: .build_and_test_on_ruby -icpc_17_0_2: +#icpc_17_0_2: +# variables: +# SPEC: "%intel@17.0.2" +# DEFAULT_TIME: 40 +# extends: .build_and_test_on_ruby + +icpc_18_0_2: variables: - SPEC: "%intel@17.0.2" + SPEC: " tests=none %intel@18.0.2" DEFAULT_TIME: 40 extends: .build_and_test_on_ruby @@ -35,8 +41,13 @@ icpc_19_1_0: # EXTRAS -gcc_4_9_3: +#gcc_4_9_3: +# variables: +# SPEC: "%gcc@4.9.3" +# DEFAULT_TIME: 60 +# extends: .build_and_test_on_ruby + +clang_10_desul_atomics: variables: - SPEC: "%gcc@4.9.3" - DEFAULT_TIME: 60 + SPEC: "+openmp +desul %clang@10.0.1 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1" extends: .build_and_test_on_ruby diff --git a/.gitlab/ruby-templates.yml b/.gitlab/ruby-templates.yml index cdbcd60e8..b1314534b 100644 --- a/.gitlab/ruby-templates.yml +++ b/.gitlab/ruby-templates.yml @@ -1,9 +1,9 @@ ############################################################################## -## Copyright (c) 2016-21, Lawrence Livermore National Security, LLC -## and RAJA project contributors. See the RAJA/COPYRIGHT file for details. -## -## SPDX-License-Identifier: (BSD-3-Clause) -############################################################################### +# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################## #### # This is the shared configuration of jobs for ruby @@ -15,13 +15,13 @@ - shell - ruby rules: - - if: '$CI_COMMIT_BRANCH =~ /_rnone/ || $ON_RUBY == "OFF"' #run except if ... + - if: '$CI_COMMIT_BRANCH =~ /_qnone/ || $ON_RUBY == "OFF"' #run except if ... when: never - if: '$CI_JOB_NAME =~ /release_resources/' when: always - when: on_success -### +#### # In pre-build phase, allocate a node for builds # NOTE: Not specifying 'salloc -c 56' should allocate the max number of CPU cores allocate_resources (on ruby): @@ -32,9 +32,9 @@ allocate_resources (on ruby): script: - salloc -N 1 -p pdebug -t 45 --no-shell --job-name=${ALLOC_NAME} -### +#### # In post-build phase, deallocate resources -# Note : make sure this is run even on build phase failure (see "rules:" in ".on_ruby:"). +# Note : make sure this is run even on build phase failure release_resources (on ruby): variables: GIT_STRATEGY: none @@ -44,7 +44,7 @@ release_resources (on ruby): - export JOBID=$(squeue -h --name=${ALLOC_NAME} --format=%A) - ([[ -n "${JOBID}" ]] && scancel ${JOBID}) -### +#### # Generic ruby build job, extending build script .build_and_test_on_ruby: extends: [.build_toss_3_x86_64_ib_script, .on_ruby] @@ -52,4 +52,3 @@ release_resources (on ruby): .build_and_test_on_ruby_advanced: extends: [.build_and_test_on_ruby, .advanced_pipeline] - stage: r_build_and_test diff --git a/Dockerfile b/Dockerfile index 4dbf8e95c..0e03f9694 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,87 +1,104 @@ ############################################################################### -# Copyright (c) 2016-21, Lawrence Livermore National Security, LLC -# and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +# and RAJA project contributors. 
See the RAJA/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -FROM axom/compilers:gcc-5 AS gcc5 +FROM ghcr.io/rse-ops/gcc-ubuntu-20.04:gcc-7.3.0 AS gcc7 ENV GTEST_COLOR=1 -COPY --chown=axom:axom . /home/axom/workspace -WORKDIR /home/axom/workspace -RUN ls -RUN mkdir build && cd build && cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_WARNINGS=On -DENABLE_OPENMP=On -DRAJA_DEPRECATED_TESTS=On .. -RUN cd build && make -j 16 -RUN cd build && ./bin/raja-perf.exe -sp +COPY . /home/raja/workspace +WORKDIR /home/raja/workspace/build +RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_TBB=On -DRAJA_DEPRECATED_TESTS=On .. && \ + make -j 6 &&\ + ctest -T test --output-on-failure -FROM axom/compilers:gcc-6 AS gcc6 +FROM ghcr.io/rse-ops/gcc-ubuntu-20.04:gcc-8.1.0 AS gcc8 ENV GTEST_COLOR=1 -COPY --chown=axom:axom . /home/axom/workspace -WORKDIR /home/axom/workspace -RUN mkdir build && cd build && cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_WARNINGS=On -DENABLE_OPENMP=On -DRAJA_ENABLE_RUNTIME_PLUGINS=On .. -RUN cd build && make -j 16 -RUN cd build && ./bin/raja-perf.exe -sp +COPY . /home/raja/workspace +WORKDIR /home/raja/workspace/build +RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_WARNINGS_AS_ERRORS=On -DENABLE_COVERAGE=On -DRAJA_ENABLE_TBB=On .. && \ + make -j 6 &&\ + ctest -T test --output-on-failure -FROM axom/compilers:gcc-7 AS gcc7 +FROM ghcr.io/rse-ops/gcc-ubuntu-20.04:gcc-9.4.0 AS gcc9 ENV GTEST_COLOR=1 -COPY --chown=axom:axom . /home/axom/workspace -WORKDIR /home/axom/workspace -RUN mkdir build && cd build && cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_WARNINGS=On -DENABLE_OPENMP=On .. -RUN cd build && make -j 16 -RUN cd build && ./bin/raja-perf.exe -sp +COPY . /home/raja/workspace +WORKDIR /home/raja/workspace/build +RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_TBB=On -DRAJA_ENABLE_RUNTIME_PLUGINS=On .. && \ + make -j 6 &&\ + ctest -T test --output-on-failure -FROM axom/compilers:gcc-8 AS gcc8 +FROM ghcr.io/rse-ops/gcc-ubuntu-20.04:gcc-11.2.0 AS gcc11 ENV GTEST_COLOR=1 -COPY --chown=axom:axom . /home/axom/workspace -WORKDIR /home/axom/workspace -RUN mkdir build && cd build && cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_WARNINGS=On -DENABLE_OPENMP=On -DRAJA_ENABLE_BOUNDS_CHECK=ON .. -RUN cd build && make -j 16 -RUN cd build && ./bin/raja-perf.exe -sp +COPY . /home/raja/workspace +WORKDIR /home/raja/workspace/build +RUN cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_TBB=On -DRAJA_ENABLE_BOUNDS_CHECK=ON .. && \ + make -j 6 &&\ + ctest -T test --output-on-failure -FROM axom/compilers:clang-9 AS clang9 +FROM ghcr.io/rse-ops/clang-ubuntu-20.04:llvm-11.0.0 AS clang11 ENV GTEST_COLOR=1 -COPY --chown=axom:axom . /home/axom/workspace -WORKDIR /home/axom/workspace -RUN mkdir build && cd build && cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_FLAGS=-fmodules -DENABLE_OPENMP=On .. -RUN cd build && make -j 16 -RUN cd build && ./bin/raja-perf.exe -sp +COPY . /home/raja/workspace +WORKDIR /home/raja/workspace/build +RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DRAJA_ENABLE_TBB=On .. && \ + make -j 6 &&\ + ctest -T test --output-on-failure -FROM axom/compilers:clang-9 AS clang9-debug +FROM ghcr.io/rse-ops/clang-ubuntu-20.04:llvm-11.0.0 AS clang11-debug ENV GTEST_COLOR=1 -COPY --chown=axom:axom . 
/home/axom/workspace -WORKDIR /home/axom/workspace -RUN mkdir build && cd build && cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Debug -DENABLE_OPENMP=On -DCMAKE_CXX_FLAGS=-fsanitize=address .. -RUN cd build && make -j 16 -RUN cd build && ./bin/raja-perf.exe --checkrun -sp +COPY . /home/raja/workspace +WORKDIR /home/raja/workspace/build +RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Debug .. && \ + make -j 6 &&\ + ctest -T test --output-on-failure -FROM axom/compilers:nvcc-10.2 AS nvcc10 +FROM ghcr.io/rse-ops/clang-ubuntu-22.04:llvm-13.0.0 AS clang13 ENV GTEST_COLOR=1 -COPY --chown=axom:axom . /home/axom/workspace -WORKDIR /home/axom/workspace -RUN mkdir build && cd build && cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 .. -RUN cd build && make -j 2 -RUN cd build && ./bin/raja-perf.exe --dryrun +COPY . /home/raja/workspace +WORKDIR /home/raja/workspace/build +RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release .. && \ + make -j 6 &&\ + ctest -T test --output-on-failure -FROM axom/compilers:nvcc-10.2 AS nvcc10-debug +FROM ghcr.io/rse-ops/cuda:cuda-10.1.243-ubuntu-18.04 AS nvcc10 ENV GTEST_COLOR=1 -COPY --chown=axom:axom . /home/axom/workspace -WORKDIR /home/axom/workspace -RUN mkdir build && cd build && cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_BUILD_TYPE=Debug -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 .. -RUN cd build && make -j 2 -RUN cd build && ./bin/raja-perf.exe --dryrun +COPY . /home/raja/workspace +WORKDIR /home/raja/workspace/build +RUN . /opt/spack/share/spack/setup-env.sh && spack load cuda && \ + cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 -DCMAKE_CUDA_ARCHITECTURES=70 .. && \ + make -j 4 -FROM axom/compilers:rocm AS hip +FROM ghcr.io/rse-ops/cuda-ubuntu-20.04:cuda-11.1.1 AS nvcc11 +ENV GTEST_COLOR=1 +COPY . /home/raja/workspace +WORKDIR /home/raja/workspace/build +RUN . /opt/spack/share/spack/setup-env.sh && spack load cuda && \ + cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 -DCMAKE_CUDA_ARCHITECTURES=70 .. && \ + make -j 4 + +FROM ghcr.io/rse-ops/cuda-ubuntu-20.04:cuda-11.1.1 AS nvcc11-debug +ENV GTEST_COLOR=1 +COPY . /home/raja/workspace +WORKDIR /home/raja/workspace/build +RUN . /opt/spack/share/spack/setup-env.sh && spack load cuda && \ + cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 -DCMAKE_CUDA_ARCHITECTURES=70 .. && \ + make -j 4 + +FROM ghcr.io/rse-ops/hip-ubuntu-20.04:hip-4.3.1 AS hip ENV GTEST_COLOR=1 -COPY --chown=axom:axom . /home/axom/workspace -WORKDIR /home/axom/workspace ENV HCC_AMDGPU_TARGET=gfx900 -RUN mkdir build && cd build && cmake -DROCM_ROOT_DIR=/opt/rocm/include -DHIP_RUNTIME_INCLUDE_DIRS="/opt/rocm/include;/opt/rocm/hip/include" -DENABLE_HIP=On -DENABLE_OPENMP=Off -DENABLE_CUDA=Off -DENABLE_WARNINGS_AS_ERRORS=Off -DHIP_HIPCC_FLAGS=-fPIC .. -RUN cd build && make -j 16 -RUN cd build && ./bin/raja-perf.exe --dryrun +COPY . /home/raja/workspace +WORKDIR /home/raja/workspace/build +RUN . /opt/spack/share/spack/setup-env.sh && spack load hip llvm-amdgpu && \ + cmake -DCMAKE_CXX_COMPILER=amdclang++ -DRAJA_ENABLE_EXTERNAL_ROCPRIM=Off -DHIP_PATH=/opt -DENABLE_HIP=On -DENABLE_CUDA=Off -DRAJA_ENABLE_WARNINGS_AS_ERRORS=Off .. && \ + make -j 6 -FROM axom/compilers:oneapi AS sycl +FROM ghcr.io/rse-ops/intel-ubuntu-22.04:intel-2022.0.1 AS sycl ENV GTEST_COLOR=1 -COPY --chown=axom:axom . 
/home/axom/workspace -WORKDIR /home/axom/workspace -RUN /bin/bash -c "source /opt/intel/inteloneapi/setvars.sh && mkdir build && cd build && cmake -DCMAKE_CXX_COMPILER=dpcpp -DENABLE_SYCL=On .." -RUN /bin/bash -c "source /opt/intel/inteloneapi/setvars.sh && cd build && make -j 16" +COPY . /home/raja/workspace +WORKDIR /home/raja/workspace/build +RUN /bin/bash -c "source /opt/view/setvars.sh && \ + cmake -DCMAKE_CXX_COMPILER=dpcpp -DRAJA_ENABLE_SYCL=On -DENABLE_OPENMP=Off -DENABLE_ALL_WARNINGS=Off -DBLT_CXX_STD=c++17 .. && \ + make -j 6 &&\ + ctest -T test --output-on-failure" diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 9208fafa2..6b40fa89a 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -5,58 +5,71 @@ variables: COMPILER: 'g++' jobs: -#- job: Windows #Commenting out until windows builds are supported. -# strategy: -# matrix: -# shared: -# SHARED_ARGS: '-DBUILD_SHARED_LIBS=On -DCMAKE_CXX_FLAGS="/DRAJASHAREDDLL_EXPORTS" ' -# static: -# SHARED_ARGS: '-DBUILD_SHARED_LIBS=Off' -# pool: -# vmImage: 'windows-2019' -# variables: -# CMAKE_EXTRA_FLAGS: '-DENABLE_WARNINGS_AS_ERRORS=Off -DBLT_CXX_STD="" -DCMAKE_CXX_STANDARD=17' -# steps: -# - checkout: self -# clean: boolean -# submodules: recursive -# - task: CMake@1 -# inputs: -# workingDir: 'build' -# cmakeArgs: '$(CMAKE_EXTRA_FLAGS) $(SHARED_ARGS) ../' -# - task: CMake@1 -# inputs: -# workingDir: 'build' -# cmakeArgs: '--build . --config Release --verbose' +- job: Windows #temporarily commenting out until cmake/azure version issue resolved + strategy: + matrix: + shared: + SHARED_ARGS: '-DBUILD_SHARED_LIBS=On -DCMAKE_CXX_FLAGS="/DRAJASHAREDDLL_EXPORTS" ' + static: + SHARED_ARGS: '-DBUILD_SHARED_LIBS=Off' + pool: + vmImage: 'windows-2019' + variables: + CMAKE_EXTRA_FLAGS: '-DRAJA_ENABLE_WARNINGS_AS_ERRORS=Off -DBLT_CXX_STD="" -DCMAKE_CXX_STANDARD=17' + steps: + - checkout: self + clean: boolean + submodules: recursive + - task: CMake@1 + inputs: + workingDir: 'build' + cmakeArgs: '$(CMAKE_EXTRA_FLAGS) $(SHARED_ARGS) ../' + - task: CMake@1 + inputs: + workingDir: 'build' + cmakeArgs: '--build . 
--config Release --verbose -j 4' +# - task: CmdLine@2 +# inputs: +# script: 'ctest.exe -T test -C Release' +# workingDirectory: 'build' +# condition: eq( variables['Agent.OS'], 'Windows_NT') +# - task: PublishTestResults@2 +# inputs: +# testResultsFormat: 'cTest' +# testResultsFiles: '**/Test.xml' - job: Docker timeoutInMinutes: 360 strategy: matrix: - gcc5: - docker_target: gcc5 - gcc6: - docker_target: gcc6 gcc7: docker_target: gcc7 gcc8: docker_target: gcc8 - clang9: - docker_target: clang9 - clang9-debug: - docker_target: clang9-debug + gcc9: + docker_target: gcc9 + gcc11: + docker_target: gcc11 + clang11: + docker_target: clang11 + clang11-debug: + docker_target: clang11-debug + clang13: + docker_target: clang13 nvcc10: docker_target: nvcc10 - nvcc10-debug: - docker_target: nvcc10-debug + nvcc11: + docker_target: nvcc11 + nvcc11-debug: + docker_target: nvcc11-debug hip: docker_target: hip -# sycl: -# docker_target: sycl + sycl: + docker_target: sycl pool: vmImage: 'ubuntu-latest' variables: DOCKER_BUILDKIT: '1' - CMAKE_EXTRA_FLAGS: '-DENABLE_DEVELOPER_BENCHMARKS=On -DENABLE_DEVELOPER_DEFAULTS=On -DCMAKE_CXX_STANDARD=11' + CMAKE_EXTRA_FLAGS: '-DENABLE_DEVELOPER_BENCHMARKS=On -DENABLE_DEVELOPER_DEFAULTS=On -DCMAKE_CXX_STANDARD=14' steps: - checkout: self clean: boolean @@ -66,10 +79,21 @@ jobs: command: build dockerFile: 'Dockerfile' arguments: '--target $(docker_target)' + - script: | + CID=$(docker create llnl/raja:$(Build.BuildId)) + echo ${CID} + docker cp ${CID}:/home/axom/workspace/build local-build + docker rm ${CID} + displayName: 'Copy test artifacts' + condition: ne( variables['docker_target'], 'nvcc') - script: | bash <(curl -s https://raw.githubusercontent.com/codecov/codecov-bash/0b376529f626b50b7d4a9fb734e0e50d28b9b91e/codecov) >& /dev/null displayName: 'Upload code coverage' condition: eq( variables['docker_target'], 'gcc') + - task: PublishTestResults@2 + inputs: + testResultsFormat: 'cTest' + testResultsFiles: '**/Test.xml' - job: Mac pool: vmImage: 'macOS-latest' @@ -85,11 +109,15 @@ jobs: cmakeArgs: '$(CMAKE_EXTRA_FLAGS) ../' - script: | cd build - make + make -j 4 displayName: 'OSX Build' condition: eq( variables['Agent.OS'], 'Darwin') - - script: | - cd build - ./bin/raja-perf.exe - displayName: 'Run Perf Suite' - condition: eq( variables['Agent.OS'], 'Darwin') +# - script: | +# cd build +# ctest -T test --output-on-failure +# displayName: 'OSX Test' +# condition: eq( variables['Agent.OS'], 'Darwin') +# - task: PublishTestResults@2 +# inputs: +# testResultsFormat: 'cTest' +# testResultsFiles: '**/Test.xml' diff --git a/tpl/RAJA b/tpl/RAJA index 0506cea3a..9380c5cbd 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit 0506cea3aaad168de79df59a8df9fc6f27799aa3 +Subproject commit 9380c5cbdf26f7a1bda9dfe1a47cf0b8be916819 From a4120923fc5b8f0ed73fdb3fe0a85c48f3616ef2 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 23 Feb 2022 15:32:52 -0800 Subject: [PATCH 228/392] update BLT and build scripts --- blt | 2 +- scripts/lc-builds/blueos_nvcc_clang.sh | 1 - scripts/lc-builds/blueos_nvcc_xl.sh | 1 - scripts/lc-builds/blueos_xl.sh | 1 - scripts/lc-builds/blueos_xl_omptarget.sh | 1 - scripts/lc-builds/toss3_icpc.sh | 1 - scripts/lc-builds/toss4_amdclang.sh | 88 ++++++++++++++++++++++++ 7 files changed, 89 insertions(+), 6 deletions(-) create mode 100755 scripts/lc-builds/toss4_amdclang.sh diff --git a/blt b/blt index ddd5a0ca7..223512d34 160000 --- a/blt +++ b/blt @@ -1 +1 @@ -Subproject commit ddd5a0ca7c566d0ae14270b66625c8a363630ddb +Subproject commit 
223512d349713c071ef9ed2c4ae8b5c22ceabe27 diff --git a/scripts/lc-builds/blueos_nvcc_clang.sh b/scripts/lc-builds/blueos_nvcc_clang.sh index 96a8c1421..00d046afa 100755 --- a/scripts/lc-builds/blueos_nvcc_clang.sh +++ b/scripts/lc-builds/blueos_nvcc_clang.sh @@ -41,7 +41,6 @@ module load cmake/3.14.5 cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-${COMP_CLANG_VER}/bin/clang++ \ - -DBLT_CXX_STD=c++11 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DENABLE_CUDA=On \ diff --git a/scripts/lc-builds/blueos_nvcc_xl.sh b/scripts/lc-builds/blueos_nvcc_xl.sh index 1263c1412..9dc80a283 100755 --- a/scripts/lc-builds/blueos_nvcc_xl.sh +++ b/scripts/lc-builds/blueos_nvcc_xl.sh @@ -41,7 +41,6 @@ module load cmake/3.14.5 cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/xl/xl-${COMP_XL_VER}/bin/xlc++_r \ - -DBLT_CXX_STD=c++11 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DENABLE_CUDA=On \ diff --git a/scripts/lc-builds/blueos_xl.sh b/scripts/lc-builds/blueos_xl.sh index 6f4d961b6..6cb6b188d 100755 --- a/scripts/lc-builds/blueos_xl.sh +++ b/scripts/lc-builds/blueos_xl.sh @@ -34,7 +34,6 @@ module load cmake/3.14.5 cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/xl/xl-${COMP_VER}/bin/xlc++_r \ - -DBLT_CXX_STD=c++11 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ diff --git a/scripts/lc-builds/blueos_xl_omptarget.sh b/scripts/lc-builds/blueos_xl_omptarget.sh index 42c4b5844..1e54df0be 100755 --- a/scripts/lc-builds/blueos_xl_omptarget.sh +++ b/scripts/lc-builds/blueos_xl_omptarget.sh @@ -34,7 +34,6 @@ module load cmake/3.14.5 cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/xl/xl-${COMP_VER}/bin/xlc++_r \ - -DBLT_CXX_STD=c++11 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DENABLE_TARGET_OPENMP=On \ diff --git a/scripts/lc-builds/toss3_icpc.sh b/scripts/lc-builds/toss3_icpc.sh index 207a892b2..47ea2c846 100755 --- a/scripts/lc-builds/toss3_icpc.sh +++ b/scripts/lc-builds/toss3_icpc.sh @@ -48,7 +48,6 @@ cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/intel/intel-${COMP_VER}/bin/icpc \ -DCMAKE_C_COMPILER=/usr/tce/packages/intel/intel-${COMP_VER}/bin/icc \ - -DBLT_CXX_STD=c++11 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ diff --git a/scripts/lc-builds/toss4_amdclang.sh b/scripts/lc-builds/toss4_amdclang.sh new file mode 100755 index 000000000..31de03725 --- /dev/null +++ b/scripts/lc-builds/toss4_amdclang.sh @@ -0,0 +1,88 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +if [[ $# -lt 2 ]]; then + echo + echo "You must pass 2 or more arguments to the script (in this order): " + echo " 1) compiler version number" + echo " 2) HIP compute architecture" + echo " 3...) 
optional arguments to cmake" + echo + echo "For example: " + echo " toss4_amdclang.sh 4.1.0 gfx906" + exit +fi + +COMP_VER=$1 +COMP_ARCH=$2 +shift 2 + +MY_HIP_ARCH_FLAGS="--offload-arch=${COMP_ARCH}" +HOSTCONFIG="hip_3_X" + +if [[ ${COMP_VER} == 4.* ]] +then +##HIP_CLANG_FLAGS="${MY_HIP_ARCH_FLAGS} -mllvm -amdgpu-fixed-function-abi=1" + HOSTCONFIG="hip_4_link_X" +elif [[ ${COMP_VER} == 3.* ]] +then + HOSTCONFIG="hip_3_X" +else + echo "Unknown hip version, using ${HOSTCONFIG} host-config" +fi + +BUILD_SUFFIX=lc_toss4-amdclang-${COMP_VER}-${COMP_ARCH} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/${HOSTCONFIG}.cmake + +echo +echo "Creating build directory ${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" +echo + +rm -rf build_${BUILD_SUFFIX} >/dev/null +mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} + + +module load cmake/3.14.5 + +# unload rocm to avoid configuration problems where the loaded rocm and COMP_VER +# are inconsistent causing the rocprim from the module to be used unexpectedly +module unload rocm + + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DROCM_ROOT_DIR="/opt/rocm-${COMP_VER}" \ + -DHIP_ROOT_DIR="/opt/rocm-${COMP_VER}/hip" \ + -DHIP_PATH=/opt/rocm-${COMP_VER}/llvm/bin \ + -DCMAKE_C_COMPILER=/opt/rocm-${COMP_VER}/llvm/bin/amdclang \ + -DCMAKE_CXX_COMPILER=/opt/rocm-${COMP_VER}/llvm/bin/amdclang++ \ + -DCMAKE_HIP_ARCHITECTURES="${MY_HIP_ARCH_FLAGS}" \ + -C ${RAJA_HOSTCONFIG} \ + -DENABLE_HIP=ON \ + -DENABLE_OPENMP=OFF \ + -DENABLE_CUDA=OFF \ + -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ + "$@" \ + .. + +echo +echo "***********************************************************************" +echo +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA" +echo +echo " Please note that you have to have a consistent build environment" +echo " when you make RAJA as cmake may reconfigure; unload the rocm module" +echo " or load the appropriate rocm module (${COMP_VER}) when building." +echo +echo " module unload rocm" +echo " srun -n1 make" +echo +echo "***********************************************************************" From 8bebd6e9f645bfaae059a450e2d03dec621f7ba4 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 24 Feb 2022 13:15:56 -0800 Subject: [PATCH 229/392] Resolve cmake issues and compiler warnings (for now). --- CMakeLists.txt | 21 ++++++-------- src/apps/DIFFUSION3DPA-Cuda.cpp | 6 ++-- src/apps/DIFFUSION3DPA-Hip.cpp | 6 ++-- src/apps/DIFFUSION3DPA-OMP.cpp | 6 ++-- src/apps/DIFFUSION3DPA-Seq.cpp | 6 ++-- src/apps/DIFFUSION3DPA.hpp | 12 ++++---- src/apps/FEM_MACROS.hpp | 8 +++--- src/apps/MASS3DPA-Cuda.cpp | 2 +- src/apps/MASS3DPA-Hip.cpp | 2 +- src/apps/MASS3DPA-OMP.cpp | 2 +- src/apps/MASS3DPA-Seq.cpp | 2 +- src/apps/MASS3DPA.hpp | 50 ++++++++++++++++----------------- 12 files changed, 60 insertions(+), 63 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8d3f1f69c..fee2f82d3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,7 +14,7 @@ option(ENABLE_RAJA_SEQUENTIAL "Run sequential variants of RAJA kernels. Disable this, and all other variants, to run _only_ raw C loops." 
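For orientation, the usage message above maps to a short workflow; the version and architecture are just the example values the script itself suggests, and the module/srun steps restate its closing instructions:

  ./scripts/lc-builds/toss4_amdclang.sh 4.1.0 gfx906
  cd build_lc_toss4-amdclang-4.1.0-gfx906
  module unload rocm
  srun -n1 make

Any arguments after the first two are forwarded to cmake unchanged, so extra cache overrides can simply be appended to the same command line.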
On) # -# Initialize the BLT build system +# Note: the BLT build system is inheritted by RAJA and is initialized by RAJA # if (PERFSUITE_ENABLE_WARNINGS) @@ -25,8 +25,8 @@ set(ENABLE_TESTS Off CACHE BOOL "Enable BLT and RAJA tests") set(ENABLE_EXAMPLES Off CACHE BOOL "Enable RAJA examples") set(RAJA_ENABLE_EXERCISES Off CACHE BOOL "Enable RAJA exercises") -set(CMAKE_CXX_STANDARD 11) -set(BLT_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD 14) +set(BLT_CXX_STANDARD 14) include(blt/SetupBLT.cmake) @@ -60,14 +60,6 @@ get_property(RAJA_INCLUDE_DIRS DIRECTORY tpl/RAJA PROPERTY INCLUDE_DIRECTORIES) include_directories(${RAJA_INCLUDE_DIRS}) -# -# Setup variables to pass to Perf suite -# - -# -# These (hopefully temporary) macro constants are needed to work-around -# performance issues in the xl compiler. -# if (ENABLE_RAJA_SEQUENTIAL) add_definitions(-DRUN_RAJA_SEQ) endif () @@ -91,7 +83,12 @@ if (ENABLE_CUDA) list(APPEND RAJA_PERFSUITE_DEPENDS cuda) endif() if (ENABLE_HIP) - list(APPEND RAJA_PERFSUITE_DEPENDS hip) + message(STATUS "HIP version: ${hip_VERSION}") + if("${hip_VERSION}" VERSION_LESS "3.5") + message(FATAL_ERROR "Trying to use HIP/ROCm version ${hip_VERSION}. RAJA Perf Suite requires HIP/ROCm version 3.5 or newer. ") + endif() + list(APPEND RAJA_PERFSUITE_DEPENDS blt::hip) + list(APPEND RAJA_PERFSUITE_DEPENDS blt::hip_runtime) endif() set(RAJAPERF_BUILD_SYSTYPE $ENV{SYS_TYPE}) diff --git a/src/apps/DIFFUSION3DPA-Cuda.cpp b/src/apps/DIFFUSION3DPA-Cuda.cpp index fd4872bbc..6ef84bfec 100644 --- a/src/apps/DIFFUSION3DPA-Cuda.cpp +++ b/src/apps/DIFFUSION3DPA-Cuda.cpp @@ -7,7 +7,7 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// // Uncomment to add compiler directives for loop unrolling -//#define USE_RAJA_UNROLL +//#define USE_RAJAPERF_UNROLL #include "DIFFUSION3DPA.hpp" @@ -199,7 +199,7 @@ void DIFFUSION3DPA::runCudaVariant(VariantID vid) { ctx.teamSync(); RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 1), - [&](int dz) { + [&](int RAJA_UNUSED_ARG(dz)) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dy) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), @@ -271,7 +271,7 @@ void DIFFUSION3DPA::runCudaVariant(VariantID vid) { ctx.teamSync(); RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 1), - [&](int dz) { + [&](int RAJA_UNUSED_ARG(dz)) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int d) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), diff --git a/src/apps/DIFFUSION3DPA-Hip.cpp b/src/apps/DIFFUSION3DPA-Hip.cpp index ab6e2c734..221695dd5 100644 --- a/src/apps/DIFFUSION3DPA-Hip.cpp +++ b/src/apps/DIFFUSION3DPA-Hip.cpp @@ -7,7 +7,7 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// // Uncomment to add compiler directives for loop unrolling -//#define USE_RAJA_UNROLL +//#define USE_RAJAPERF_UNROLL #include "DIFFUSION3DPA.hpp" @@ -200,7 +200,7 @@ void DIFFUSION3DPA::runHipVariant(VariantID vid) { ctx.teamSync(); RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 1), - [&](int dz) { + [&](int RAJA_UNUSED_ARG(dz)) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dy) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), @@ -272,7 +272,7 @@ void DIFFUSION3DPA::runHipVariant(VariantID vid) { ctx.teamSync(); RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 1), - [&](int dz) { + [&](int RAJA_UNUSED_ARG(dz)) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int d) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), diff --git a/src/apps/DIFFUSION3DPA-OMP.cpp 
b/src/apps/DIFFUSION3DPA-OMP.cpp index e4195e9f6..29ff7b108 100644 --- a/src/apps/DIFFUSION3DPA-OMP.cpp +++ b/src/apps/DIFFUSION3DPA-OMP.cpp @@ -7,7 +7,7 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// // Uncomment to add compiler directives for loop unrolling -//#define USE_RAJA_UNROLL +//#define USE_RAJAPERF_UNROLL #include "DIFFUSION3DPA.hpp" @@ -182,7 +182,7 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 1), - [&](int dz) { + [&](int RAJA_UNUSED_ARG(dz)) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dy) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), @@ -254,7 +254,7 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 1), - [&](int dz) { + [&](int RAJA_UNUSED_ARG(dz)) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int d) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), diff --git a/src/apps/DIFFUSION3DPA-Seq.cpp b/src/apps/DIFFUSION3DPA-Seq.cpp index 21a7678ca..159f7810b 100644 --- a/src/apps/DIFFUSION3DPA-Seq.cpp +++ b/src/apps/DIFFUSION3DPA-Seq.cpp @@ -7,7 +7,7 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// // Uncomment to add compiler directives for loop unrolling -//#define USE_RAJA_UNROLL +//#define USE_RAJAPERF_UNROLL #include "DIFFUSION3DPA.hpp" @@ -180,7 +180,7 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 1), - [&](int dz) { + [&](int RAJA_UNUSED_ARG(dz)) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int dy) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), @@ -252,7 +252,7 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 1), - [&](int dz) { + [&](int RAJA_UNUSED_ARG(dz)) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](int d) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), diff --git a/src/apps/DIFFUSION3DPA.hpp b/src/apps/DIFFUSION3DPA.hpp index f251ee16e..80d034195 100644 --- a/src/apps/DIFFUSION3DPA.hpp +++ b/src/apps/DIFFUSION3DPA.hpp @@ -334,7 +334,7 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) #define DIFFUSION3DPA_3 \ double u = 0.0, v = 0.0; \ - RAJA_UNROLL(MD1) \ + RAJAPERF_UNROLL(MD1) \ for (int dx = 0; dx < DPA_D1D; ++dx) \ { \ const int i = qi(qx,dx,DPA_Q1D); \ @@ -351,7 +351,7 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) #define DIFFUSION3DPA_4 \ double u = 0.0, v = 0.0, w = 0.0; \ - RAJA_UNROLL(MD1) \ + RAJAPERF_UNROLL(MD1) \ for (int dy = 0; dy < DPA_D1D; ++dy) \ { \ const int i = qi(qy,dy,DPA_Q1D); \ @@ -369,7 +369,7 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) #define DIFFUSION3DPA_5 \ double u = 0.0, v = 0.0, w = 0.0; \ - RAJA_UNROLL(MD1) \ + RAJAPERF_UNROLL(MD1) \ for (int dz = 0; dz < DPA_D1D; ++dz) \ { \ const int i = qi(qz,dz,DPA_Q1D); \ @@ -407,7 +407,7 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) #define DIFFUSION3DPA_7 \ double u = 0.0, v = 0.0, w = 0.0; \ - RAJA_UNROLL(MQ1) \ + RAJAPERF_UNROLL(MQ1) \ for (int qx = 0; qx < DPA_Q1D; ++qx) \ { \ const int i = qi(qx,dx,DPA_Q1D); \ @@ -425,7 +425,7 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) #define DIFFUSION3DPA_8 \ double u = 0.0, v = 0.0, w = 0.0; \ - RAJA_UNROLL(DPA_Q1D) \ + RAJAPERF_UNROLL(DPA_Q1D) \ for (int qy = 0; qy < DPA_Q1D; ++qy) \ { \ const 
int i = qi(qy,dy,DPA_Q1D); \ @@ -443,7 +443,7 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) #define DIFFUSION3DPA_9 \ double u = 0.0, v = 0.0, w = 0.0; \ - RAJA_UNROLL(MQ1) \ + RAJAPERF_UNROLL(MQ1) \ for (int qz = 0; qz < DPA_Q1D; ++qz) \ { \ const int i = qi(qz,dz,DPA_Q1D); \ diff --git a/src/apps/FEM_MACROS.hpp b/src/apps/FEM_MACROS.hpp index 8a0b1b400..f94a6e1a9 100644 --- a/src/apps/FEM_MACROS.hpp +++ b/src/apps/FEM_MACROS.hpp @@ -8,11 +8,11 @@ #ifndef RAJAPerf_FEM_MACROS_HPP #define RAJAPerf_FEM_MACROS_HPP -#define RAJA_DIRECT_PRAGMA(X) _Pragma(#X) -#if defined(USE_RAJA_UNROLL) -#define RAJA_UNROLL(N) RAJA_DIRECT_PRAGMA(unroll(N)) +#define RAJAPERF_DIRECT_PRAGMA(X) _Pragma(#X) +#if defined(USE_RAJAPERF_UNROLL) +#define RAJAPERF_UNROLL(N) RAJAPERF_DIRECT_PRAGMA(unroll(N)) #else -#define RAJA_UNROLL(N) +#define RAJAPERF_UNROLL(N) #endif // Need two different host/device macros due to diff --git a/src/apps/MASS3DPA-Cuda.cpp b/src/apps/MASS3DPA-Cuda.cpp index 6354e01f4..2b4a682c4 100644 --- a/src/apps/MASS3DPA-Cuda.cpp +++ b/src/apps/MASS3DPA-Cuda.cpp @@ -7,7 +7,7 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// // Uncomment to add compiler directives loop unrolling -//#define USE_RAJA_UNROLL +//#define USE_RAJAPERF_UNROLL #include "MASS3DPA.hpp" diff --git a/src/apps/MASS3DPA-Hip.cpp b/src/apps/MASS3DPA-Hip.cpp index 185ad7bf6..780a93e41 100644 --- a/src/apps/MASS3DPA-Hip.cpp +++ b/src/apps/MASS3DPA-Hip.cpp @@ -7,7 +7,7 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// // Uncomment to add compiler directives loop unrolling -//#define USE_RAJA_UNROLL +//#define USE_RAJAPERF_UNROLL #include "MASS3DPA.hpp" diff --git a/src/apps/MASS3DPA-OMP.cpp b/src/apps/MASS3DPA-OMP.cpp index 95342832c..812440c26 100644 --- a/src/apps/MASS3DPA-OMP.cpp +++ b/src/apps/MASS3DPA-OMP.cpp @@ -7,7 +7,7 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// // Uncomment to add compiler directives loop unrolling -//#define USE_RAJA_UNROLL +//#define USE_RAJAPERF_UNROLL #include "MASS3DPA.hpp" diff --git a/src/apps/MASS3DPA-Seq.cpp b/src/apps/MASS3DPA-Seq.cpp index 710826042..01948159a 100644 --- a/src/apps/MASS3DPA-Seq.cpp +++ b/src/apps/MASS3DPA-Seq.cpp @@ -7,7 +7,7 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// // Uncomment to add compiler directives for loop unrolling -//#define USE_RAJA_UNROLL +//#define USE_RAJAPERF_UNROLL #include "MASS3DPA.hpp" diff --git a/src/apps/MASS3DPA.hpp b/src/apps/MASS3DPA.hpp index 76959f33b..24a64acab 100644 --- a/src/apps/MASS3DPA.hpp +++ b/src/apps/MASS3DPA.hpp @@ -213,7 +213,7 @@ Index_type NE = m_NE; double(*QDD)[MD1][MD1] = (double(*)[MD1][MD1])sm1; #define MASS3DPA_1 \ - RAJA_UNROLL(MD1) \ + RAJAPERF_UNROLL(MD1) \ for (int dz = 0; dz< MPA_D1D; ++dz) { \ Xsmem[dz][dy][dx] = X_(dx, dy, dz, e); \ } @@ -224,18 +224,18 @@ Xsmem[dz][dy][dx] = X_(dx, dy, dz, e); \ // 2 * MPA_D1D * MPA_D1D * MPA_D1D * MPA_Q1D #define MASS3DPA_3 \ double u[MPA_D1D]; \ -RAJA_UNROLL(MD1) \ +RAJAPERF_UNROLL(MD1) \ for (int dz = 0; dz < MPA_D1D; dz++) { \ u[dz] = 0; \ } \ -RAJA_UNROLL(MD1) \ +RAJAPERF_UNROLL(MD1) \ for (int dx = 0; dx < MPA_D1D; ++dx) { \ -RAJA_UNROLL(MD1) \ +RAJAPERF_UNROLL(MD1) \ for (int dz = 0; dz < MPA_D1D; ++dz) { \ u[dz] += Xsmem[dz][dy][dx] * Bsmem[qx][dx]; \ } \ } \ -RAJA_UNROLL(MD1) \ +RAJAPERF_UNROLL(MD1) \ for (int dz = 0; dz < MPA_D1D; ++dz) { \ DDQ[dz][dy][qx] = u[dz]; \ } @@ -243,18 +243,18 @@ DDQ[dz][dy][qx] = u[dz]; 
\ //2 * MPA_D1D * MPA_D1D * MPA_Q1D * MPA_Q1D #define MASS3DPA_4 \ double u[MPA_D1D]; \ - RAJA_UNROLL(MD1) \ + RAJAPERF_UNROLL(MD1) \ for (int dz = 0; dz < MPA_D1D; dz++) { \ u[dz] = 0; \ } \ - RAJA_UNROLL(MD1) \ + RAJAPERF_UNROLL(MD1) \ for (int dy = 0; dy < MPA_D1D; ++dy) { \ - RAJA_UNROLL(MD1) \ + RAJAPERF_UNROLL(MD1) \ for (int dz = 0; dz < MPA_D1D; dz++) { \ u[dz] += DDQ[dz][dy][qx] * Bsmem[qy][dy]; \ } \ } \ - RAJA_UNROLL(MD1) \ + RAJAPERF_UNROLL(MD1) \ for (int dz = 0; dz < MPA_D1D; dz++) { \ DQQ[dz][qy][qx] = u[dz]; \ } @@ -262,18 +262,18 @@ DDQ[dz][dy][qx] = u[dz]; \ //2 * MPA_D1D * MPA_Q1D * MPA_Q1D * MPA_Q1D + MPA_Q1D * MPA_Q1D * MPA_Q1D #define MASS3DPA_5 \ double u[MPA_Q1D]; \ - RAJA_UNROLL(MQ1) \ + RAJAPERF_UNROLL(MQ1) \ for (int qz = 0; qz < MPA_Q1D; qz++) { \ u[qz] = 0; \ } \ - RAJA_UNROLL(MD1) \ + RAJAPERF_UNROLL(MD1) \ for (int dz = 0; dz < MPA_D1D; ++dz) { \ - RAJA_UNROLL(MQ1) \ + RAJAPERF_UNROLL(MQ1) \ for (int qz = 0; qz < MPA_Q1D; qz++) { \ u[qz] += DQQ[dz][qy][qx] * Bsmem[qz][dz]; \ } \ } \ - RAJA_UNROLL(MQ1) \ + RAJAPERF_UNROLL(MQ1) \ for (int qz = 0; qz < MPA_Q1D; qz++) { \ QQQ[qz][qy][qx] = u[qz] * D_(qx, qy, qz, e); \ } @@ -284,18 +284,18 @@ DDQ[dz][dy][qx] = u[dz]; \ //2 * MPA_Q1D * MPA_Q1D * MPA_Q1D * MPA_D1D #define MASS3DPA_7 \ double u[MPA_Q1D]; \ -RAJA_UNROLL(MQ1) \ +RAJAPERF_UNROLL(MQ1) \ for (int qz = 0; qz < MPA_Q1D; ++qz) { \ u[qz] = 0; \ } \ -RAJA_UNROLL(MQ1) \ +RAJAPERF_UNROLL(MQ1) \ for (int qx = 0; qx < MPA_Q1D; ++qx) { \ - RAJA_UNROLL(MQ1) \ + RAJAPERF_UNROLL(MQ1) \ for (int qz = 0; qz < MPA_Q1D; ++qz) { \ u[qz] += QQQ[qz][qy][qx] * Btsmem[dx][qx]; \ } \ } \ -RAJA_UNROLL(MQ1) \ +RAJAPERF_UNROLL(MQ1) \ for (int qz = 0; qz < MPA_Q1D; ++qz) { \ QQD[qz][qy][dx] = u[qz]; \ } @@ -303,18 +303,18 @@ for (int qz = 0; qz < MPA_Q1D; ++qz) { \ // 2 * MPA_Q1D * MPA_Q1D * MPA_D1D * MPA_D1D #define MASS3DPA_8 \ double u[MPA_Q1D]; \ - RAJA_UNROLL(MQ1) \ + RAJAPERF_UNROLL(MQ1) \ for (int qz = 0; qz < MPA_Q1D; ++qz) { \ u[qz] = 0; \ } \ - RAJA_UNROLL(MQ1) \ + RAJAPERF_UNROLL(MQ1) \ for (int qy = 0; qy < MPA_Q1D; ++qy) { \ - RAJA_UNROLL(MQ1) \ + RAJAPERF_UNROLL(MQ1) \ for (int qz = 0; qz < MPA_Q1D; ++qz) { \ u[qz] += QQD[qz][qy][dx] * Btsmem[dy][qy]; \ } \ } \ - RAJA_UNROLL(MQ1) \ + RAJAPERF_UNROLL(MQ1) \ for (int qz = 0; qz < MPA_Q1D; ++qz) { \ QDD[qz][dy][dx] = u[qz]; \ } @@ -322,18 +322,18 @@ for (int qz = 0; qz < MPA_Q1D; ++qz) { \ //2 * MPA_Q1D * MPA_D1D * MPA_D1D * MPA_D1D + MPA_D1D * MPA_D1D * MPA_D1D #define MASS3DPA_9 \ double u[MPA_D1D]; \ - RAJA_UNROLL(MD1) \ + RAJAPERF_UNROLL(MD1) \ for (int dz = 0; dz < MPA_D1D; ++dz) { \ u[dz] = 0; \ } \ - RAJA_UNROLL(MQ1) \ + RAJAPERF_UNROLL(MQ1) \ for (int qz = 0; qz < MPA_Q1D; ++qz) { \ - RAJA_UNROLL(MD1) \ + RAJAPERF_UNROLL(MD1) \ for (int dz = 0; dz < MPA_D1D; ++dz) { \ u[dz] += QDD[qz][dy][dx] * Btsmem[dz][qz]; \ } \ } \ - RAJA_UNROLL(MD1) \ + RAJAPERF_UNROLL(MD1) \ for (int dz = 0; dz < MPA_D1D; ++dz) { \ Y_(dx, dy, dz, e) += u[dz]; \ } From 3f748eb77e4efbfc32bac5e53247255848e53702 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 1 Mar 2022 09:15:24 -0800 Subject: [PATCH 230/392] Use braced init list to guarantee sequencing in seq_for --- src/common/GPUUtils.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp index 3dadda4b8..9f83cb804 100644 --- a/src/common/GPUUtils.hpp +++ b/src/common/GPUUtils.hpp @@ -159,7 +159,9 @@ using list_type = template void seq_for(camp::int_seq, Func&& func) { - camp::sink((func(camp::integral_constant{}), 
0)...); + // braced init lists are evaluated in order + int seq_unused_array[] = {(func(camp::integral_constant{}), 0)...}; + RAJA_UNUSED_VAR(seq_unused_array); } template void seq_for(Func&& func) From 7511471a87c2590df09ae4525b6651ed0f0c2731 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 1 Mar 2022 09:27:20 -0800 Subject: [PATCH 231/392] Fix unused arg warnings --- src/algorithm/SORT.hpp | 2 +- src/algorithm/SORTPAIRS.hpp | 2 +- src/apps/WIP-COUPLE.hpp | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/algorithm/SORT.hpp b/src/algorithm/SORT.hpp index 4eb5909bb..46555ca94 100644 --- a/src/algorithm/SORT.hpp +++ b/src/algorithm/SORT.hpp @@ -50,7 +50,7 @@ class SORT : public KernelBase void runOpenMPVariant(VariantID vid, size_t tid); void runCudaVariant(VariantID vid, size_t tid); void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid) + void runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { getCout() << "\n SORT : Unknown OMP Target variant id = " << vid << std::endl; } diff --git a/src/algorithm/SORTPAIRS.hpp b/src/algorithm/SORTPAIRS.hpp index 1c4406141..e965446f5 100644 --- a/src/algorithm/SORTPAIRS.hpp +++ b/src/algorithm/SORTPAIRS.hpp @@ -49,7 +49,7 @@ class SORTPAIRS : public KernelBase void runOpenMPVariant(VariantID vid, size_t tid); void runCudaVariant(VariantID vid, size_t tid); void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid) + void runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { getCout() << "\n SORTPAIRS : Unknown OMP Target variant id = " << vid << std::endl; } diff --git a/src/apps/WIP-COUPLE.hpp b/src/apps/WIP-COUPLE.hpp index 3caa1bbb6..e87290244 100644 --- a/src/apps/WIP-COUPLE.hpp +++ b/src/apps/WIP-COUPLE.hpp @@ -166,11 +166,11 @@ class COUPLE : public KernelBase void updateChecksum(VariantID vid, size_t tid); void tearDown(VariantID vid, size_t tid); - void runSeqVariant(VariantID vid, size_t tid) {(void) vid;} - void runOpenMPVariant(VariantID vid, size_t tid) {(void) vid;} - void runCudaVariant(VariantID vid, size_t tid) {(void) vid;} - void runHipVariant(VariantID vid, size_t tid) {(void) vid;} - void runOpenMPTargetVariant(VariantID vid, size_t tid) {(void) vid;} + void runSeqVariant(VariantID vid, size_t /*tid*/) {(void) vid;} + void runOpenMPVariant(VariantID vid, size_t /*tid*/) {(void) vid;} + void runCudaVariant(VariantID vid, size_t /*tid*/) {(void) vid;} + void runHipVariant(VariantID vid, size_t /*tid*/) {(void) vid;} + void runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) {(void) vid;} private: Complex_ptr m_t0; From 7fc972de458bf5613d71cd61034112ef761b57bc Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 4 Mar 2022 13:39:21 -0800 Subject: [PATCH 232/392] Fix resource use in DEL_DOT_VEC_2D --- src/apps/DEL_DOT_VEC_2D-Cuda.cpp | 2 +- src/apps/DEL_DOT_VEC_2D-Hip.cpp | 2 +- src/apps/DEL_DOT_VEC_2D-OMP.cpp | 2 +- src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp | 2 +- src/apps/DEL_DOT_VEC_2D-Seq.cpp | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/apps/DEL_DOT_VEC_2D-Cuda.cpp b/src/apps/DEL_DOT_VEC_2D-Cuda.cpp index 0321add45..1eb095a66 100644 --- a/src/apps/DEL_DOT_VEC_2D-Cuda.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Cuda.cpp @@ -139,7 +139,7 @@ void DEL_DOT_VEC_2D::runCudaVariantImpl(VariantID vid) NDSET2D(m_domain->jp, xdot,fx1,fx2,fx3,fx4) ; NDSET2D(m_domain->jp, ydot,fy1,fy2,fy3,fy4) ; - camp::resources::Resource working_res{camp::resources::Cuda()}; + camp::resources::Resource 
working_res{camp::resources::Cuda::get_default()}; RAJA::TypedListSegment zones(m_domain->real_zones, m_domain->n_real_zones, working_res); diff --git a/src/apps/DEL_DOT_VEC_2D-Hip.cpp b/src/apps/DEL_DOT_VEC_2D-Hip.cpp index 9c726e382..445277a34 100644 --- a/src/apps/DEL_DOT_VEC_2D-Hip.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Hip.cpp @@ -141,7 +141,7 @@ void DEL_DOT_VEC_2D::runHipVariantImpl(VariantID vid) NDSET2D(m_domain->jp, xdot,fx1,fx2,fx3,fx4) ; NDSET2D(m_domain->jp, ydot,fy1,fy2,fy3,fy4) ; - camp::resources::Resource working_res{camp::resources::Hip()}; + camp::resources::Resource working_res{camp::resources::Hip::get_default()}; RAJA::TypedListSegment zones(m_domain->real_zones, m_domain->n_real_zones, working_res); diff --git a/src/apps/DEL_DOT_VEC_2D-OMP.cpp b/src/apps/DEL_DOT_VEC_2D-OMP.cpp index 2b3c5b2e3..f804acb44 100644 --- a/src/apps/DEL_DOT_VEC_2D-OMP.cpp +++ b/src/apps/DEL_DOT_VEC_2D-OMP.cpp @@ -79,7 +79,7 @@ void DEL_DOT_VEC_2D::runOpenMPVariant(VariantID vid, size_t /*tid*/) case RAJA_OpenMP : { - camp::resources::Resource working_res{camp::resources::Host()}; + camp::resources::Resource working_res{camp::resources::Host::get_default()}; RAJA::TypedListSegment zones(m_domain->real_zones, m_domain->n_real_zones, working_res); diff --git a/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp b/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp index 424d940aa..492e02a60 100644 --- a/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp +++ b/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp @@ -94,7 +94,7 @@ void DEL_DOT_VEC_2D::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) NDSET2D(m_domain->jp, xdot,fx1,fx2,fx3,fx4) ; NDSET2D(m_domain->jp, ydot,fy1,fy2,fy3,fy4) ; - camp::resources::Resource working_res{camp::resources::Omp()}; + camp::resources::Resource working_res{camp::resources::Omp::get_default()}; RAJA::TypedListSegment zones(m_domain->real_zones, m_domain->n_real_zones, working_res); diff --git a/src/apps/DEL_DOT_VEC_2D-Seq.cpp b/src/apps/DEL_DOT_VEC_2D-Seq.cpp index 9be1cc853..f81e2d741 100644 --- a/src/apps/DEL_DOT_VEC_2D-Seq.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Seq.cpp @@ -76,7 +76,7 @@ void DEL_DOT_VEC_2D::runSeqVariant(VariantID vid, size_t /*tid*/) case RAJA_Seq : { - camp::resources::Resource working_res{camp::resources::Host()}; + camp::resources::Resource working_res{camp::resources::Host::get_default()}; RAJA::TypedListSegment zones(m_domain->real_zones, m_domain->n_real_zones, working_res); From 96201be65a796e38cedcaf78cb68aa1d1d85f6dd Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 8 Mar 2022 10:03:35 -0800 Subject: [PATCH 233/392] Update CI stuff to match what RAJA is doing --- .gitlab-ci.yml | 12 ++++++------ .gitlab/corona-jobs.yml | 18 ++++-------------- .gitlab/corona-templates.yml | 2 +- .gitlab/lassen-jobs.yml | 8 ++++---- scripts/gitlab/build_and_test.sh | 16 +++++++++++++--- scripts/radiuss-spack-configs | 2 +- 6 files changed, 29 insertions(+), 29 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 77393bc9a..32d794b64 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -51,7 +51,7 @@ stages: - r_release_resources - l_build_and_test - b_build_and_test -# - c_build_and_test + - c_build_and_test - multi_project # This is the rules that drives the activation of "advanced" jobs. 
All advanced @@ -73,9 +73,9 @@ stages: reports: junit: junit.xml -#.build_toss_3_x86_64_ib_corona_script: -# script: -# - srun -p mi60 -t 30 -N 1 scripts/gitlab/build_and_test.sh +.build_toss_4_x86_64_ib_corona_script: + script: + - srun -p pbatch -t 30 -N 1 scripts/gitlab/build_and_test.sh # Lassen and Butte use a different job scheduler (spectrum lsf) that does not # allow pre-allocation the same way slurm does. @@ -117,5 +117,5 @@ include: - local: .gitlab/ruby-jobs.yml - local: .gitlab/lassen-templates.yml - local: .gitlab/lassen-jobs.yml -# - local: .gitlab/corona-templates.yml -# - local: .gitlab/corona-jobs.yml + - local: .gitlab/corona-templates.yml + - local: .gitlab/corona-jobs.yml diff --git a/.gitlab/corona-jobs.yml b/.gitlab/corona-jobs.yml index 2a60c9c64..d5e72f6fe 100644 --- a/.gitlab/corona-jobs.yml +++ b/.gitlab/corona-jobs.yml @@ -5,22 +5,12 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################# -hip_4_1_gcc_8_1_0 (build and test on corona): +hip_4_5_2_clang_13_0_0 (build and test on corona): variables: - SPEC: "+rocm~openmp amdgpu_target=gfx906 %gcc@8.1.0 cxxflags='-finline-functions -finline-limit=20000' cflags='-finline-functions -finline-limit=20000' ^hip@4.1.0" + SPEC: "+rocm~openmp amdgpu_target=gfx906 %clang@13.0.0 cxxflags=--offload-arch=gfx906 ^blt@develop ^hip@4.5.2" extends: .build_and_test_on_corona -hip_4_2_gcc_8_1_0 (build and test on corona): +hip_4_5_2_clang_13_0_0_desul_atomics (build and test on corona): variables: - SPEC: "+rocm~openmp amdgpu_target=gfx906 %gcc@8.1.0 cxxflags='-finline-functions -finline-limit=20000' cflags='-finline-functions -finline-limit=20000' ^hip@4.2.0" - extends: .build_and_test_on_corona - -hip_4_1_clang_9_0_0 (build and test on corona): - variables: - SPEC: "+rocm~openmp amdgpu_target=gfx906 %clang@9.0.0 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 ^hip@4.1.0" - extends: .build_and_test_on_corona - -hip_4_1_gcc_8_1_0_desul_atomics (build and test on corona): - variables: - SPEC: "+rocm~openmp +desul amdgpu_target=gfx906 %gcc@8.1.0 cxxflags='-finline-functions -finline-limit=20000' cflags='-finline-functions -finline-limit=20000' ^hip@4.1.0" + SPEC: "+rocm~openmp +desul amdgpu_target=gfx906 %clang@13.0.0 cxxflags=--offload-arch=gfx906 ^blt@develop ^hip@4.5.2" extends: .build_and_test_on_corona diff --git a/.gitlab/corona-templates.yml b/.gitlab/corona-templates.yml index 4c4ce2883..4e1a5cb74 100644 --- a/.gitlab/corona-templates.yml +++ b/.gitlab/corona-templates.yml @@ -25,7 +25,7 @@ # Generic corona build job, extending build script .build_and_test_on_corona: stage: c_build_and_test - extends: [.build_toss_3_x86_64_ib_corona_script, .on_corona] + extends: [.build_toss_4_x86_64_ib_corona_script, .on_corona] needs: [] .build_and_test_on_corona_advanced: diff --git a/.gitlab/lassen-jobs.yml b/.gitlab/lassen-jobs.yml index ccdf55c85..8b5d07099 100644 --- a/.gitlab/lassen-jobs.yml +++ b/.gitlab/lassen-jobs.yml @@ -24,15 +24,15 @@ gcc_8_3_1: SPEC: "%gcc@8.3.1 cxxflags='-finline-functions -finline-limit=20000' cflags='-finline-functions -finline-limit=20000'" extends: .build_and_test_on_lassen -xl_16_1_1_7: +xl_16_1_1_11: variables: - SPEC: "%xl@16.1.1.7 cxxflags='-qthreaded -std=c++14 -O3 -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036'" + SPEC: "%xl@16.1.1.11 cxxflags='-qthreaded -std=c++14 -O3 -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp 
-qsuppress=1500-029 -qsuppress=1500-036'" DEFAULT_TIME: 50 extends: .build_and_test_on_lassen -xl_16_1_1_7_gcc_8_3_1: +xl_16_1_1_11_gcc_8_3_1: variables: - SPEC: "%xl@16.1.1.7 cxxflags='--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O3 -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036' cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1" + SPEC: "%xl@16.1.1.11 cxxflags='--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O3 -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036' cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1" DEFAULT_TIME: 50 extends: .build_and_test_on_lassen diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index 36b080699..b7246db7f 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -2,7 +2,7 @@ ############################################################################### # Copyright (c) 2016-21, Lawrence Livermore National Security, LLC -# and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +# and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### @@ -56,7 +56,7 @@ then prefix_opt="--prefix=${prefix}" fi - python scripts/uberenv/uberenv.py --spec="${spec}" ${prefix_opt} + python3 scripts/uberenv/uberenv.py --spec="${spec}" ${prefix_opt} fi date @@ -104,6 +104,10 @@ then echo "~ Build Dir: ${build_dir}" echo "~ Project Dir: ${project_dir}" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~~~~ ENV ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" echo "~~~~~ Building RAJA PerfSuite" @@ -123,7 +127,7 @@ then echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" echo "~~~~ Updating Submodules within RAJA ~~~~~~" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - git submodule init && git submodule update --recursive + git submodule update --init --recursive cd - fi @@ -135,6 +139,12 @@ then mkdir -p ${build_dir} && cd ${build_dir} date + + if [[ "${truehostname}" == "corona" ]] + then + module unload rocm + fi + cmake \ -C ${hostconfig_path} \ ${project_dir} diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index 56be82d3f..7759bb0f9 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit 56be82d3f644fef0870da3272d03b916f89c53c7 +Subproject commit 7759bb0f9f04fee0b85e9afc065cf2b5445c849e From c7f987abca75c8ad6b14efbe30dcdb671342d7ed Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 8 Mar 2022 10:23:49 -0800 Subject: [PATCH 234/392] Attempt to update azure CI --- azure-pipelines.yml | 81 +++++++++++++++++++++------------------------ 1 file changed, 38 insertions(+), 43 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 6b40fa89a..65c3d3c89 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -5,29 +5,29 @@ variables: COMPILER: 'g++' jobs: -- job: Windows #temporarily commenting out until cmake/azure version issue resolved - strategy: - matrix: - shared: - SHARED_ARGS: '-DBUILD_SHARED_LIBS=On -DCMAKE_CXX_FLAGS="/DRAJASHAREDDLL_EXPORTS" ' - static: - SHARED_ARGS: '-DBUILD_SHARED_LIBS=Off' - pool: - 
vmImage: 'windows-2019' - variables: - CMAKE_EXTRA_FLAGS: '-DRAJA_ENABLE_WARNINGS_AS_ERRORS=Off -DBLT_CXX_STD="" -DCMAKE_CXX_STANDARD=17' - steps: - - checkout: self - clean: boolean - submodules: recursive - - task: CMake@1 - inputs: - workingDir: 'build' - cmakeArgs: '$(CMAKE_EXTRA_FLAGS) $(SHARED_ARGS) ../' - - task: CMake@1 - inputs: - workingDir: 'build' - cmakeArgs: '--build . --config Release --verbose -j 4' +#- job: Windows #temporarily commenting out until cmake/azure version issue resolved +# strategy: +# matrix: +# shared: +# SHARED_ARGS: '-DBUILD_SHARED_LIBS=On -DCMAKE_CXX_FLAGS="/DRAJASHAREDDLL_EXPORTS" ' +# static: +# SHARED_ARGS: '-DBUILD_SHARED_LIBS=Off' +# pool: +# vmImage: 'windows-2019' +# variables: +# CMAKE_EXTRA_FLAGS: '-DRAJA_ENABLE_WARNINGS_AS_ERRORS=Off -DBLT_CXX_STD="" -DCMAKE_CXX_STANDARD=17' +# steps: +# - checkout: self +# clean: boolean +# submodules: recursive +# - task: CMake@1 +# inputs: +# workingDir: 'build' +# cmakeArgs: '$(CMAKE_EXTRA_FLAGS) $(SHARED_ARGS) ../' +# - task: CMake@1 +# inputs: +# workingDir: 'build' +# cmakeArgs: '--build . --config Release --verbose -j 4' # - task: CmdLine@2 # inputs: # script: 'ctest.exe -T test -C Release' @@ -63,8 +63,8 @@ jobs: docker_target: nvcc11-debug hip: docker_target: hip - sycl: - docker_target: sycl +# sycl: +# docker_target: sycl pool: vmImage: 'ubuntu-latest' variables: @@ -84,16 +84,15 @@ jobs: echo ${CID} docker cp ${CID}:/home/axom/workspace/build local-build docker rm ${CID} - displayName: 'Copy test artifacts' + - script: | + cd build + make -j + displayName: 'Build Perf Suite' + - script: | + cd build + ./bin/raja-perf.exe --checkrun -sp + displayName: 'Run Perf Suite' condition: ne( variables['docker_target'], 'nvcc') - - script: | - bash <(curl -s https://raw.githubusercontent.com/codecov/codecov-bash/0b376529f626b50b7d4a9fb734e0e50d28b9b91e/codecov) >& /dev/null - displayName: 'Upload code coverage' - condition: eq( variables['docker_target'], 'gcc') - - task: PublishTestResults@2 - inputs: - testResultsFormat: 'cTest' - testResultsFiles: '**/Test.xml' - job: Mac pool: vmImage: 'macOS-latest' @@ -112,12 +111,8 @@ jobs: make -j 4 displayName: 'OSX Build' condition: eq( variables['Agent.OS'], 'Darwin') -# - script: | -# cd build -# ctest -T test --output-on-failure -# displayName: 'OSX Test' -# condition: eq( variables['Agent.OS'], 'Darwin') -# - task: PublishTestResults@2 -# inputs: -# testResultsFormat: 'cTest' -# testResultsFiles: '**/Test.xml' + - script: | + cd build + ./bin/raja-perf.exe --checkrun -sp + displayName: 'Run Perf Suite' + condition: eq( variables['Agent.OS'], 'Darwin') From 27730691605449e4483b318d42b1a6cdc6c30226 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 8 Mar 2022 10:27:04 -0800 Subject: [PATCH 235/392] Fix indentation --- azure-pipelines.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 65c3d3c89..b043cbf2a 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -84,11 +84,11 @@ jobs: echo ${CID} docker cp ${CID}:/home/axom/workspace/build local-build docker rm ${CID} - - script: | + - script: | cd build make -j displayName: 'Build Perf Suite' - - script: | + - script: | cd build ./bin/raja-perf.exe --checkrun -sp displayName: 'Run Perf Suite' From f0e7441358413c67a34d766060d4cda3ea334a89 Mon Sep 17 00:00:00 2001 From: Mike Date: Tue, 8 Mar 2022 11:52:30 -0800 Subject: [PATCH 236/392] Dockerfile runs raja-perf.exe --- Dockerfile | 16 ++++++++-------- azure-pipelines.yml | 14 
-------------- 2 files changed, 8 insertions(+), 22 deletions(-) diff --git a/Dockerfile b/Dockerfile index 0e03f9694..38637f5ca 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,7 +11,7 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_TBB=On -DRAJA_DEPRECATED_TESTS=On .. && \ make -j 6 &&\ - ctest -T test --output-on-failure + ./bin/raja-perf.exe -sp FROM ghcr.io/rse-ops/gcc-ubuntu-20.04:gcc-8.1.0 AS gcc8 ENV GTEST_COLOR=1 @@ -19,7 +19,7 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_WARNINGS_AS_ERRORS=On -DENABLE_COVERAGE=On -DRAJA_ENABLE_TBB=On .. && \ make -j 6 &&\ - ctest -T test --output-on-failure + ./bin/raja-perf.exe -sp FROM ghcr.io/rse-ops/gcc-ubuntu-20.04:gcc-9.4.0 AS gcc9 ENV GTEST_COLOR=1 @@ -27,7 +27,7 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_TBB=On -DRAJA_ENABLE_RUNTIME_PLUGINS=On .. && \ make -j 6 &&\ - ctest -T test --output-on-failure + ./bin/raja-perf.exe -sp FROM ghcr.io/rse-ops/gcc-ubuntu-20.04:gcc-11.2.0 AS gcc11 ENV GTEST_COLOR=1 @@ -35,7 +35,7 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_TBB=On -DRAJA_ENABLE_BOUNDS_CHECK=ON .. && \ make -j 6 &&\ - ctest -T test --output-on-failure + ./bin/raja-perf.exe -sp FROM ghcr.io/rse-ops/clang-ubuntu-20.04:llvm-11.0.0 AS clang11 ENV GTEST_COLOR=1 @@ -43,7 +43,7 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DRAJA_ENABLE_TBB=On .. && \ make -j 6 &&\ - ctest -T test --output-on-failure + ./bin/raja-perf.exe -sp FROM ghcr.io/rse-ops/clang-ubuntu-20.04:llvm-11.0.0 AS clang11-debug ENV GTEST_COLOR=1 @@ -51,7 +51,7 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Debug .. && \ make -j 6 &&\ - ctest -T test --output-on-failure + ./bin/raja-perf.exe -sp FROM ghcr.io/rse-ops/clang-ubuntu-22.04:llvm-13.0.0 AS clang13 ENV GTEST_COLOR=1 @@ -59,7 +59,7 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release .. && \ make -j 6 &&\ - ctest -T test --output-on-failure + ./bin/raja-perf.exe -sp FROM ghcr.io/rse-ops/cuda:cuda-10.1.243-ubuntu-18.04 AS nvcc10 ENV GTEST_COLOR=1 @@ -101,4 +101,4 @@ WORKDIR /home/raja/workspace/build RUN /bin/bash -c "source /opt/view/setvars.sh && \ cmake -DCMAKE_CXX_COMPILER=dpcpp -DRAJA_ENABLE_SYCL=On -DENABLE_OPENMP=Off -DENABLE_ALL_WARNINGS=Off -DBLT_CXX_STD=c++17 .. 
&& \ make -j 6 &&\ - ctest -T test --output-on-failure" + ./bin/raja-perf.exe -sp" diff --git a/azure-pipelines.yml b/azure-pipelines.yml index b043cbf2a..2dc813310 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -79,20 +79,6 @@ jobs: command: build dockerFile: 'Dockerfile' arguments: '--target $(docker_target)' - - script: | - CID=$(docker create llnl/raja:$(Build.BuildId)) - echo ${CID} - docker cp ${CID}:/home/axom/workspace/build local-build - docker rm ${CID} - - script: | - cd build - make -j - displayName: 'Build Perf Suite' - - script: | - cd build - ./bin/raja-perf.exe --checkrun -sp - displayName: 'Run Perf Suite' - condition: ne( variables['docker_target'], 'nvcc') - job: Mac pool: vmImage: 'macOS-latest' From 79cbce97f554067a5e077a9d3b0e5c9a6327a3ac Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 8 Mar 2022 13:10:06 -0800 Subject: [PATCH 237/392] Try fixing C++ std --- CMakeLists.txt | 55 +++++++++++++++++++-- scripts/spack_packages/raja_perf/package.py | 12 ++--- 2 files changed, 55 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fee2f82d3..8d60b7c67 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,10 +25,59 @@ set(ENABLE_TESTS Off CACHE BOOL "Enable BLT and RAJA tests") set(ENABLE_EXAMPLES Off CACHE BOOL "Enable RAJA examples") set(RAJA_ENABLE_EXERCISES Off CACHE BOOL "Enable RAJA exercises") -set(CMAKE_CXX_STANDARD 14) -set(BLT_CXX_STANDARD 14) +include(CheckCXXCompilerFlag) +if(NOT DEFINED BLT_CXX_STD) + if("cxx_std_17" IN_LIST CMAKE_CXX_KNOWN_FEATURES) + set(BLT_CXX_STD c++17 CACHE STRING "Version of C++ standard") + message("Using C++ standard: ${BLT_CXX_STD}") + elseif("cxx_std_14" IN_LIST CMAKE_CXX_KNOWN_FEATURES) + set(BLT_CXX_STD c++14 CACHE STRING "Version of C++ standard") + message("Using C++ standard: ${BLT_CXX_STD}") + elseif("${CMAKE_CXX_COMPILER_ID}" IN_LIST COMPILERS_KNOWN_TO_CMAKE33) + set(BLT_CXX_STD c++14 CACHE STRING "Version of C++ standard") + message("Using C++ standard: ${BLT_CXX_STD}") + else() #cmake has no idea what to do, do it ourselves... + foreach(flag_var "c++17" "c++14" "c++11") + CHECK_CXX_COMPILER_FLAG("-std=${flag_var}" COMPILER_SUPPORTS_${flag_var}) + if(COMPILER_SUPPORTS_${flag_var}) + set(BLT_CXX_STD ${flag_var} CACHE STRING "Version of C++ standard") + message("Using C++ standard: ${BLT_CXX_STD}") + break() + endif() + endforeach(flag_var) + endif() +else() #check BLT_CXX_STD is high enough by disallowing the only invalid option + if("${BLT_CXX_STD}" STREQUAL "c++98") + message(FATAL_ERROR "RAJA requires minimum C++ standard of c++11") + endif() +endif(NOT DEFINED BLT_CXX_STD) +if (RAJA_ENABLE_DESUL_ATOMICS) + if("${BLT_CXX_STD}" STREQUAL "c++11") + message(FATAL_ERROR "RAJA_ENABLE_DESUL_ATOMICS requires minimum C++ standard of c++14") + endif() +endif() -include(blt/SetupBLT.cmake) +set(CMAKE_CXX_EXTENSIONS OFF) + +if (NOT BLT_LOADED) + if (DEFINED BLT_SOURCE_DIR) + if (NOT EXISTS ${BLT_SOURCE_DIR}/SetupBLT.cmake) + message(FATAL_ERROR "Given BLT_SOURCE_DIR does not contain SetupBLT.cmake") + endif() + else () + set (BLT_SOURCE_DIR ${PROJECT_SOURCE_DIR}/blt CACHE PATH "") + + if (NOT EXISTS ${BLT_SOURCE_DIR}/SetupBLT.cmake) + message(FATAL_ERROR "\ + The BLT submodule is not present. \ + If in git repository run the following two commands:\n \ + git submodule init\n \ + git submodule update") + endif () + endif () + + include(${BLT_SOURCE_DIR}/SetupBLT.cmake) +endif() # # Define RAJA PERFSUITE settings... 
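Since the block above is wrapped in if(NOT DEFINED BLT_CXX_STD), the detection is only a fallback; a configure line can still pin the standard explicitly, the same way the sycl Docker stage passes -DBLT_CXX_STD=c++17. A minimal sketch, with the compiler and standard chosen only for illustration:

  mkdir build && cd build
  cmake -DCMAKE_CXX_COMPILER=g++ -DBLT_CXX_STD=c++14 ..
  make -j 6

With the variable set on the command line, the foreach/CHECK_CXX_COMPILER_FLAG probe never runs and only the c++98 guard applies.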
diff --git a/scripts/spack_packages/raja_perf/package.py b/scripts/spack_packages/raja_perf/package.py index 70db71fa4..c180bdd46 100644 --- a/scripts/spack_packages/raja_perf/package.py +++ b/scripts/spack_packages/raja_perf/package.py @@ -59,31 +59,25 @@ class RajaPerf(CMakePackage, CudaPackage): version('develop', branch='develop', submodules='True') version('main', branch='main', submodules='True') - version('0.12.1', tag='v0.12.1', submodules="True") - version('0.12.0', tag='v0.12.0', submodules="True") version('0.11.0', tag='v0.11.0', submodules="True") - version('0.10.1', tag='v0.10.1', submodules="True") version('0.10.0', tag='v0.10.0', submodules="True") version('0.9.0', tag='v0.9.0', submodules="True") version('0.8.0', tag='v0.8.0', submodules="True") version('0.7.0', tag='v0.7.0', submodules="True") version('0.6.0', tag='v0.6.0', submodules="True") - version('0.5.3', tag='v0.5.3', submodules="True") version('0.5.2', tag='v0.5.2', submodules="True") version('0.5.1', tag='v0.5.1', submodules="True") version('0.5.0', tag='v0.5.0', submodules="True") - version('0.4.1', tag='v0.4.1', submodules="True") version('0.4.0', tag='v0.4.0', submodules="True") variant('openmp', default=True, description='Build OpenMP backend') variant('openmp_target', default=False, description='Build with OpenMP target support') variant('shared', default=False, description='Build Shared Libs') variant('libcpp', default=False, description='Uses libc++ instead of libstdc++') - variant('hip', default=False, description='Build with HIP support') variant('tests', default='basic', values=('none', 'basic', 'benchmarks'), multi=False, description='Tests to run') - depends_on('cmake@3.8:', type='build') + depends_on('cmake@3.9:', type='build') depends_on('cmake@3.9:', when='+cuda', type='build') depends_on('hip', when='+hip') @@ -169,8 +163,8 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): cfg.write("###################\n".format("#" * 60)) cfg.write("# Generated host-config - Edit at own risk!\n") cfg.write("###################\n".format("#" * 60)) - cfg.write("# Copyright (c) 2020, Lawrence Livermore National Security, LLC and\n") - cfg.write("# other Umpire Project Developers. See the top-level LICENSE file for\n") + cfg.write("# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC and\n") + cfg.write("# other RAJAPerf contributors. See the top-level LICENSE file for\n") cfg.write("# details.\n") cfg.write("#\n") cfg.write("# SPDX-License-Identifier: (BSD-3-Clause) \n") From 992d893b21b94daa87e33e445f07a5747c0307bd Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 8 Mar 2022 13:22:20 -0800 Subject: [PATCH 238/392] lower number of kernel reps to 10 for CI runs --- Dockerfile | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Dockerfile b/Dockerfile index 38637f5ca..c2ee0dae2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,7 +11,7 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_TBB=On -DRAJA_DEPRECATED_TESTS=On .. && \ make -j 6 &&\ - ./bin/raja-perf.exe -sp + ./bin/raja-perf.exe --checkrun 10 -sp FROM ghcr.io/rse-ops/gcc-ubuntu-20.04:gcc-8.1.0 AS gcc8 ENV GTEST_COLOR=1 @@ -19,7 +19,7 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_WARNINGS_AS_ERRORS=On -DENABLE_COVERAGE=On -DRAJA_ENABLE_TBB=On .. 
&& \ make -j 6 &&\ - ./bin/raja-perf.exe -sp + ./bin/raja-perf.exe --checkrun 10 -sp FROM ghcr.io/rse-ops/gcc-ubuntu-20.04:gcc-9.4.0 AS gcc9 ENV GTEST_COLOR=1 @@ -27,7 +27,7 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_TBB=On -DRAJA_ENABLE_RUNTIME_PLUGINS=On .. && \ make -j 6 &&\ - ./bin/raja-perf.exe -sp + ./bin/raja-perf.exe --checkrun 10 -sp FROM ghcr.io/rse-ops/gcc-ubuntu-20.04:gcc-11.2.0 AS gcc11 ENV GTEST_COLOR=1 @@ -35,7 +35,7 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_TBB=On -DRAJA_ENABLE_BOUNDS_CHECK=ON .. && \ make -j 6 &&\ - ./bin/raja-perf.exe -sp + ./bin/raja-perf.exe --checkrun 10 -sp FROM ghcr.io/rse-ops/clang-ubuntu-20.04:llvm-11.0.0 AS clang11 ENV GTEST_COLOR=1 @@ -43,7 +43,7 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DRAJA_ENABLE_TBB=On .. && \ make -j 6 &&\ - ./bin/raja-perf.exe -sp + ./bin/raja-perf.exe --checkrun 10 -sp FROM ghcr.io/rse-ops/clang-ubuntu-20.04:llvm-11.0.0 AS clang11-debug ENV GTEST_COLOR=1 @@ -51,7 +51,7 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Debug .. && \ make -j 6 &&\ - ./bin/raja-perf.exe -sp + ./bin/raja-perf.exe --checkrun 10 -sp FROM ghcr.io/rse-ops/clang-ubuntu-22.04:llvm-13.0.0 AS clang13 ENV GTEST_COLOR=1 @@ -59,7 +59,7 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release .. && \ make -j 6 &&\ - ./bin/raja-perf.exe -sp + ./bin/raja-perf.exe --checkrun 10 -sp FROM ghcr.io/rse-ops/cuda:cuda-10.1.243-ubuntu-18.04 AS nvcc10 ENV GTEST_COLOR=1 @@ -101,4 +101,4 @@ WORKDIR /home/raja/workspace/build RUN /bin/bash -c "source /opt/view/setvars.sh && \ cmake -DCMAKE_CXX_COMPILER=dpcpp -DRAJA_ENABLE_SYCL=On -DENABLE_OPENMP=Off -DENABLE_ALL_WARNINGS=Off -DBLT_CXX_STD=c++17 .. && \ make -j 6 &&\ - ./bin/raja-perf.exe -sp" + ./bin/raja-perf.exe --checkrun 10 -sp" From ab888cb207ab13ce1560c2764dd501feefe690e9 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 8 Mar 2022 13:32:58 -0800 Subject: [PATCH 239/392] Back out previous change and force C++14 --- CMakeLists.txt | 55 +++----------------------------------------------- 1 file changed, 3 insertions(+), 52 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8d60b7c67..fee2f82d3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,59 +25,10 @@ set(ENABLE_TESTS Off CACHE BOOL "Enable BLT and RAJA tests") set(ENABLE_EXAMPLES Off CACHE BOOL "Enable RAJA examples") set(RAJA_ENABLE_EXERCISES Off CACHE BOOL "Enable RAJA exercises") -include(CheckCXXCompilerFlag) -if(NOT DEFINED BLT_CXX_STD) - if("cxx_std_17" IN_LIST CMAKE_CXX_KNOWN_FEATURES) - set(BLT_CXX_STD c++17 CACHE STRING "Version of C++ standard") - message("Using C++ standard: ${BLT_CXX_STD}") - elseif("cxx_std_14" IN_LIST CMAKE_CXX_KNOWN_FEATURES) - set(BLT_CXX_STD c++14 CACHE STRING "Version of C++ standard") - message("Using C++ standard: ${BLT_CXX_STD}") - elseif("${CMAKE_CXX_COMPILER_ID}" IN_LIST COMPILERS_KNOWN_TO_CMAKE33) - set(BLT_CXX_STD c++14 CACHE STRING "Version of C++ standard") - message("Using C++ standard: ${BLT_CXX_STD}") - else() #cmake has no idea what to do, do it ourselves... 
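For context, the smoke tests the container stages above converge on are plain command-line runs of the suite from the build directory; the flags are copied from the Dockerfile and the earlier CI stages, and the 10 is simply a small repetition count to keep CI turnaround short:

  # quick pass over the kernels with reduced repetitions
  ./bin/raja-perf.exe --checkrun 10 -sp

  # report what would be run without executing kernels (used by the earlier nvcc/hip stages)
  ./bin/raja-perf.exe --dryrun

Treat the flag meanings as inferred from their CI usage here rather than as a full option reference.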
- foreach(flag_var "c++17" "c++14" "c++11") - CHECK_CXX_COMPILER_FLAG("-std=${flag_var}" COMPILER_SUPPORTS_${flag_var}) - if(COMPILER_SUPPORTS_${flag_var}) - set(BLT_CXX_STD ${flag_var} CACHE STRING "Version of C++ standard") - message("Using C++ standard: ${BLT_CXX_STD}") - break() - endif() - endforeach(flag_var) - endif() -else() #check BLT_CXX_STD is high enough by disallowing the only invalid option - if("${BLT_CXX_STD}" STREQUAL "c++98") - message(FATAL_ERROR "RAJA requires minimum C++ standard of c++11") - endif() -endif(NOT DEFINED BLT_CXX_STD) -if (RAJA_ENABLE_DESUL_ATOMICS) - if("${BLT_CXX_STD}" STREQUAL "c++11") - message(FATAL_ERROR "RAJA_ENABLE_DESUL_ATOMICS requires minimum C++ standard of c++14") - endif() -endif() +set(CMAKE_CXX_STANDARD 14) +set(BLT_CXX_STANDARD 14) -set(CMAKE_CXX_EXTENSIONS OFF) - -if (NOT BLT_LOADED) - if (DEFINED BLT_SOURCE_DIR) - if (NOT EXISTS ${BLT_SOURCE_DIR}/SetupBLT.cmake) - message(FATAL_ERROR "Given BLT_SOURCE_DIR does not contain SetupBLT.cmake") - endif() - else () - set (BLT_SOURCE_DIR ${PROJECT_SOURCE_DIR}/blt CACHE PATH "") - - if (NOT EXISTS ${BLT_SOURCE_DIR}/SetupBLT.cmake) - message(FATAL_ERROR "\ - The BLT submodule is not present. \ - If in git repository run the following two commands:\n \ - git submodule init\n \ - git submodule update") - endif () - endif () - - include(${BLT_SOURCE_DIR}/SetupBLT.cmake) -endif() +include(blt/SetupBLT.cmake) # # Define RAJA PERFSUITE settings... From 695d9a732028196c62761cb7f68282d447577bec Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 9 Mar 2022 08:57:49 -0800 Subject: [PATCH 240/392] Use correct variable to set BLT C++ standard --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fee2f82d3..d9faf9bf0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,7 +26,7 @@ set(ENABLE_EXAMPLES Off CACHE BOOL "Enable RAJA examples") set(RAJA_ENABLE_EXERCISES Off CACHE BOOL "Enable RAJA exercises") set(CMAKE_CXX_STANDARD 14) -set(BLT_CXX_STANDARD 14) +set(BLT_CXX_STD 14) include(blt/SetupBLT.cmake) From 3dcd602ae8916539cea4d6d3f2e786a9cbf98d52 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 9 Mar 2022 09:27:14 -0800 Subject: [PATCH 241/392] Update spack package and remove some gitlab ci jobs --- .gitlab/corona-jobs.yml | 8 +-- .gitlab/lassen-jobs.yml | 8 +-- .gitlab/ruby-jobs.yml | 18 +++--- scripts/spack_packages/raja_perf/package.py | 66 ++++++++++----------- 4 files changed, 48 insertions(+), 52 deletions(-) diff --git a/.gitlab/corona-jobs.yml b/.gitlab/corona-jobs.yml index d5e72f6fe..303141a6b 100644 --- a/.gitlab/corona-jobs.yml +++ b/.gitlab/corona-jobs.yml @@ -10,7 +10,7 @@ hip_4_5_2_clang_13_0_0 (build and test on corona): SPEC: "+rocm~openmp amdgpu_target=gfx906 %clang@13.0.0 cxxflags=--offload-arch=gfx906 ^blt@develop ^hip@4.5.2" extends: .build_and_test_on_corona -hip_4_5_2_clang_13_0_0_desul_atomics (build and test on corona): - variables: - SPEC: "+rocm~openmp +desul amdgpu_target=gfx906 %clang@13.0.0 cxxflags=--offload-arch=gfx906 ^blt@develop ^hip@4.5.2" - extends: .build_and_test_on_corona +#hip_4_5_2_clang_13_0_0_desul_atomics (build and test on corona): +# variables: +# SPEC: "+rocm~openmp +desul amdgpu_target=gfx906 %clang@13.0.0 cxxflags=--offload-arch=gfx906 ^blt@develop ^hip@4.5.2" +# extends: .build_and_test_on_corona diff --git a/.gitlab/lassen-jobs.yml b/.gitlab/lassen-jobs.yml index 8b5d07099..651c34472 100644 --- a/.gitlab/lassen-jobs.yml +++ b/.gitlab/lassen-jobs.yml @@ -89,7 +89,7 @@ 
clang_9_0_0_memleak (build and test on lassen): ASAN_OPTIONS: "detect_leaks=1" extends: .build_and_test_on_lassen -gcc_8_3_1_cuda_desul_atomics: - variables: - SPEC: "+cuda +desul %gcc@8.3.1 cuda_arch=70 ^cuda@10.1.168" - extends: .build_and_test_on_lassen +#gcc_8_3_1_cuda_desul_atomics: +# variables: +# SPEC: "+cuda +desul %gcc@8.3.1 cuda_arch=70 ^cuda@10.1.168" +# extends: .build_and_test_on_lassen diff --git a/.gitlab/ruby-jobs.yml b/.gitlab/ruby-jobs.yml index 2b6cceb5c..ca9a8cb45 100644 --- a/.gitlab/ruby-jobs.yml +++ b/.gitlab/ruby-jobs.yml @@ -27,11 +27,11 @@ gcc_8_1_0: # DEFAULT_TIME: 40 # extends: .build_and_test_on_ruby -icpc_18_0_2: - variables: - SPEC: " tests=none %intel@18.0.2" - DEFAULT_TIME: 40 - extends: .build_and_test_on_ruby +#icpc_18_0_2: +# variables: +# SPEC: " tests=none %intel@18.0.2" +# DEFAULT_TIME: 40 +# extends: .build_and_test_on_ruby icpc_19_1_0: variables: @@ -47,7 +47,7 @@ icpc_19_1_0: # DEFAULT_TIME: 60 # extends: .build_and_test_on_ruby -clang_10_desul_atomics: - variables: - SPEC: "+openmp +desul %clang@10.0.1 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1" - extends: .build_and_test_on_ruby +#clang_10_desul_atomics: +# variables: +# SPEC: "+openmp +desul %clang@10.0.1 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1" +# extends: .build_and_test_on_ruby diff --git a/scripts/spack_packages/raja_perf/package.py b/scripts/spack_packages/raja_perf/package.py index c180bdd46..e7138a562 100644 --- a/scripts/spack_packages/raja_perf/package.py +++ b/scripts/spack_packages/raja_perf/package.py @@ -51,7 +51,7 @@ def get_spec_path(spec, package_name, path_replacements = {}, use_bin = False) : return path -class RajaPerf(CMakePackage, CudaPackage): +class RajaPerf(CMakePackage, CudaPackage, ROCmPackage): """RAJAPerf Suite Framework.""" homepage = "http://software.llnl.gov/RAJAPerf/" @@ -78,10 +78,10 @@ class RajaPerf(CMakePackage, CudaPackage): multi=False, description='Tests to run') depends_on('cmake@3.9:', type='build') - depends_on('cmake@3.9:', when='+cuda', type='build') - depends_on('hip', when='+hip') + depends_on('blt@0.4.1', type='build', when='@main') + depends_on('blt@0.4.1:', type='build') - conflicts('+openmp', when='+hip') + conflicts('+openmp', when='+rocm') conflicts('~openmp', when='+openmp_target', msg='OpenMP target requires OpenMP') phases = ['hostconfig', 'cmake', 'build', 'install'] @@ -244,6 +244,8 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): cfg.write(cmake_cache_entry("CMAKE_CUDA_COMPILER", cudacompiler)) + cfg.write(cmake_cache_string("BLT_CXX_STD", "c++14")) + cfg.write(cmake_cache_option("ENABLE_TESTS", False)) if ("xl" in cpp_compiler): cfg.write(cmake_cache_entry("CMAKE_CUDA_FLAGS", "-Xcompiler -O3 -Xcompiler -qxlcompatmacros -Xcompiler -qalias=noansi " + @@ -252,19 +254,17 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): cuda_release_flags = "-O3" cuda_reldebinf_flags = "-O3 -g" cuda_debug_flags = "-O0 -g" - cfg.write(cmake_cache_string("BLT_CXX_STD", "c++14")) - cfg.write(cmake_cache_option("ENABLE_TESTS", False)) + + elif ("gcc" in cpp_compiler): + cuda_release_flags = "-O3 -Xcompiler -Ofast -Xcompiler -finline-functions -Xcompiler -finline-limit=20000" + cuda_reldebinf_flags = "-O3 -g -Xcompiler -Ofast -Xcompiler -finline-functions -Xcompiler -finline-limit=20000" + cuda_debug_flags = "-O0 -g -Xcompiler -O0 -Xcompiler -finline-functions -Xcompiler -finline-limit=20000" + 
else: cuda_release_flags = "-O3 -Xcompiler -Ofast -Xcompiler -finline-functions" cuda_reldebinf_flags = "-O3 -g -Xcompiler -Ofast -Xcompiler -finline-functions" cuda_debug_flags = "-O0 -g -Xcompiler -O0 -Xcompiler -finline-functions" - cfg.write(cmake_cache_string("BLT_CXX_STD", "c++11")) - cfg.write(cmake_cache_option("ENABLE_TESTS", True)) - if ("clang" in cpp_compiler): - cfg.write(cmake_cache_string("BLT_CMAKE_IMPLICIT_LINK_DIRECTORIES_EXCLUDE", - "/usr/tce/packages/gcc/gcc-4.9.3/lib64;/usr/tce/packages/gcc/gcc-4.9.3/lib64/gcc/powerpc64le-unknown-linux-gnu/4.9.3")) - cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_RELEASE", cuda_release_flags)) cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_RELWITHDEBINFO", cuda_reldebinf_flags)) cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_DEBUG", cuda_debug_flags)) @@ -276,25 +276,31 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): else: cfg.write(cmake_cache_option("ENABLE_CUDA", False)) - if "+hip" in spec: + if "+rocm" in spec: cfg.write("#------------------{0}\n".format("-" * 60)) cfg.write("# HIP\n") cfg.write("#------------------{0}\n\n".format("-" * 60)) cfg.write(cmake_cache_option("ENABLE_HIP", True)) - cfg.write(cmake_cache_option("ENABLE_TESTS", True)) + cfg.write(cmake_cache_option("ENABLE_TESTS", False)) hip_root = spec['hip'].prefix rocm_root = hip_root + "/.." cfg.write(cmake_cache_entry("HIP_ROOT_DIR", hip_root)) - cfg.write(cmake_cache_entry("HIP_CLANG_PATH", + cfg.write(cmake_cache_entry("ROCM_ROOT_DIR", + rocm_root)) + cfg.write(cmake_cache_entry("HIP_PATH", rocm_root + '/llvm/bin')) - cfg.write(cmake_cache_entry("HIP_HIPCC_FLAGS", - '--amdgpu-target=gfx906')) - cfg.write(cmake_cache_entry("HIP_RUNTIME_INCLUDE_DIRS", - "{0}/include;{0}/../hsa/include".format(hip_root))) - hip_link_flags = "-Wl,--disable-new-dtags -L{0}/lib -L{0}/../lib64 -L{0}/../lib -Wl,-rpath,{0}/lib:{0}/../lib:{0}/../lib64 -lamdhip64 -lhsakmt -lhsa-runtime64".format(hip_root) + cfg.write(cmake_cache_entry("CMAKE_HIP_ARCHITECTURES", 'fx906')) + + hipcc_flags = ['--amdgpu-target=gfx906'] + + cfg.write(cmake_cache_entry("HIP_HIPCC_FLAGS", ';'.join(hipcc_flags))) + + #cfg.write(cmake_cache_entry("HIP_RUNTIME_INCLUDE_DIRS", + # "{0}/include;{0}/../hsa/include".format(hip_root))) + #hip_link_flags = "-Wl,--disable-new-dtags -L{0}/lib -L{0}/../lib64 -L{0}/../lib -Wl,-rpath,{0}/lib:{0}/../lib:{0}/../lib64 -lamdhip64 -lhsakmt -lhsa-runtime64".format(hip_root) if ('%gcc' in spec) or (using_toolchain): if ('%gcc' in spec): gcc_bin = os.path.dirname(self.compiler.cxx) @@ -304,9 +310,9 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): cfg.write(cmake_cache_entry("HIP_CLANG_FLAGS", "--gcc-toolchain={0}".format(gcc_prefix))) cfg.write(cmake_cache_entry("CMAKE_EXE_LINKER_FLAGS", - hip_link_flags + " -Wl,-rpath {}/lib64".format(gcc_prefix))) - else: - cfg.write(cmake_cache_entry("CMAKE_EXE_LINKER_FLAGS", hip_link_flags)) + " -Wl,-rpath {}/lib64".format(gcc_prefix))) + #else: + # cfg.write(cmake_cache_entry("CMAKE_EXE_LINKER_FLAGS", hip_link_flags)) else: cfg.write(cmake_cache_option("ENABLE_HIP", False)) @@ -334,18 +340,8 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): cfg.write(cmake_cache_option("BUILD_SHARED_LIBS","+shared" in spec)) cfg.write(cmake_cache_option("ENABLE_OPENMP","+openmp" in spec)) - # Note 1: Work around spack adding -march=ppc64le to SPACK_TARGET_ARGS - # which is used by the spack compiler wrapper. 
This can go away when - # BLT removes -Werror from GTest flags - # Note 2: Tests are either built if variant is set, or if run-tests - # option is passed. - if ("+cuda" in spec) and (self.spec.satisfies('%clang target=ppc64le:')): - cfg.write(cmake_cache_option("ENABLE_TESTS",False)) - if 'tests=benchmarks' in spec or not 'tests=none' in spec: - print("MSG: no testing supported on %clang target=ppc64le:") - else: - cfg.write(cmake_cache_option("ENABLE_BENCHMARKS", 'tests=benchmarks' in spec)) - cfg.write(cmake_cache_option("ENABLE_TESTS", not 'tests=none' in spec or self.run_tests)) + cfg.write(cmake_cache_option("ENABLE_BENCHMARKS", 'tests=benchmarks' in spec)) + cfg.write(cmake_cache_option("ENABLE_TESTS", not 'tests=none' in spec or self.run_tests)) ####################### # Close and save From 2df84d93af47d886ad9122b1febc5cb2982f99d5 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 9 Mar 2022 10:45:56 -0800 Subject: [PATCH 242/392] update blt --- blt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blt b/blt index 223512d34..296bf64e6 160000 --- a/blt +++ b/blt @@ -1 +1 @@ -Subproject commit 223512d349713c071ef9ed2c4ae8b5c22ceabe27 +Subproject commit 296bf64e64edfcfcce6a53e3b396d6529e76b986 From 5cdb072f9ed930dc2f443fc402e5a8446227be55 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 9 Mar 2022 10:59:35 -0800 Subject: [PATCH 243/392] Set CUDA standard to C++14 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d9faf9bf0..80dd84d26 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -95,7 +95,7 @@ set(RAJAPERF_BUILD_SYSTYPE $ENV{SYS_TYPE}) set(RAJAPERF_BUILD_HOST $ENV{HOSTNAME}) if (ENABLE_CUDA) - set(CMAKE_CUDA_STANDARD 11) + set(CMAKE_CUDA_STANDARD 14) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -restrict -arch ${CUDA_ARCH} --expt-extended-lambda --expt-relaxed-constexpr") set(RAJAPERF_COMPILER "${CUDA_NVCC_EXECUTABLE}") From 11e1acec205df03a488950cac762095fbfc18dce Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 9 Mar 2022 11:50:16 -0800 Subject: [PATCH 244/392] update RAJA and submodules to v0.15.0-RC branch --- tpl/RAJA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpl/RAJA b/tpl/RAJA index 9380c5cbd..648dc73b2 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit 9380c5cbdf26f7a1bda9dfe1a47cf0b8be916819 +Subproject commit 648dc73b26221f859d03b34fafa1ab7d6b3b661b From d1991912c13766360b182f33aabe8162903d85aa Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 9 Mar 2022 13:52:32 -0800 Subject: [PATCH 245/392] remove multi_project stage from file (carried over from RAJA) --- .gitlab-ci.yml | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 32d794b64..c855c3add 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -96,21 +96,6 @@ stages: .build_blueos_3_ppc64le_ib_p9_script: extends: .build_blueos_3_ppc64le_ib_script -# If testing develop branch, trigger CHAI pipeline with this version of RAJA. -# TODO: Once spack allows to clone a specific commit on demand, then point to the exact commit. -# This will prevent from sticking to a branch (here develop). -# MP_BRANCH is short for "Multi-Project Branch" and will usually be develop. -trigger-rajaperf: - stage: multi_project - rules: - - if: '$CI_COMMIT_BRANCH == "${MP_BRANCH}" || $MULTI_PROJECT == "ON"' #run only if ... 
- variables: - UPDATE_RAJA: ${MP_BRANCH} - trigger: - project: radiuss/rajaperf - branch: develop - strategy: depend - # This is where jobs are included. include: - local: .gitlab/ruby-templates.yml From a260720ce1b13e0d131d6b9fbf9495c63fd40f42 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 9 Mar 2022 14:02:36 -0800 Subject: [PATCH 246/392] changes for consistency with working gitlab CI stuff on develop. --- .gitlab-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index c855c3add..bd630d925 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -50,9 +50,9 @@ stages: - r_build_and_test - r_release_resources - l_build_and_test - - b_build_and_test + - c_allocate_resources - c_build_and_test - - multi_project + - c_release_resources # This is the rules that drives the activation of "advanced" jobs. All advanced # jobs will share this through a template mechanism. From 0d4da746859cc7a1296b2c5a560da5dd1f6a1104 Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Thu, 10 Mar 2022 14:37:03 -0800 Subject: [PATCH 247/392] updating the CMakeLists file and blt --- CMakeLists.txt | 7 ++++--- blt | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 80dd84d26..a42e0dcdc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,7 +26,7 @@ set(ENABLE_EXAMPLES Off CACHE BOOL "Enable RAJA examples") set(RAJA_ENABLE_EXERCISES Off CACHE BOOL "Enable RAJA exercises") set(CMAKE_CXX_STANDARD 14) -set(BLT_CXX_STD 14) +set(BLT_CXX_STD c++14) include(blt/SetupBLT.cmake) @@ -40,8 +40,8 @@ cmake_dependent_option(RAJA_PERFSUITE_ENABLE_MPI "Build with MPI" On "ENABLE_MPI # Define RAJA settings... # -set(ENABLE_TESTS Off CACHE BOOL "") -set(ENABLE_EXAMPLES Off CACHE BOOL "") +set(RAJA_ENABLE_TESTS Off CACHE BOOL "") +set(RAJA_ENABLE_EXAMPLES Off CACHE BOOL "") set(ENABLE_DOCUMENTATION Off CACHE BOOL "") set(ENABLE_TBB Off CACHE BOOL "") @@ -59,6 +59,7 @@ add_subdirectory(tpl/RAJA) get_property(RAJA_INCLUDE_DIRS DIRECTORY tpl/RAJA PROPERTY INCLUDE_DIRECTORIES) include_directories(${RAJA_INCLUDE_DIRS}) +set(CAMP_ENABLE_TESTS Off CACHE BOOL "") if (ENABLE_RAJA_SEQUENTIAL) add_definitions(-DRUN_RAJA_SEQ) diff --git a/blt b/blt index 296bf64e6..e35f490a8 160000 --- a/blt +++ b/blt @@ -1 +1 @@ -Subproject commit 296bf64e64edfcfcce6a53e3b396d6529e76b986 +Subproject commit e35f490a8a8b1689e99b5f4308b5251f97eb36cf From 8c2a8d4d28ad6c0ebb1e88fe1714136ba06bfca3 Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Thu, 10 Mar 2022 16:08:20 -0800 Subject: [PATCH 248/392] setting blt to v0.5.0 --- blt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blt b/blt index e35f490a8..296bf64e6 160000 --- a/blt +++ b/blt @@ -1 +1 @@ -Subproject commit e35f490a8a8b1689e99b5f4308b5251f97eb36cf +Subproject commit 296bf64e64edfcfcce6a53e3b396d6529e76b986 From f81c01baf0665b59ade0a1b7fc4f8d6813e14a78 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 11 Mar 2022 09:01:34 -0800 Subject: [PATCH 249/392] Comment out suspicious line about include directory 'bin' --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a42e0dcdc..62be98c31 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -116,7 +116,7 @@ endif() configure_file(${CMAKE_SOURCE_DIR}/src/rajaperf_config.hpp.in ${CMAKE_CURRENT_BINARY_DIR}/bin/rajaperf_config.hpp) -include_directories($) +#include_directories($) # Make sure RAJA flag propagate (we need to do some house cleaning to # remove 
project-specific CMake variables that are no longer needed) From 1387b15b50356594f66b3440dd52043649e90133 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 11 Mar 2022 09:33:34 -0800 Subject: [PATCH 250/392] Undo change from previous commit --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 62be98c31..a42e0dcdc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -116,7 +116,7 @@ endif() configure_file(${CMAKE_SOURCE_DIR}/src/rajaperf_config.hpp.in ${CMAKE_CURRENT_BINARY_DIR}/bin/rajaperf_config.hpp) -#include_directories($) +include_directories($) # Make sure RAJA flag propagate (we need to do some house cleaning to # remove project-specific CMake variables that are no longer needed) From 2693d959b5be40b16a1c7b6b3af6f4354589e52e Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 11 Mar 2022 10:45:11 -0800 Subject: [PATCH 251/392] Fix bin vs. include dir issue --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a42e0dcdc..125c46faf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -114,9 +114,9 @@ else() endif() configure_file(${CMAKE_SOURCE_DIR}/src/rajaperf_config.hpp.in - ${CMAKE_CURRENT_BINARY_DIR}/bin/rajaperf_config.hpp) + ${CMAKE_CURRENT_BINARY_DIR}/include/rajaperf_config.hpp) -include_directories($) +include_directories($) # Make sure RAJA flag propagate (we need to do some house cleaning to # remove project-specific CMake variables that are no longer needed) From 2c1531e89035baac34d9738b9683f3a8057598c7 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 11 Mar 2022 12:29:22 -0800 Subject: [PATCH 252/392] Update gitlab ci compilers and run command --- .gitlab/lassen-jobs.yml | 25 +++++++++++++++---------- scripts/gitlab/build_and_test.sh | 4 ++-- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/.gitlab/lassen-jobs.yml b/.gitlab/lassen-jobs.yml index 651c34472..fbf08d5ed 100644 --- a/.gitlab/lassen-jobs.yml +++ b/.gitlab/lassen-jobs.yml @@ -9,16 +9,21 @@ # CPU ONLY ########## -ibm_clang_9: +ibm_clang_10_0_1: variables: - SPEC: "%clang@ibm.9.0.0" + SPEC: "%clang@ibm.10.0.1" extends: .build_and_test_on_lassen -ibm_clang_9_gcc_8: +ibm_clang_11_0_0: variables: - SPEC: "%clang@ibm.9.0.0 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1" + SPEC: "%clang@ibm.11.1.1" extends: .build_and_test_on_lassen +#ibm_clang_9_gcc_8: +# variables: +# SPEC: "%clang@ibm.9.0.0 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1" +# extends: .build_and_test_on_lassen + gcc_8_3_1: variables: SPEC: "%gcc@8.3.1 cxxflags='-finline-functions -finline-limit=20000' cflags='-finline-functions -finline-limit=20000'" @@ -40,14 +45,14 @@ xl_16_1_1_11_gcc_8_3_1: # CUDA ########## -ibm_clang_9_cuda: - variables: - SPEC: "+cuda cuda_arch=70 %clang@ibm.9.0.0 ^cuda@10.1.168" - extends: .build_and_test_on_lassen +#ibm_clang_9_cuda: +# variables: +# SPEC: "+cuda cuda_arch=70 %clang@ibm.9.0.0 ^cuda@10.1.168" +# extends: .build_and_test_on_lassen -ibm_clang_10_cuda: +ibm_clang_11_cuda: variables: - SPEC: "+cuda cuda_arch=70 %clang@ibm.10.0.1 ^cuda@10.1.168" + SPEC: "+cuda cuda_arch=70 %clang@ibm.11.0.0 ^cuda@10.1.243" extends: .build_and_test_on_lassen gcc_8_3_1_cuda: diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index b7246db7f..04d7d0aed 100755 --- a/scripts/gitlab/build_and_test.sh +++ 
b/scripts/gitlab/build_and_test.sh @@ -196,9 +196,9 @@ then echo "./bin/raja-perf.exe -sp" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" else - ./bin/raja-perf.exe --checkrun -sp + ./bin/raja-perf.exe --checkrun 10 -sp echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" - echo "./bin/raja-perf.exe --checkrun -sp" + echo "./bin/raja-perf.exe --checkrun 10 -sp" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" fi fi From e776ea003a51a3f8adfcb95eed0293c8c9b383d8 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 11 Mar 2022 12:54:18 -0800 Subject: [PATCH 253/392] Fix typo in spec --- .gitlab/lassen-jobs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/lassen-jobs.yml b/.gitlab/lassen-jobs.yml index fbf08d5ed..77843ad04 100644 --- a/.gitlab/lassen-jobs.yml +++ b/.gitlab/lassen-jobs.yml @@ -16,7 +16,7 @@ ibm_clang_10_0_1: ibm_clang_11_0_0: variables: - SPEC: "%clang@ibm.11.1.1" + SPEC: "%clang@ibm.11.0.0" extends: .build_and_test_on_lassen #ibm_clang_9_gcc_8: From 922f314e86986e0108f75cbf9bc4db045a965b79 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 11 Mar 2022 13:40:55 -0800 Subject: [PATCH 254/392] Trying to find working compilers for CI.... --- .gitlab/lassen-jobs.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.gitlab/lassen-jobs.yml b/.gitlab/lassen-jobs.yml index 77843ad04..8f2ff686d 100644 --- a/.gitlab/lassen-jobs.yml +++ b/.gitlab/lassen-jobs.yml @@ -9,14 +9,14 @@ # CPU ONLY ########## -ibm_clang_10_0_1: - variables: - SPEC: "%clang@ibm.10.0.1" - extends: .build_and_test_on_lassen +#ibm_clang_10_0_1: +# variables: +# SPEC: "%clang@ibm.10.0.1" +# extends: .build_and_test_on_lassen -ibm_clang_11_0_0: +clang_11_0_0: variables: - SPEC: "%clang@ibm.11.0.0" + SPEC: "%clang@11.0.0" extends: .build_and_test_on_lassen #ibm_clang_9_gcc_8: @@ -50,9 +50,9 @@ xl_16_1_1_11_gcc_8_3_1: # SPEC: "+cuda cuda_arch=70 %clang@ibm.9.0.0 ^cuda@10.1.168" # extends: .build_and_test_on_lassen -ibm_clang_11_cuda: +clang_11_cuda: variables: - SPEC: "+cuda cuda_arch=70 %clang@ibm.11.0.0 ^cuda@10.1.243" + SPEC: "+cuda cuda_arch=70 %clang@11.0.0 ^cuda@10.1.168" extends: .build_and_test_on_lassen gcc_8_3_1_cuda: From 5fe3678eea13dc6078f6474da9dbf299f10c867f Mon Sep 17 00:00:00 2001 From: David Beckingsale Date: Fri, 11 Mar 2022 13:51:52 -0800 Subject: [PATCH 255/392] Set BLT_EXPORT_THIRDPARTY to avoid issues with imported blt_hip target --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index c2ee0dae2..674e1a96d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -91,7 +91,7 @@ ENV HCC_AMDGPU_TARGET=gfx900 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN . /opt/spack/share/spack/setup-env.sh && spack load hip llvm-amdgpu && \ - cmake -DCMAKE_CXX_COMPILER=amdclang++ -DRAJA_ENABLE_EXTERNAL_ROCPRIM=Off -DHIP_PATH=/opt -DENABLE_HIP=On -DENABLE_CUDA=Off -DRAJA_ENABLE_WARNINGS_AS_ERRORS=Off .. && \ + cmake -DCMAKE_CXX_COMPILER=amdclang++ -DRAJA_ENABLE_EXTERNAL_ROCPRIM=Off -DHIP_PATH=/opt -DENABLE_HIP=On -DENABLE_CUDA=Off -DRAJA_ENABLE_WARNINGS_AS_ERRORS=Off -DBLT_EXPORT_THIRDPARTY=On .. 
&& \ make -j 6 FROM ghcr.io/rse-ops/intel-ubuntu-22.04:intel-2022.0.1 AS sycl From 9dc2be5b979daffd093d777213654b9661f313ad Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 11 Mar 2022 15:45:42 -0800 Subject: [PATCH 256/392] Change run config for clang11 debug to avoid timeout; check openmp --- Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index c2ee0dae2..8ca594ae5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,7 +9,7 @@ FROM ghcr.io/rse-ops/gcc-ubuntu-20.04:gcc-7.3.0 AS gcc7 ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_TBB=On -DRAJA_DEPRECATED_TESTS=On .. && \ +RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_TBB=On -DENABLE_OPENMP=On .. && \ make -j 6 &&\ ./bin/raja-perf.exe --checkrun 10 -sp @@ -51,7 +51,7 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Debug .. && \ make -j 6 &&\ - ./bin/raja-perf.exe --checkrun 10 -sp + ./bin/raja-perf.exe --checkrun -sp FROM ghcr.io/rse-ops/clang-ubuntu-22.04:llvm-13.0.0 AS clang13 ENV GTEST_COLOR=1 From 9c2b79d177b6fa098c069e23ffa86d70ae20cca2 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Sun, 13 Mar 2022 13:06:43 -0700 Subject: [PATCH 257/392] Enable OpenMP for azure builds; print exec. time for basic hah-hah check --- Dockerfile | 36 ++++++++++++++++++------------------ src/common/Executor.cpp | 7 +++++-- 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/Dockerfile b/Dockerfile index a5b0f3531..70a970850 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,47 +9,47 @@ FROM ghcr.io/rse-ops/gcc-ubuntu-20.04:gcc-7.3.0 AS gcc7 ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_TBB=On -DENABLE_OPENMP=On .. && \ +RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DENABLE_OPENMP=On .. && \ make -j 6 &&\ - ./bin/raja-perf.exe --checkrun 10 -sp + ./bin/raja-perf.exe --checkrun 5 -sp FROM ghcr.io/rse-ops/gcc-ubuntu-20.04:gcc-8.1.0 AS gcc8 ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_WARNINGS_AS_ERRORS=On -DENABLE_COVERAGE=On -DRAJA_ENABLE_TBB=On .. && \ +RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_WARNINGS_AS_ERRORS=On -DENABLE_OPENMP=On .. && \ make -j 6 &&\ - ./bin/raja-perf.exe --checkrun 10 -sp + ./bin/raja-perf.exe --checkrun 5 -sp FROM ghcr.io/rse-ops/gcc-ubuntu-20.04:gcc-9.4.0 AS gcc9 ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_TBB=On -DRAJA_ENABLE_RUNTIME_PLUGINS=On .. && \ +RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DENABLE_OPENMP=On .. && \ make -j 6 &&\ - ./bin/raja-perf.exe --checkrun 10 -sp + ./bin/raja-perf.exe --checkrun 5 -sp FROM ghcr.io/rse-ops/gcc-ubuntu-20.04:gcc-11.2.0 AS gcc11 ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_TBB=On -DRAJA_ENABLE_BOUNDS_CHECK=ON .. && \ +RUN cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DENABLE_OPENMP=On .. 
&& \ make -j 6 &&\ - ./bin/raja-perf.exe --checkrun 10 -sp + ./bin/raja-perf.exe --checkrun 5 -sp FROM ghcr.io/rse-ops/clang-ubuntu-20.04:llvm-11.0.0 AS clang11 ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DRAJA_ENABLE_TBB=On .. && \ +RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DENABLE_OPENMP=On .. && \ make -j 6 &&\ - ./bin/raja-perf.exe --checkrun 10 -sp + ./bin/raja-perf.exe --checkrun 5 -sp FROM ghcr.io/rse-ops/clang-ubuntu-20.04:llvm-11.0.0 AS clang11-debug ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Debug .. && \ +RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Debug -DENABLE_OPENMP=On .. && \ make -j 6 &&\ ./bin/raja-perf.exe --checkrun -sp @@ -57,16 +57,16 @@ FROM ghcr.io/rse-ops/clang-ubuntu-22.04:llvm-13.0.0 AS clang13 ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release .. && \ +RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release -DENABLE_OPENMP=On .. && \ make -j 6 &&\ - ./bin/raja-perf.exe --checkrun 10 -sp + ./bin/raja-perf.exe --checkrun 5 -sp FROM ghcr.io/rse-ops/cuda:cuda-10.1.243-ubuntu-18.04 AS nvcc10 ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN . /opt/spack/share/spack/setup-env.sh && spack load cuda && \ - cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 -DCMAKE_CUDA_ARCHITECTURES=70 .. && \ + cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 -DCMAKE_CUDA_ARCHITECTURES=70 -DENABLE_OPENMP=On .. && \ make -j 4 FROM ghcr.io/rse-ops/cuda-ubuntu-20.04:cuda-11.1.1 AS nvcc11 @@ -74,7 +74,7 @@ ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN . /opt/spack/share/spack/setup-env.sh && spack load cuda && \ - cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 -DCMAKE_CUDA_ARCHITECTURES=70 .. && \ + cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 -DCMAKE_CUDA_ARCHITECTURES=70 -DENABLE_OPENMP=On .. && \ make -j 4 FROM ghcr.io/rse-ops/cuda-ubuntu-20.04:cuda-11.1.1 AS nvcc11-debug @@ -82,7 +82,7 @@ ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN . /opt/spack/share/spack/setup-env.sh && spack load cuda && \ - cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 -DCMAKE_CUDA_ARCHITECTURES=70 .. && \ + cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 -DCMAKE_CUDA_ARCHITECTURES=70 -DENABLE_OPENMP=On .. && \ make -j 4 FROM ghcr.io/rse-ops/hip-ubuntu-20.04:hip-4.3.1 AS hip @@ -91,7 +91,7 @@ ENV HCC_AMDGPU_TARGET=gfx900 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN . /opt/spack/share/spack/setup-env.sh && spack load hip llvm-amdgpu && \ - cmake -DCMAKE_CXX_COMPILER=amdclang++ -DRAJA_ENABLE_EXTERNAL_ROCPRIM=Off -DHIP_PATH=/opt -DENABLE_HIP=On -DENABLE_CUDA=Off -DRAJA_ENABLE_WARNINGS_AS_ERRORS=Off -DBLT_EXPORT_THIRDPARTY=On .. && \ + cmake -DCMAKE_CXX_COMPILER=amdclang++ -DRAJA_ENABLE_EXTERNAL_ROCPRIM=Off -DHIP_PATH=/opt -DENABLE_HIP=On -DENABLE_CUDA=Off -DENABLE_OPENMP=Off -DRAJA_ENABLE_WARNINGS_AS_ERRORS=Off -DBLT_EXPORT_THIRDPARTY=On .. 
&& \ make -j 6 FROM ghcr.io/rse-ops/intel-ubuntu-22.04:intel-2022.0.1 AS sycl @@ -101,4 +101,4 @@ WORKDIR /home/raja/workspace/build RUN /bin/bash -c "source /opt/view/setvars.sh && \ cmake -DCMAKE_CXX_COMPILER=dpcpp -DRAJA_ENABLE_SYCL=On -DENABLE_OPENMP=Off -DENABLE_ALL_WARNINGS=Off -DBLT_CXX_STD=c++17 .. && \ make -j 6 &&\ - ./bin/raja-perf.exe --checkrun 10 -sp" + ./bin/raja-perf.exe --checkrun 5 -sp" diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index 6980ba082..92fabcebc 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -802,10 +802,13 @@ void Executor::runSuite() } else { getCout() << " No "; } - getCout() << getVariantName(vid) << " variant" << endl; + getCout() << getVariantName(vid) << " variant"; } if ( kern->hasVariantDefined(vid) ) { - kernels[ik]->execute(vid); + kern->execute(vid); + getCout() << " -- " << kern->getTotTime(vid) << " sec." << endl; + } else { + getCout() << endl; } } // loop over variants From cc08c3f8fd2b41aa4fed4ee10a5d35741dc03828 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Sun, 13 Mar 2022 13:10:17 -0700 Subject: [PATCH 258/392] Throttle back kernel its. for Mac check --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 2dc813310..027b6264a 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -99,6 +99,6 @@ jobs: condition: eq( variables['Agent.OS'], 'Darwin') - script: | cd build - ./bin/raja-perf.exe --checkrun -sp + ./bin/raja-perf.exe --checkrun 5 -sp displayName: 'Run Perf Suite' condition: eq( variables['Agent.OS'], 'Darwin') From 820a2d104d3e734415f97b09eac1a020fc320488 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Sun, 13 Mar 2022 13:47:36 -0700 Subject: [PATCH 259/392] Remove openmp from clang builds, not supported in containers --- Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 70a970850..8aa08d31d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -41,7 +41,7 @@ FROM ghcr.io/rse-ops/clang-ubuntu-20.04:llvm-11.0.0 AS clang11 ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DENABLE_OPENMP=On .. && \ +RUN cmake -DCMAKE_CXX_COMPILER=clang++ .. && \ make -j 6 &&\ ./bin/raja-perf.exe --checkrun 5 -sp @@ -49,7 +49,7 @@ FROM ghcr.io/rse-ops/clang-ubuntu-20.04:llvm-11.0.0 AS clang11-debug ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Debug -DENABLE_OPENMP=On .. && \ +RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Debug .. && \ make -j 6 &&\ ./bin/raja-perf.exe --checkrun -sp @@ -57,7 +57,7 @@ FROM ghcr.io/rse-ops/clang-ubuntu-22.04:llvm-13.0.0 AS clang13 ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release -DENABLE_OPENMP=On .. && \ +RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release .. 
&& \ make -j 6 &&\ ./bin/raja-perf.exe --checkrun 5 -sp From c582d2e354d0232abd5f788134b5acf401d89f84 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 15 Mar 2022 10:47:31 -0700 Subject: [PATCH 260/392] Add unused var, arg warning squashers --- src/common/GPUUtils.hpp | 3 +-- src/rajaperf_config.hpp.in | 7 +++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp index 9f83cb804..980533faf 100644 --- a/src/common/GPUUtils.hpp +++ b/src/common/GPUUtils.hpp @@ -16,7 +16,6 @@ #include "rajaperf_config.hpp" - namespace rajaperf { @@ -161,7 +160,7 @@ void seq_for(camp::int_seq, Func&& func) { // braced init lists are evaluated in order int seq_unused_array[] = {(func(camp::integral_constant{}), 0)...}; - RAJA_UNUSED_VAR(seq_unused_array); + RAJAPERF_UNUSED_VAR(seq_unused_array); } template void seq_for(Func&& func) diff --git a/src/rajaperf_config.hpp.in b/src/rajaperf_config.hpp.in index 429219185..7565cba9b 100644 --- a/src/rajaperf_config.hpp.in +++ b/src/rajaperf_config.hpp.in @@ -74,4 +74,11 @@ std::string machine_run; } // closing brace for rajaperf namespace +// Squash compiler warnings about unused variables +template < typename ... Ts > +inline void RAJAPERF_UNUSED_VAR(Ts&&...) { } + +// Squash compiler warnings about unused arguments +#define RAJAPERF_UNUSED_ARG(...) + #endif // closing endif for header file include guard From ad69962212bed782c9235cb674381e0d2b5f3608 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 15 Mar 2022 10:59:13 -0700 Subject: [PATCH 261/392] Remove extra seq_for --- src/common/GPUUtils.hpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp index 980533faf..af892051b 100644 --- a/src/common/GPUUtils.hpp +++ b/src/common/GPUUtils.hpp @@ -156,17 +156,12 @@ using list_type = //compile time loop over an integer sequence //this allows for creating a loop over a compile time constant variable template -void seq_for(camp::int_seq, Func&& func) +inline void seq_for(camp::int_seq const&, Func&& func) { // braced init lists are evaluated in order int seq_unused_array[] = {(func(camp::integral_constant{}), 0)...}; RAJAPERF_UNUSED_VAR(seq_unused_array); } -template -void seq_for(Func&& func) -{ - seq_for(camp::make_int_seq_t{}, std::forward(func)); -} } // closing brace for rajaperf namespace From 451b784303a5867758532b1fa50531caccf52d02 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 15 Mar 2022 11:58:08 -0700 Subject: [PATCH 262/392] Updated copyright year to 2022 --- CMakeLists.txt | 2 +- LICENSE | 2 +- README.md | 6 +++--- RELEASE | 2 +- scripts/install_llvm.sh | 2 +- scripts/lc-builds/blueos_clang.sh | 2 +- scripts/lc-builds/blueos_clang_omptarget.sh | 2 +- scripts/lc-builds/blueos_gcc.sh | 2 +- scripts/lc-builds/blueos_nvcc_clang.sh | 2 +- scripts/lc-builds/blueos_nvcc_gcc.sh | 2 +- scripts/lc-builds/blueos_nvcc_xl.sh | 2 +- scripts/lc-builds/blueos_pgi.sh | 2 +- scripts/lc-builds/blueos_spectrum_nvcc_clang.sh | 2 +- scripts/lc-builds/blueos_xl.sh | 2 +- scripts/lc-builds/blueos_xl_omptarget.sh | 2 +- scripts/lc-builds/toss3_clang.sh | 2 +- scripts/lc-builds/toss3_gcc.sh | 2 +- scripts/lc-builds/toss3_hipcc.sh | 2 +- scripts/lc-builds/toss3_icpc.sh | 2 +- scripts/make_release_tarball.sh | 2 +- scripts/travis_build_and_test.sh | 2 +- scripts/ubuntu-builds/ubuntu_clang.sh | 2 +- scripts/ubuntu-builds/ubuntu_gcc.sh | 2 +- scripts/update_copyright.sh | 8 ++++---- src/CMakeLists.txt | 2 +- src/RAJAPerfSuiteDriver.cpp | 2 
+- src/algorithm/CMakeLists.txt | 2 +- src/algorithm/SORT-Cuda.cpp | 2 +- src/algorithm/SORT-Hip.cpp | 2 +- src/algorithm/SORT-OMP.cpp | 2 +- src/algorithm/SORT-Seq.cpp | 2 +- src/algorithm/SORT.cpp | 2 +- src/algorithm/SORT.hpp | 2 +- src/algorithm/SORTPAIRS-Cuda.cpp | 2 +- src/algorithm/SORTPAIRS-Hip.cpp | 2 +- src/algorithm/SORTPAIRS-OMP.cpp | 2 +- src/algorithm/SORTPAIRS-Seq.cpp | 2 +- src/algorithm/SORTPAIRS.cpp | 2 +- src/algorithm/SORTPAIRS.hpp | 2 +- src/apps/AppsData.cpp | 2 +- src/apps/AppsData.hpp | 2 +- src/apps/CMakeLists.txt | 2 +- src/apps/DEL_DOT_VEC_2D-Cuda.cpp | 2 +- src/apps/DEL_DOT_VEC_2D-Hip.cpp | 2 +- src/apps/DEL_DOT_VEC_2D-OMP.cpp | 2 +- src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp | 2 +- src/apps/DEL_DOT_VEC_2D-Seq.cpp | 2 +- src/apps/DEL_DOT_VEC_2D.cpp | 2 +- src/apps/DEL_DOT_VEC_2D.hpp | 2 +- src/apps/DIFFUSION3DPA-Cuda.cpp | 2 +- src/apps/DIFFUSION3DPA-Hip.cpp | 2 +- src/apps/DIFFUSION3DPA-OMP.cpp | 2 +- src/apps/DIFFUSION3DPA-OMPTarget.cpp | 2 +- src/apps/DIFFUSION3DPA-Seq.cpp | 2 +- src/apps/DIFFUSION3DPA.cpp | 2 +- src/apps/DIFFUSION3DPA.hpp | 2 +- src/apps/ENERGY-Cuda.cpp | 2 +- src/apps/ENERGY-Hip.cpp | 2 +- src/apps/ENERGY-OMP.cpp | 2 +- src/apps/ENERGY-OMPTarget.cpp | 2 +- src/apps/ENERGY-Seq.cpp | 2 +- src/apps/ENERGY.cpp | 2 +- src/apps/ENERGY.hpp | 2 +- src/apps/FEM_MACROS.hpp | 2 +- src/apps/FIR-Cuda.cpp | 2 +- src/apps/FIR-Hip.cpp | 2 +- src/apps/FIR-OMP.cpp | 2 +- src/apps/FIR-OMPTarget.cpp | 2 +- src/apps/FIR-Seq.cpp | 2 +- src/apps/FIR.cpp | 2 +- src/apps/FIR.hpp | 2 +- src/apps/HALOEXCHANGE-Cuda.cpp | 2 +- src/apps/HALOEXCHANGE-Hip.cpp | 2 +- src/apps/HALOEXCHANGE-OMP.cpp | 2 +- src/apps/HALOEXCHANGE-OMPTarget.cpp | 2 +- src/apps/HALOEXCHANGE-Seq.cpp | 2 +- src/apps/HALOEXCHANGE.cpp | 2 +- src/apps/HALOEXCHANGE.hpp | 2 +- src/apps/HALOEXCHANGE_FUSED-Cuda.cpp | 2 +- src/apps/HALOEXCHANGE_FUSED-Hip.cpp | 2 +- src/apps/HALOEXCHANGE_FUSED-OMP.cpp | 2 +- src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp | 2 +- src/apps/HALOEXCHANGE_FUSED-Seq.cpp | 2 +- src/apps/HALOEXCHANGE_FUSED.cpp | 2 +- src/apps/HALOEXCHANGE_FUSED.hpp | 2 +- src/apps/LTIMES-Cuda.cpp | 2 +- src/apps/LTIMES-Hip.cpp | 2 +- src/apps/LTIMES-OMP.cpp | 2 +- src/apps/LTIMES-OMPTarget.cpp | 2 +- src/apps/LTIMES-Seq.cpp | 2 +- src/apps/LTIMES.cpp | 2 +- src/apps/LTIMES.hpp | 2 +- src/apps/LTIMES_NOVIEW-Cuda.cpp | 2 +- src/apps/LTIMES_NOVIEW-Hip.cpp | 2 +- src/apps/LTIMES_NOVIEW-OMP.cpp | 2 +- src/apps/LTIMES_NOVIEW-OMPTarget.cpp | 2 +- src/apps/LTIMES_NOVIEW-Seq.cpp | 2 +- src/apps/LTIMES_NOVIEW.cpp | 2 +- src/apps/LTIMES_NOVIEW.hpp | 2 +- src/apps/MASS3DPA-Cuda.cpp | 2 +- src/apps/MASS3DPA-Hip.cpp | 2 +- src/apps/MASS3DPA-OMP.cpp | 2 +- src/apps/MASS3DPA-OMPTarget.cpp | 2 +- src/apps/MASS3DPA-Seq.cpp | 2 +- src/apps/MASS3DPA.cpp | 2 +- src/apps/MASS3DPA.hpp | 2 +- src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp | 2 +- src/apps/NODAL_ACCUMULATION_3D-Hip.cpp | 2 +- src/apps/NODAL_ACCUMULATION_3D-OMP.cpp | 2 +- src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp | 2 +- src/apps/NODAL_ACCUMULATION_3D-Seq.cpp | 2 +- src/apps/NODAL_ACCUMULATION_3D.cpp | 2 +- src/apps/NODAL_ACCUMULATION_3D.hpp | 2 +- src/apps/PRESSURE-Cuda.cpp | 2 +- src/apps/PRESSURE-Hip.cpp | 2 +- src/apps/PRESSURE-OMP.cpp | 2 +- src/apps/PRESSURE-OMPTarget.cpp | 2 +- src/apps/PRESSURE-Seq.cpp | 2 +- src/apps/PRESSURE.cpp | 2 +- src/apps/PRESSURE.hpp | 2 +- src/apps/VOL3D-Cuda.cpp | 2 +- src/apps/VOL3D-Hip.cpp | 2 +- src/apps/VOL3D-OMP.cpp | 2 +- src/apps/VOL3D-OMPTarget.cpp | 2 +- src/apps/VOL3D-Seq.cpp | 2 +- src/apps/VOL3D.cpp | 2 +- src/apps/VOL3D.hpp | 
2 +- src/apps/WIP-COUPLE.cpp | 2 +- src/apps/WIP-COUPLE.hpp | 2 +- src/basic/CMakeLists.txt | 2 +- src/basic/DAXPY-Cuda.cpp | 2 +- src/basic/DAXPY-Hip.cpp | 2 +- src/basic/DAXPY-OMP.cpp | 2 +- src/basic/DAXPY-OMPTarget.cpp | 2 +- src/basic/DAXPY-Seq.cpp | 2 +- src/basic/DAXPY.cpp | 2 +- src/basic/DAXPY.hpp | 2 +- src/basic/DAXPY_ATOMIC-Cuda.cpp | 2 +- src/basic/DAXPY_ATOMIC-Hip.cpp | 2 +- src/basic/DAXPY_ATOMIC-OMP.cpp | 2 +- src/basic/DAXPY_ATOMIC-OMPTarget.cpp | 2 +- src/basic/DAXPY_ATOMIC-Seq.cpp | 2 +- src/basic/DAXPY_ATOMIC.cpp | 2 +- src/basic/DAXPY_ATOMIC.hpp | 2 +- src/basic/IF_QUAD-Cuda.cpp | 2 +- src/basic/IF_QUAD-Hip.cpp | 2 +- src/basic/IF_QUAD-OMP.cpp | 2 +- src/basic/IF_QUAD-OMPTarget.cpp | 2 +- src/basic/IF_QUAD-Seq.cpp | 2 +- src/basic/IF_QUAD.cpp | 2 +- src/basic/IF_QUAD.hpp | 2 +- src/basic/INIT3-Cuda.cpp | 2 +- src/basic/INIT3-Hip.cpp | 2 +- src/basic/INIT3-OMP.cpp | 2 +- src/basic/INIT3-OMPTarget.cpp | 2 +- src/basic/INIT3-Seq.cpp | 2 +- src/basic/INIT3.cpp | 2 +- src/basic/INIT3.hpp | 2 +- src/basic/INIT_VIEW1D-Cuda.cpp | 2 +- src/basic/INIT_VIEW1D-Hip.cpp | 2 +- src/basic/INIT_VIEW1D-OMP.cpp | 2 +- src/basic/INIT_VIEW1D-OMPTarget.cpp | 2 +- src/basic/INIT_VIEW1D-Seq.cpp | 2 +- src/basic/INIT_VIEW1D.cpp | 2 +- src/basic/INIT_VIEW1D.hpp | 2 +- src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET-Hip.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET-OMP.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET-Seq.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET.hpp | 2 +- src/basic/MULADDSUB-Cuda.cpp | 2 +- src/basic/MULADDSUB-Hip.cpp | 2 +- src/basic/MULADDSUB-OMP.cpp | 2 +- src/basic/MULADDSUB-OMPTarget.cpp | 2 +- src/basic/MULADDSUB-Seq.cpp | 2 +- src/basic/MULADDSUB.cpp | 2 +- src/basic/MULADDSUB.hpp | 2 +- src/basic/NESTED_INIT-Cuda.cpp | 2 +- src/basic/NESTED_INIT-Hip.cpp | 2 +- src/basic/NESTED_INIT-OMP.cpp | 2 +- src/basic/NESTED_INIT-OMPTarget.cpp | 2 +- src/basic/NESTED_INIT-Seq.cpp | 2 +- src/basic/NESTED_INIT.cpp | 2 +- src/basic/NESTED_INIT.hpp | 2 +- src/basic/PI_ATOMIC-Cuda.cpp | 2 +- src/basic/PI_ATOMIC-Hip.cpp | 2 +- src/basic/PI_ATOMIC-OMP.cpp | 2 +- src/basic/PI_ATOMIC-OMPTarget.cpp | 2 +- src/basic/PI_ATOMIC-Seq.cpp | 2 +- src/basic/PI_ATOMIC.cpp | 2 +- src/basic/PI_ATOMIC.hpp | 2 +- src/basic/PI_REDUCE-Cuda.cpp | 2 +- src/basic/PI_REDUCE-Hip.cpp | 2 +- src/basic/PI_REDUCE-OMP.cpp | 2 +- src/basic/PI_REDUCE-OMPTarget.cpp | 2 +- src/basic/PI_REDUCE-Seq.cpp | 2 +- src/basic/PI_REDUCE.cpp | 2 +- src/basic/PI_REDUCE.hpp | 2 +- src/basic/REDUCE3_INT-Cuda.cpp | 2 +- src/basic/REDUCE3_INT-Hip.cpp | 2 +- src/basic/REDUCE3_INT-OMP.cpp | 2 +- src/basic/REDUCE3_INT-OMPTarget.cpp | 2 +- src/basic/REDUCE3_INT-Seq.cpp | 2 +- src/basic/REDUCE3_INT.cpp | 2 +- src/basic/REDUCE3_INT.hpp | 2 +- src/basic/TRAP_INT-Cuda.cpp | 2 +- src/basic/TRAP_INT-Hip.cpp | 2 +- src/basic/TRAP_INT-OMP.cpp | 2 +- src/basic/TRAP_INT-OMPTarget.cpp | 2 +- src/basic/TRAP_INT-Seq.cpp | 2 +- src/basic/TRAP_INT.cpp | 2 +- src/basic/TRAP_INT.hpp | 2 +- src/common/CMakeLists.txt | 2 +- src/common/CudaDataUtils.hpp | 2 +- src/common/DataUtils.cpp | 2 +- src/common/DataUtils.hpp | 2 +- src/common/Executor.cpp | 2 +- src/common/Executor.hpp | 2 +- src/common/HipDataUtils.hpp | 2 +- src/common/KernelBase.cpp | 2 +- src/common/KernelBase.hpp | 2 +- src/common/OpenMPTargetDataUtils.hpp | 2 +- src/common/OutputUtils.cpp | 2 +- src/common/OutputUtils.hpp | 2 +- src/common/RAJAPerfSuite.cpp | 2 +- src/common/RAJAPerfSuite.hpp | 2 
+- src/common/RPTypes.hpp | 2 +- src/common/RunParams.cpp | 2 +- src/common/RunParams.hpp | 2 +- src/lcals/CMakeLists.txt | 2 +- src/lcals/DIFF_PREDICT-Cuda.cpp | 2 +- src/lcals/DIFF_PREDICT-Hip.cpp | 2 +- src/lcals/DIFF_PREDICT-OMP.cpp | 2 +- src/lcals/DIFF_PREDICT-OMPTarget.cpp | 2 +- src/lcals/DIFF_PREDICT-Seq.cpp | 2 +- src/lcals/DIFF_PREDICT.cpp | 2 +- src/lcals/DIFF_PREDICT.hpp | 2 +- src/lcals/EOS-Cuda.cpp | 2 +- src/lcals/EOS-Hip.cpp | 2 +- src/lcals/EOS-OMP.cpp | 2 +- src/lcals/EOS-OMPTarget.cpp | 2 +- src/lcals/EOS-Seq.cpp | 2 +- src/lcals/EOS.cpp | 2 +- src/lcals/EOS.hpp | 2 +- src/lcals/FIRST_DIFF-Cuda.cpp | 2 +- src/lcals/FIRST_DIFF-Hip.cpp | 2 +- src/lcals/FIRST_DIFF-OMP.cpp | 2 +- src/lcals/FIRST_DIFF-OMPTarget.cpp | 2 +- src/lcals/FIRST_DIFF-Seq.cpp | 2 +- src/lcals/FIRST_DIFF.cpp | 2 +- src/lcals/FIRST_DIFF.hpp | 2 +- src/lcals/FIRST_MIN-Cuda.cpp | 2 +- src/lcals/FIRST_MIN-Hip.cpp | 2 +- src/lcals/FIRST_MIN-OMP.cpp | 2 +- src/lcals/FIRST_MIN-OMPTarget.cpp | 2 +- src/lcals/FIRST_MIN-Seq.cpp | 2 +- src/lcals/FIRST_MIN.cpp | 2 +- src/lcals/FIRST_MIN.hpp | 2 +- src/lcals/FIRST_SUM-Cuda.cpp | 2 +- src/lcals/FIRST_SUM-Hip.cpp | 2 +- src/lcals/FIRST_SUM-OMP.cpp | 2 +- src/lcals/FIRST_SUM-OMPTarget.cpp | 2 +- src/lcals/FIRST_SUM-Seq.cpp | 2 +- src/lcals/FIRST_SUM.cpp | 2 +- src/lcals/FIRST_SUM.hpp | 2 +- src/lcals/GEN_LIN_RECUR-Cuda.cpp | 2 +- src/lcals/GEN_LIN_RECUR-Hip.cpp | 2 +- src/lcals/GEN_LIN_RECUR-OMP.cpp | 2 +- src/lcals/GEN_LIN_RECUR-OMPTarget.cpp | 2 +- src/lcals/GEN_LIN_RECUR-Seq.cpp | 2 +- src/lcals/GEN_LIN_RECUR.cpp | 2 +- src/lcals/GEN_LIN_RECUR.hpp | 2 +- src/lcals/HYDRO_1D-Cuda.cpp | 2 +- src/lcals/HYDRO_1D-Hip.cpp | 2 +- src/lcals/HYDRO_1D-OMP.cpp | 2 +- src/lcals/HYDRO_1D-OMPTarget.cpp | 2 +- src/lcals/HYDRO_1D-Seq.cpp | 2 +- src/lcals/HYDRO_1D.cpp | 2 +- src/lcals/HYDRO_1D.hpp | 2 +- src/lcals/HYDRO_2D-Cuda.cpp | 2 +- src/lcals/HYDRO_2D-Hip.cpp | 2 +- src/lcals/HYDRO_2D-OMP.cpp | 2 +- src/lcals/HYDRO_2D-OMPTarget.cpp | 2 +- src/lcals/HYDRO_2D-Seq.cpp | 2 +- src/lcals/HYDRO_2D.cpp | 2 +- src/lcals/HYDRO_2D.hpp | 2 +- src/lcals/INT_PREDICT-Cuda.cpp | 2 +- src/lcals/INT_PREDICT-Hip.cpp | 2 +- src/lcals/INT_PREDICT-OMP.cpp | 2 +- src/lcals/INT_PREDICT-OMPTarget.cpp | 2 +- src/lcals/INT_PREDICT-Seq.cpp | 2 +- src/lcals/INT_PREDICT.cpp | 2 +- src/lcals/INT_PREDICT.hpp | 2 +- src/lcals/PLANCKIAN-Cuda.cpp | 2 +- src/lcals/PLANCKIAN-Hip.cpp | 2 +- src/lcals/PLANCKIAN-OMP.cpp | 2 +- src/lcals/PLANCKIAN-OMPTarget.cpp | 2 +- src/lcals/PLANCKIAN-Seq.cpp | 2 +- src/lcals/PLANCKIAN.cpp | 2 +- src/lcals/PLANCKIAN.hpp | 2 +- src/lcals/TRIDIAG_ELIM-Cuda.cpp | 2 +- src/lcals/TRIDIAG_ELIM-Hip.cpp | 2 +- src/lcals/TRIDIAG_ELIM-OMP.cpp | 2 +- src/lcals/TRIDIAG_ELIM-OMPTarget.cpp | 2 +- src/lcals/TRIDIAG_ELIM-Seq.cpp | 2 +- src/lcals/TRIDIAG_ELIM.cpp | 2 +- src/lcals/TRIDIAG_ELIM.hpp | 2 +- src/polybench/CMakeLists.txt | 2 +- src/polybench/POLYBENCH_2MM-Cuda.cpp | 2 +- src/polybench/POLYBENCH_2MM-Hip.cpp | 2 +- src/polybench/POLYBENCH_2MM-OMP.cpp | 2 +- src/polybench/POLYBENCH_2MM-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_2MM-Seq.cpp | 2 +- src/polybench/POLYBENCH_2MM.cpp | 2 +- src/polybench/POLYBENCH_2MM.hpp | 2 +- src/polybench/POLYBENCH_3MM-Cuda.cpp | 2 +- src/polybench/POLYBENCH_3MM-Hip.cpp | 2 +- src/polybench/POLYBENCH_3MM-OMP.cpp | 2 +- src/polybench/POLYBENCH_3MM-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_3MM-Seq.cpp | 2 +- src/polybench/POLYBENCH_3MM.cpp | 2 +- src/polybench/POLYBENCH_3MM.hpp | 2 +- src/polybench/POLYBENCH_ADI-Cuda.cpp | 2 +- 
src/polybench/POLYBENCH_ADI-Hip.cpp | 2 +- src/polybench/POLYBENCH_ADI-OMP.cpp | 2 +- src/polybench/POLYBENCH_ADI-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_ADI-Seq.cpp | 2 +- src/polybench/POLYBENCH_ADI.cpp | 2 +- src/polybench/POLYBENCH_ADI.hpp | 2 +- src/polybench/POLYBENCH_ATAX-Cuda.cpp | 2 +- src/polybench/POLYBENCH_ATAX-Hip.cpp | 2 +- src/polybench/POLYBENCH_ATAX-OMP.cpp | 2 +- src/polybench/POLYBENCH_ATAX-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_ATAX-Seq.cpp | 2 +- src/polybench/POLYBENCH_ATAX.cpp | 2 +- src/polybench/POLYBENCH_ATAX.hpp | 2 +- src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp | 2 +- src/polybench/POLYBENCH_FDTD_2D-Hip.cpp | 2 +- src/polybench/POLYBENCH_FDTD_2D-OMP.cpp | 2 +- src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_FDTD_2D-Seq.cpp | 2 +- src/polybench/POLYBENCH_FDTD_2D.cpp | 2 +- src/polybench/POLYBENCH_FDTD_2D.hpp | 2 +- src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp | 2 +- src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp | 2 +- src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp | 2 +- src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp | 2 +- src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp | 2 +- src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp | 2 +- src/polybench/POLYBENCH_GEMM-Cuda.cpp | 2 +- src/polybench/POLYBENCH_GEMM-Hip.cpp | 2 +- src/polybench/POLYBENCH_GEMM-OMP.cpp | 2 +- src/polybench/POLYBENCH_GEMM-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_GEMM-Seq.cpp | 2 +- src/polybench/POLYBENCH_GEMM.cpp | 2 +- src/polybench/POLYBENCH_GEMM.hpp | 2 +- src/polybench/POLYBENCH_GEMVER-Cuda.cpp | 2 +- src/polybench/POLYBENCH_GEMVER-Hip.cpp | 2 +- src/polybench/POLYBENCH_GEMVER-OMP.cpp | 2 +- src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_GEMVER-Seq.cpp | 2 +- src/polybench/POLYBENCH_GEMVER.cpp | 2 +- src/polybench/POLYBENCH_GEMVER.hpp | 2 +- src/polybench/POLYBENCH_GESUMMV-Cuda.cpp | 2 +- src/polybench/POLYBENCH_GESUMMV-Hip.cpp | 2 +- src/polybench/POLYBENCH_GESUMMV-OMP.cpp | 2 +- src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_GESUMMV-Seq.cpp | 2 +- src/polybench/POLYBENCH_GESUMMV.cpp | 2 +- src/polybench/POLYBENCH_GESUMMV.hpp | 2 +- src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp | 2 +- src/polybench/POLYBENCH_HEAT_3D-Hip.cpp | 2 +- src/polybench/POLYBENCH_HEAT_3D-OMP.cpp | 2 +- src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_HEAT_3D-Seq.cpp | 2 +- src/polybench/POLYBENCH_HEAT_3D.cpp | 2 +- src/polybench/POLYBENCH_HEAT_3D.hpp | 2 +- src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_1D.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_1D.hpp | 2 +- src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_2D.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_2D.hpp | 2 +- src/polybench/POLYBENCH_MVT-Cuda.cpp | 2 +- src/polybench/POLYBENCH_MVT-Hip.cpp | 2 +- src/polybench/POLYBENCH_MVT-OMP.cpp | 2 +- src/polybench/POLYBENCH_MVT-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_MVT-Seq.cpp | 2 +- src/polybench/POLYBENCH_MVT.cpp | 2 +- src/polybench/POLYBENCH_MVT.hpp | 2 +- 
src/rajaperf_config.hpp.in | 2 +- src/stream/ADD-Cuda.cpp | 2 +- src/stream/ADD-Hip.cpp | 2 +- src/stream/ADD-OMP.cpp | 2 +- src/stream/ADD-OMPTarget.cpp | 2 +- src/stream/ADD-Seq.cpp | 2 +- src/stream/ADD.cpp | 2 +- src/stream/ADD.hpp | 2 +- src/stream/CMakeLists.txt | 2 +- src/stream/COPY-Cuda.cpp | 2 +- src/stream/COPY-Hip.cpp | 2 +- src/stream/COPY-OMP.cpp | 2 +- src/stream/COPY-OMPTarget.cpp | 2 +- src/stream/COPY-Seq.cpp | 2 +- src/stream/COPY.cpp | 2 +- src/stream/COPY.hpp | 2 +- src/stream/DOT-Cuda.cpp | 2 +- src/stream/DOT-Hip.cpp | 2 +- src/stream/DOT-OMP.cpp | 2 +- src/stream/DOT-OMPTarget.cpp | 2 +- src/stream/DOT-Seq.cpp | 2 +- src/stream/DOT.cpp | 2 +- src/stream/DOT.hpp | 2 +- src/stream/MUL-Cuda.cpp | 2 +- src/stream/MUL-Hip.cpp | 2 +- src/stream/MUL-OMP.cpp | 2 +- src/stream/MUL-OMPTarget.cpp | 2 +- src/stream/MUL-Seq.cpp | 2 +- src/stream/MUL.cpp | 2 +- src/stream/MUL.hpp | 2 +- src/stream/TRIAD-Cuda.cpp | 2 +- src/stream/TRIAD-Hip.cpp | 2 +- src/stream/TRIAD-OMP.cpp | 2 +- src/stream/TRIAD-OMPTarget.cpp | 2 +- src/stream/TRIAD-Seq.cpp | 2 +- src/stream/TRIAD.cpp | 2 +- src/stream/TRIAD.hpp | 2 +- 438 files changed, 443 insertions(+), 443 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 125c46faf..b6d19eaa4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/LICENSE b/LICENSE index f08c6273a..8e4df6528 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2017-2021, Lawrence Livermore National Security, LLC. +Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC. All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/README.md b/README.md index e0ac95b61..161e4c77b 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ [comment]: # (#################################################################) -[comment]: # (Copyright 2017-2021, Lawrence Livermore National Security, LLC) +[comment]: # (Copyright 2017-2022, Lawrence Livermore National Security, LLC) [comment]: # (and RAJA Performance Suite project contributors.) -[comment]: # (See the RAJA/LICENSE file for details.) +[comment]: # (See the RAJAPerf/LICENSE file for details.) [comment]: # [comment]: # (# SPDX-License-Identifier: BSD-3-Clause) [comment]: # (#################################################################) @@ -452,7 +452,7 @@ Here is what a header file for the FOO kernel object should look like: ```cpp //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/RELEASE b/RELEASE index 69efe8772..9096758f6 100644 --- a/RELEASE +++ b/RELEASE @@ -2,7 +2,7 @@ RAJA Performance Suite: ................................, version 0.11.0 -Copyright (c) 2017-2021, Lawrence Livermore National Security, LLC. +Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory. All rights reserved. See details in the RAJAPerf/LICENSE file. 
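The helpers added in PATCH 260/392 (RAJAPERF_UNUSED_VAR, RAJAPERF_UNUSED_ARG) and the compile-time seq_for loop kept by PATCH 261/392 are easiest to read alongside a small standalone sketch. The sketch below is illustrative only, not code from the suite: std::integer_sequence stands in for camp::int_seq so it compiles without camp, and the block_sizes list and printed message are made-up stand-ins.

```cpp
#include <iostream>
#include <type_traits>
#include <utility>

// Squash "unused variable" warnings without a pragma
// (added to rajaperf_config.hpp.in by PATCH 260/392).
template < typename ... Ts >
inline void RAJAPERF_UNUSED_VAR(Ts&&...) { }

// Swallow an argument list entirely at preprocessing time (also PATCH 260/392).
#define RAJAPERF_UNUSED_ARG(...)

// Compile-time loop over an integer sequence, mirroring seq_for in
// common/GPUUtils.hpp after PATCH 261/392; std::integer_sequence is used
// here only so the sketch has no camp dependency.
template < int ... Is, typename Func >
inline void seq_for(std::integer_sequence<int, Is...> const&, Func&& func)
{
  // Braced init lists are evaluated in order, so func sees each value in sequence.
  int seq_unused_array[] = {(func(std::integral_constant<int, Is>{}), 0)...};
  RAJAPERF_UNUSED_VAR(seq_unused_array);
}

int main()
{
  // Hypothetical block-size list; the real suite builds its list from a
  // CMake-configured set of GPU block sizes.
  using block_sizes = std::integer_sequence<int, 128, 256, 512>;

  seq_for(block_sizes{}, [](auto block_size) {
    // block_size is an integral_constant, so its value is usable as a
    // compile-time constant (e.g. a kernel template argument).
    constexpr int bs = decltype(block_size)::value;
    std::cout << "would instantiate variant with block_size = " << bs << "\n";
  });

  return 0;
}
```

Evaluating the parameter pack inside a braced initializer guarantees left-to-right order, and handing the throwaway array to RAJAPERF_UNUSED_VAR keeps -Wunused-variable quiet without compiler-specific pragmas, which is the point of the two warning squashers introduced above.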
diff --git a/scripts/install_llvm.sh b/scripts/install_llvm.sh index 6d1197004..60bfccd39 100755 --- a/scripts/install_llvm.sh +++ b/scripts/install_llvm.sh @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/scripts/lc-builds/blueos_clang.sh b/scripts/lc-builds/blueos_clang.sh index 932108c62..8e071efb8 100755 --- a/scripts/lc-builds/blueos_clang.sh +++ b/scripts/lc-builds/blueos_clang.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/blueos_clang_omptarget.sh b/scripts/lc-builds/blueos_clang_omptarget.sh index cfce68a7f..e182afd54 100755 --- a/scripts/lc-builds/blueos_clang_omptarget.sh +++ b/scripts/lc-builds/blueos_clang_omptarget.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/blueos_gcc.sh b/scripts/lc-builds/blueos_gcc.sh index f3cfbcc94..e0316fe3e 100755 --- a/scripts/lc-builds/blueos_gcc.sh +++ b/scripts/lc-builds/blueos_gcc.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/blueos_nvcc_clang.sh b/scripts/lc-builds/blueos_nvcc_clang.sh index 00d046afa..a9bd4bea1 100755 --- a/scripts/lc-builds/blueos_nvcc_clang.sh +++ b/scripts/lc-builds/blueos_nvcc_clang.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/blueos_nvcc_gcc.sh b/scripts/lc-builds/blueos_nvcc_gcc.sh index 407ccf88b..f29df3506 100755 --- a/scripts/lc-builds/blueos_nvcc_gcc.sh +++ b/scripts/lc-builds/blueos_nvcc_gcc.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. 
# # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/blueos_nvcc_xl.sh b/scripts/lc-builds/blueos_nvcc_xl.sh index 9dc80a283..0f088e38a 100755 --- a/scripts/lc-builds/blueos_nvcc_xl.sh +++ b/scripts/lc-builds/blueos_nvcc_xl.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/blueos_pgi.sh b/scripts/lc-builds/blueos_pgi.sh index a49a546c9..ee1c708be 100755 --- a/scripts/lc-builds/blueos_pgi.sh +++ b/scripts/lc-builds/blueos_pgi.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh b/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh index 33b173b31..516c5403d 100755 --- a/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh +++ b/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/blueos_xl.sh b/scripts/lc-builds/blueos_xl.sh index 6cb6b188d..3f2131871 100755 --- a/scripts/lc-builds/blueos_xl.sh +++ b/scripts/lc-builds/blueos_xl.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/blueos_xl_omptarget.sh b/scripts/lc-builds/blueos_xl_omptarget.sh index 1e54df0be..3a06872f9 100755 --- a/scripts/lc-builds/blueos_xl_omptarget.sh +++ b/scripts/lc-builds/blueos_xl_omptarget.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/toss3_clang.sh b/scripts/lc-builds/toss3_clang.sh index d00d63217..ce140f0ce 100755 --- a/scripts/lc-builds/toss3_clang.sh +++ b/scripts/lc-builds/toss3_clang.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. 
# # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/toss3_gcc.sh b/scripts/lc-builds/toss3_gcc.sh index 2851ea4af..2ab5a987d 100755 --- a/scripts/lc-builds/toss3_gcc.sh +++ b/scripts/lc-builds/toss3_gcc.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/toss3_hipcc.sh b/scripts/lc-builds/toss3_hipcc.sh index e515804ea..c1cca3269 100755 --- a/scripts/lc-builds/toss3_hipcc.sh +++ b/scripts/lc-builds/toss3_hipcc.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/toss3_icpc.sh b/scripts/lc-builds/toss3_icpc.sh index 47ea2c846..225b7d4ab 100755 --- a/scripts/lc-builds/toss3_icpc.sh +++ b/scripts/lc-builds/toss3_icpc.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/make_release_tarball.sh b/scripts/make_release_tarball.sh index cbd4b4fa7..6b0a3b804 100755 --- a/scripts/make_release_tarball.sh +++ b/scripts/make_release_tarball.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/scripts/travis_build_and_test.sh b/scripts/travis_build_and_test.sh index 290cdff13..d53c88550 100755 --- a/scripts/travis_build_and_test.sh +++ b/scripts/travis_build_and_test.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/scripts/ubuntu-builds/ubuntu_clang.sh b/scripts/ubuntu-builds/ubuntu_clang.sh index 7001ddd39..664685f42 100755 --- a/scripts/ubuntu-builds/ubuntu_clang.sh +++ b/scripts/ubuntu-builds/ubuntu_clang.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. 
# # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/ubuntu-builds/ubuntu_gcc.sh b/scripts/ubuntu-builds/ubuntu_gcc.sh index 8f7fc8a16..f0eb7fcf7 100755 --- a/scripts/ubuntu-builds/ubuntu_gcc.sh +++ b/scripts/ubuntu-builds/ubuntu_gcc.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/update_copyright.sh b/scripts/update_copyright.sh index 422f73ea7..8f462663d 100755 --- a/scripts/update_copyright.sh +++ b/scripts/update_copyright.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # @@ -37,7 +37,7 @@ #============================================================================= # First find all the files we want to modify #============================================================================= -find . -type f ! -name \*.git\* ! -name \*update_copyright\* -exec grep -l "the RAJAPerf/COPYRIGHT file" {} \; > files2change +find . -type f ! -name \*.git\* ! -name \*update_copyright\* -exec grep -l "the RAJAPerf/LICENSE file" {} \; > files2change #============================================================================= # Replace the old copyright dates with new dates @@ -46,14 +46,14 @@ for i in `cat files2change` do echo $i cp $i $i.sed.bak - sed "s/Copyright (c) 2017-20/Copyright (c) 2017-21/" $i.sed.bak > $i + sed "s/Copyright (c) 2017-21/Copyright (c) 2017-22/" $i.sed.bak > $i done for i in LICENSE RELEASE README.md do echo $i cp $i $i.sed.bak - sed "s/2017-2020/2017-2021/" $i.sed.bak > $i + sed "s/2017-2021/2017-2022/" $i.sed.bak > $i done #============================================================================= diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 74b945bbb..25306e9fe 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/src/RAJAPerfSuiteDriver.cpp b/src/RAJAPerfSuiteDriver.cpp index b79670a4e..d423dcff9 100644 --- a/src/RAJAPerfSuiteDriver.cpp +++ b/src/RAJAPerfSuiteDriver.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/algorithm/CMakeLists.txt b/src/algorithm/CMakeLists.txt index d328a5902..07848730c 100644 --- a/src/algorithm/CMakeLists.txt +++ b/src/algorithm/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/src/algorithm/SORT-Cuda.cpp b/src/algorithm/SORT-Cuda.cpp index 5870e08b5..bd5dd3692 100644 --- a/src/algorithm/SORT-Cuda.cpp +++ b/src/algorithm/SORT-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/SORT-Hip.cpp b/src/algorithm/SORT-Hip.cpp index edf143cbc..440cdfa7e 100644 --- a/src/algorithm/SORT-Hip.cpp +++ b/src/algorithm/SORT-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/SORT-OMP.cpp b/src/algorithm/SORT-OMP.cpp index a83d956ed..ca1344d04 100644 --- a/src/algorithm/SORT-OMP.cpp +++ b/src/algorithm/SORT-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/SORT-Seq.cpp b/src/algorithm/SORT-Seq.cpp index 4f7094ba6..a88db4e3c 100644 --- a/src/algorithm/SORT-Seq.cpp +++ b/src/algorithm/SORT-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/SORT.cpp b/src/algorithm/SORT.cpp index d9d659482..4abc41551 100644 --- a/src/algorithm/SORT.cpp +++ b/src/algorithm/SORT.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/SORT.hpp b/src/algorithm/SORT.hpp index b266f0d9e..ed85765f2 100644 --- a/src/algorithm/SORT.hpp +++ b/src/algorithm/SORT.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/algorithm/SORTPAIRS-Cuda.cpp b/src/algorithm/SORTPAIRS-Cuda.cpp index aba1111dc..bb2c5a8fa 100644 --- a/src/algorithm/SORTPAIRS-Cuda.cpp +++ b/src/algorithm/SORTPAIRS-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/SORTPAIRS-Hip.cpp b/src/algorithm/SORTPAIRS-Hip.cpp index 0850ce650..d7bb95902 100644 --- a/src/algorithm/SORTPAIRS-Hip.cpp +++ b/src/algorithm/SORTPAIRS-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/SORTPAIRS-OMP.cpp b/src/algorithm/SORTPAIRS-OMP.cpp index 99a432931..79f9cc849 100644 --- a/src/algorithm/SORTPAIRS-OMP.cpp +++ b/src/algorithm/SORTPAIRS-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/SORTPAIRS-Seq.cpp b/src/algorithm/SORTPAIRS-Seq.cpp index f1f9928b6..eeb95b036 100644 --- a/src/algorithm/SORTPAIRS-Seq.cpp +++ b/src/algorithm/SORTPAIRS-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/SORTPAIRS.cpp b/src/algorithm/SORTPAIRS.cpp index 7f2e59cbb..06f702f3d 100644 --- a/src/algorithm/SORTPAIRS.cpp +++ b/src/algorithm/SORTPAIRS.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/SORTPAIRS.hpp b/src/algorithm/SORTPAIRS.hpp index 348b548bd..5519da955 100644 --- a/src/algorithm/SORTPAIRS.hpp +++ b/src/algorithm/SORTPAIRS.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/AppsData.cpp b/src/apps/AppsData.cpp index 26ccdb7d1..390412aa3 100644 --- a/src/apps/AppsData.cpp +++ b/src/apps/AppsData.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
// See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/AppsData.hpp b/src/apps/AppsData.hpp index f1616968c..f9b9251e9 100644 --- a/src/apps/AppsData.hpp +++ b/src/apps/AppsData.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/CMakeLists.txt b/src/apps/CMakeLists.txt index 72cc5b9d4..cf1ed84ea 100644 --- a/src/apps/CMakeLists.txt +++ b/src/apps/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/src/apps/DEL_DOT_VEC_2D-Cuda.cpp b/src/apps/DEL_DOT_VEC_2D-Cuda.cpp index 894b03bdb..2ebce5b23 100644 --- a/src/apps/DEL_DOT_VEC_2D-Cuda.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/DEL_DOT_VEC_2D-Hip.cpp b/src/apps/DEL_DOT_VEC_2D-Hip.cpp index 25a290abf..56ed6e927 100644 --- a/src/apps/DEL_DOT_VEC_2D-Hip.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/DEL_DOT_VEC_2D-OMP.cpp b/src/apps/DEL_DOT_VEC_2D-OMP.cpp index 4232646ad..b3a4dede7 100644 --- a/src/apps/DEL_DOT_VEC_2D-OMP.cpp +++ b/src/apps/DEL_DOT_VEC_2D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp b/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp index 9be35bbc6..814b2663d 100644 --- a/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp +++ b/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/apps/DEL_DOT_VEC_2D-Seq.cpp b/src/apps/DEL_DOT_VEC_2D-Seq.cpp index 208add00b..cc6be401a 100644 --- a/src/apps/DEL_DOT_VEC_2D-Seq.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/DEL_DOT_VEC_2D.cpp b/src/apps/DEL_DOT_VEC_2D.cpp index cec0af410..bed89ee65 100644 --- a/src/apps/DEL_DOT_VEC_2D.cpp +++ b/src/apps/DEL_DOT_VEC_2D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/DEL_DOT_VEC_2D.hpp b/src/apps/DEL_DOT_VEC_2D.hpp index 1a4d7670b..1071ffc49 100644 --- a/src/apps/DEL_DOT_VEC_2D.hpp +++ b/src/apps/DEL_DOT_VEC_2D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/DIFFUSION3DPA-Cuda.cpp b/src/apps/DIFFUSION3DPA-Cuda.cpp index 6ef84bfec..35722eae0 100644 --- a/src/apps/DIFFUSION3DPA-Cuda.cpp +++ b/src/apps/DIFFUSION3DPA-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/DIFFUSION3DPA-Hip.cpp b/src/apps/DIFFUSION3DPA-Hip.cpp index 221695dd5..44c18a542 100644 --- a/src/apps/DIFFUSION3DPA-Hip.cpp +++ b/src/apps/DIFFUSION3DPA-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/DIFFUSION3DPA-OMP.cpp b/src/apps/DIFFUSION3DPA-OMP.cpp index 29ff7b108..bd8ebe368 100644 --- a/src/apps/DIFFUSION3DPA-OMP.cpp +++ b/src/apps/DIFFUSION3DPA-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/apps/DIFFUSION3DPA-OMPTarget.cpp b/src/apps/DIFFUSION3DPA-OMPTarget.cpp index 8d3368002..b982124eb 100644 --- a/src/apps/DIFFUSION3DPA-OMPTarget.cpp +++ b/src/apps/DIFFUSION3DPA-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/DIFFUSION3DPA-Seq.cpp b/src/apps/DIFFUSION3DPA-Seq.cpp index 159f7810b..b49fa7e2b 100644 --- a/src/apps/DIFFUSION3DPA-Seq.cpp +++ b/src/apps/DIFFUSION3DPA-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/DIFFUSION3DPA.cpp b/src/apps/DIFFUSION3DPA.cpp index baf892576..a7ca046b9 100644 --- a/src/apps/DIFFUSION3DPA.cpp +++ b/src/apps/DIFFUSION3DPA.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/DIFFUSION3DPA.hpp b/src/apps/DIFFUSION3DPA.hpp index 80d034195..bf73769da 100644 --- a/src/apps/DIFFUSION3DPA.hpp +++ b/src/apps/DIFFUSION3DPA.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/ENERGY-Cuda.cpp b/src/apps/ENERGY-Cuda.cpp index e3ccd6135..b03aca58f 100644 --- a/src/apps/ENERGY-Cuda.cpp +++ b/src/apps/ENERGY-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/ENERGY-Hip.cpp b/src/apps/ENERGY-Hip.cpp index 560ae6418..3469a7d29 100644 --- a/src/apps/ENERGY-Hip.cpp +++ b/src/apps/ENERGY-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/ENERGY-OMP.cpp b/src/apps/ENERGY-OMP.cpp index 531da1e18..23d35e646 100644 --- a/src/apps/ENERGY-OMP.cpp +++ b/src/apps/ENERGY-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/apps/ENERGY-OMPTarget.cpp b/src/apps/ENERGY-OMPTarget.cpp index a9b709ddd..76ef6f055 100644 --- a/src/apps/ENERGY-OMPTarget.cpp +++ b/src/apps/ENERGY-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/ENERGY-Seq.cpp b/src/apps/ENERGY-Seq.cpp index 7f13c9805..519b37b3e 100644 --- a/src/apps/ENERGY-Seq.cpp +++ b/src/apps/ENERGY-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/ENERGY.cpp b/src/apps/ENERGY.cpp index a6a779f8c..3d3b24030 100644 --- a/src/apps/ENERGY.cpp +++ b/src/apps/ENERGY.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/ENERGY.hpp b/src/apps/ENERGY.hpp index 00a45de1d..b1e53dbb1 100644 --- a/src/apps/ENERGY.hpp +++ b/src/apps/ENERGY.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/FEM_MACROS.hpp b/src/apps/FEM_MACROS.hpp index f94a6e1a9..db98f5ff3 100644 --- a/src/apps/FEM_MACROS.hpp +++ b/src/apps/FEM_MACROS.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/FIR-Cuda.cpp b/src/apps/FIR-Cuda.cpp index c6a8da156..afc77dece 100644 --- a/src/apps/FIR-Cuda.cpp +++ b/src/apps/FIR-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/FIR-Hip.cpp b/src/apps/FIR-Hip.cpp index fc9c7bb94..203d94274 100644 --- a/src/apps/FIR-Hip.cpp +++ b/src/apps/FIR-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/apps/FIR-OMP.cpp b/src/apps/FIR-OMP.cpp index 8f011b920..30a3bd965 100644 --- a/src/apps/FIR-OMP.cpp +++ b/src/apps/FIR-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/FIR-OMPTarget.cpp b/src/apps/FIR-OMPTarget.cpp index 0306d8378..ec841b085 100644 --- a/src/apps/FIR-OMPTarget.cpp +++ b/src/apps/FIR-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/FIR-Seq.cpp b/src/apps/FIR-Seq.cpp index 27d2789ad..51b70218b 100644 --- a/src/apps/FIR-Seq.cpp +++ b/src/apps/FIR-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/FIR.cpp b/src/apps/FIR.cpp index fe3993cd9..e76313414 100644 --- a/src/apps/FIR.cpp +++ b/src/apps/FIR.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/FIR.hpp b/src/apps/FIR.hpp index e9b49edcb..c8a0d4c6c 100644 --- a/src/apps/FIR.hpp +++ b/src/apps/FIR.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/HALOEXCHANGE-Cuda.cpp b/src/apps/HALOEXCHANGE-Cuda.cpp index e093b57b7..f6a9750ae 100644 --- a/src/apps/HALOEXCHANGE-Cuda.cpp +++ b/src/apps/HALOEXCHANGE-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/HALOEXCHANGE-Hip.cpp b/src/apps/HALOEXCHANGE-Hip.cpp index a6e1b1ef7..0c0edb449 100644 --- a/src/apps/HALOEXCHANGE-Hip.cpp +++ b/src/apps/HALOEXCHANGE-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/apps/HALOEXCHANGE-OMP.cpp b/src/apps/HALOEXCHANGE-OMP.cpp index 7cac3ca3c..0dac55a06 100644 --- a/src/apps/HALOEXCHANGE-OMP.cpp +++ b/src/apps/HALOEXCHANGE-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/HALOEXCHANGE-OMPTarget.cpp b/src/apps/HALOEXCHANGE-OMPTarget.cpp index 22fe54522..99134b607 100644 --- a/src/apps/HALOEXCHANGE-OMPTarget.cpp +++ b/src/apps/HALOEXCHANGE-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/HALOEXCHANGE-Seq.cpp b/src/apps/HALOEXCHANGE-Seq.cpp index 7a5ae5e17..6994c8f7f 100644 --- a/src/apps/HALOEXCHANGE-Seq.cpp +++ b/src/apps/HALOEXCHANGE-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/HALOEXCHANGE.cpp b/src/apps/HALOEXCHANGE.cpp index db7c7bb90..1e90eec34 100644 --- a/src/apps/HALOEXCHANGE.cpp +++ b/src/apps/HALOEXCHANGE.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/HALOEXCHANGE.hpp b/src/apps/HALOEXCHANGE.hpp index d10bd4790..c6772b92d 100644 --- a/src/apps/HALOEXCHANGE.hpp +++ b/src/apps/HALOEXCHANGE.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp index 21a00b545..83649e6f9 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp index 60b925422..0d6e39e66 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/HALOEXCHANGE_FUSED-OMP.cpp b/src/apps/HALOEXCHANGE_FUSED-OMP.cpp index 0654cd6dc..3b34e605d 100644 --- a/src/apps/HALOEXCHANGE_FUSED-OMP.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp b/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp index 77323855c..20f527b44 100644 --- a/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/HALOEXCHANGE_FUSED-Seq.cpp b/src/apps/HALOEXCHANGE_FUSED-Seq.cpp index 80eab8629..fe3671059 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Seq.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/HALOEXCHANGE_FUSED.cpp b/src/apps/HALOEXCHANGE_FUSED.cpp index 6882cf51e..a57212399 100644 --- a/src/apps/HALOEXCHANGE_FUSED.cpp +++ b/src/apps/HALOEXCHANGE_FUSED.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/HALOEXCHANGE_FUSED.hpp b/src/apps/HALOEXCHANGE_FUSED.hpp index 68e81da59..2234a9e33 100644 --- a/src/apps/HALOEXCHANGE_FUSED.hpp +++ b/src/apps/HALOEXCHANGE_FUSED.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/apps/LTIMES-Cuda.cpp b/src/apps/LTIMES-Cuda.cpp index e59bec74e..bfb65a6b5 100644 --- a/src/apps/LTIMES-Cuda.cpp +++ b/src/apps/LTIMES-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/LTIMES-Hip.cpp b/src/apps/LTIMES-Hip.cpp index 66da9b051..083dbdd20 100644 --- a/src/apps/LTIMES-Hip.cpp +++ b/src/apps/LTIMES-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/LTIMES-OMP.cpp b/src/apps/LTIMES-OMP.cpp index 5ba4671a5..6f412a106 100644 --- a/src/apps/LTIMES-OMP.cpp +++ b/src/apps/LTIMES-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/LTIMES-OMPTarget.cpp b/src/apps/LTIMES-OMPTarget.cpp index 656900895..4de59c6fb 100644 --- a/src/apps/LTIMES-OMPTarget.cpp +++ b/src/apps/LTIMES-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/LTIMES-Seq.cpp b/src/apps/LTIMES-Seq.cpp index efa06701e..11834a73e 100644 --- a/src/apps/LTIMES-Seq.cpp +++ b/src/apps/LTIMES-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/LTIMES.cpp b/src/apps/LTIMES.cpp index c69a2b300..9fafd80ed 100644 --- a/src/apps/LTIMES.cpp +++ b/src/apps/LTIMES.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/LTIMES.hpp b/src/apps/LTIMES.hpp index 6177873be..5e386647c 100644 --- a/src/apps/LTIMES.hpp +++ b/src/apps/LTIMES.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/apps/LTIMES_NOVIEW-Cuda.cpp b/src/apps/LTIMES_NOVIEW-Cuda.cpp index eb56fdf79..5bfd52e4d 100644 --- a/src/apps/LTIMES_NOVIEW-Cuda.cpp +++ b/src/apps/LTIMES_NOVIEW-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/LTIMES_NOVIEW-Hip.cpp b/src/apps/LTIMES_NOVIEW-Hip.cpp index 31ab12979..f42590246 100644 --- a/src/apps/LTIMES_NOVIEW-Hip.cpp +++ b/src/apps/LTIMES_NOVIEW-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/LTIMES_NOVIEW-OMP.cpp b/src/apps/LTIMES_NOVIEW-OMP.cpp index 7f6fedca2..67e549c7e 100644 --- a/src/apps/LTIMES_NOVIEW-OMP.cpp +++ b/src/apps/LTIMES_NOVIEW-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/LTIMES_NOVIEW-OMPTarget.cpp b/src/apps/LTIMES_NOVIEW-OMPTarget.cpp index efe3a6ce3..eb78ebd27 100644 --- a/src/apps/LTIMES_NOVIEW-OMPTarget.cpp +++ b/src/apps/LTIMES_NOVIEW-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/LTIMES_NOVIEW-Seq.cpp b/src/apps/LTIMES_NOVIEW-Seq.cpp index 0f2458ac3..1cbd6cc87 100644 --- a/src/apps/LTIMES_NOVIEW-Seq.cpp +++ b/src/apps/LTIMES_NOVIEW-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/LTIMES_NOVIEW.cpp b/src/apps/LTIMES_NOVIEW.cpp index 5d341cea3..f99023212 100644 --- a/src/apps/LTIMES_NOVIEW.cpp +++ b/src/apps/LTIMES_NOVIEW.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/apps/LTIMES_NOVIEW.hpp b/src/apps/LTIMES_NOVIEW.hpp index 24c524ecc..09af6ec2d 100644 --- a/src/apps/LTIMES_NOVIEW.hpp +++ b/src/apps/LTIMES_NOVIEW.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/MASS3DPA-Cuda.cpp b/src/apps/MASS3DPA-Cuda.cpp index 2b4a682c4..288886be2 100644 --- a/src/apps/MASS3DPA-Cuda.cpp +++ b/src/apps/MASS3DPA-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/MASS3DPA-Hip.cpp b/src/apps/MASS3DPA-Hip.cpp index 780a93e41..3a545da0c 100644 --- a/src/apps/MASS3DPA-Hip.cpp +++ b/src/apps/MASS3DPA-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/MASS3DPA-OMP.cpp b/src/apps/MASS3DPA-OMP.cpp index 812440c26..ca6136272 100644 --- a/src/apps/MASS3DPA-OMP.cpp +++ b/src/apps/MASS3DPA-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/MASS3DPA-OMPTarget.cpp b/src/apps/MASS3DPA-OMPTarget.cpp index 3ba732420..1857f2ae3 100644 --- a/src/apps/MASS3DPA-OMPTarget.cpp +++ b/src/apps/MASS3DPA-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/MASS3DPA-Seq.cpp b/src/apps/MASS3DPA-Seq.cpp index 01948159a..47177bb5f 100644 --- a/src/apps/MASS3DPA-Seq.cpp +++ b/src/apps/MASS3DPA-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/MASS3DPA.cpp b/src/apps/MASS3DPA.cpp index 9ede00404..734b39623 100644 --- a/src/apps/MASS3DPA.cpp +++ b/src/apps/MASS3DPA.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/apps/MASS3DPA.hpp b/src/apps/MASS3DPA.hpp index 24a64acab..be0df8a3e 100644 --- a/src/apps/MASS3DPA.hpp +++ b/src/apps/MASS3DPA.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp b/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp index 3d29a5257..253a89125 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp b/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp index f9c2f5fe6..4364774c8 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp b/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp index 9676e1657..a27582f59 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp b/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp index b53dad400..37a183008 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp b/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp index da26a1d68..25effb9b4 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/apps/NODAL_ACCUMULATION_3D.cpp b/src/apps/NODAL_ACCUMULATION_3D.cpp index d0bf52eeb..73a8f4a5c 100644 --- a/src/apps/NODAL_ACCUMULATION_3D.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/NODAL_ACCUMULATION_3D.hpp b/src/apps/NODAL_ACCUMULATION_3D.hpp index 43ff25703..e2e5e3e0b 100644 --- a/src/apps/NODAL_ACCUMULATION_3D.hpp +++ b/src/apps/NODAL_ACCUMULATION_3D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/PRESSURE-Cuda.cpp b/src/apps/PRESSURE-Cuda.cpp index 325ef4730..b47734d0f 100644 --- a/src/apps/PRESSURE-Cuda.cpp +++ b/src/apps/PRESSURE-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/PRESSURE-Hip.cpp b/src/apps/PRESSURE-Hip.cpp index 2c2f584ef..473d92877 100644 --- a/src/apps/PRESSURE-Hip.cpp +++ b/src/apps/PRESSURE-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/PRESSURE-OMP.cpp b/src/apps/PRESSURE-OMP.cpp index 1393ff89b..be826f3c2 100644 --- a/src/apps/PRESSURE-OMP.cpp +++ b/src/apps/PRESSURE-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/PRESSURE-OMPTarget.cpp b/src/apps/PRESSURE-OMPTarget.cpp index 90212cb30..ff4fcdb08 100644 --- a/src/apps/PRESSURE-OMPTarget.cpp +++ b/src/apps/PRESSURE-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/PRESSURE-Seq.cpp b/src/apps/PRESSURE-Seq.cpp index 1b56941a3..0da19a85d 100644 --- a/src/apps/PRESSURE-Seq.cpp +++ b/src/apps/PRESSURE-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
// See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/PRESSURE.cpp b/src/apps/PRESSURE.cpp index b4ef1d72c..a527ff432 100644 --- a/src/apps/PRESSURE.cpp +++ b/src/apps/PRESSURE.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/PRESSURE.hpp b/src/apps/PRESSURE.hpp index 44c6602fa..4a634fc21 100644 --- a/src/apps/PRESSURE.hpp +++ b/src/apps/PRESSURE.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/VOL3D-Cuda.cpp b/src/apps/VOL3D-Cuda.cpp index dc7e47414..2d37d9a34 100644 --- a/src/apps/VOL3D-Cuda.cpp +++ b/src/apps/VOL3D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/VOL3D-Hip.cpp b/src/apps/VOL3D-Hip.cpp index 80a6a9a71..7d3923220 100644 --- a/src/apps/VOL3D-Hip.cpp +++ b/src/apps/VOL3D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/VOL3D-OMP.cpp b/src/apps/VOL3D-OMP.cpp index 276fda6ee..66a699999 100644 --- a/src/apps/VOL3D-OMP.cpp +++ b/src/apps/VOL3D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/VOL3D-OMPTarget.cpp b/src/apps/VOL3D-OMPTarget.cpp index 38771ae62..8f14fedc7 100644 --- a/src/apps/VOL3D-OMPTarget.cpp +++ b/src/apps/VOL3D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/VOL3D-Seq.cpp b/src/apps/VOL3D-Seq.cpp index f2fec0ef8..7caee0f7a 100644 --- a/src/apps/VOL3D-Seq.cpp +++ b/src/apps/VOL3D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/apps/VOL3D.cpp b/src/apps/VOL3D.cpp index a8ac3bbc6..183dbe3f9 100644 --- a/src/apps/VOL3D.cpp +++ b/src/apps/VOL3D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/VOL3D.hpp b/src/apps/VOL3D.hpp index 6faf02523..90dd5a4f1 100644 --- a/src/apps/VOL3D.hpp +++ b/src/apps/VOL3D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/WIP-COUPLE.cpp b/src/apps/WIP-COUPLE.cpp index b849f70f9..e1d201c37 100644 --- a/src/apps/WIP-COUPLE.cpp +++ b/src/apps/WIP-COUPLE.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/WIP-COUPLE.hpp b/src/apps/WIP-COUPLE.hpp index e5040ea57..ccecd4c0a 100644 --- a/src/apps/WIP-COUPLE.hpp +++ b/src/apps/WIP-COUPLE.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/CMakeLists.txt b/src/basic/CMakeLists.txt index 8aa475242..90dd1a7c9 100644 --- a/src/basic/CMakeLists.txt +++ b/src/basic/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/src/basic/DAXPY-Cuda.cpp b/src/basic/DAXPY-Cuda.cpp index 260435e1a..faaa52c4b 100644 --- a/src/basic/DAXPY-Cuda.cpp +++ b/src/basic/DAXPY-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/DAXPY-Hip.cpp b/src/basic/DAXPY-Hip.cpp index 95cd36fe8..bc2667250 100644 --- a/src/basic/DAXPY-Hip.cpp +++ b/src/basic/DAXPY-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/basic/DAXPY-OMP.cpp b/src/basic/DAXPY-OMP.cpp index f28c83c7b..872419d43 100644 --- a/src/basic/DAXPY-OMP.cpp +++ b/src/basic/DAXPY-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/DAXPY-OMPTarget.cpp b/src/basic/DAXPY-OMPTarget.cpp index 930438bbc..fcc668ea8 100644 --- a/src/basic/DAXPY-OMPTarget.cpp +++ b/src/basic/DAXPY-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/DAXPY-Seq.cpp b/src/basic/DAXPY-Seq.cpp index 2eb2fc690..27b5765c7 100644 --- a/src/basic/DAXPY-Seq.cpp +++ b/src/basic/DAXPY-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/DAXPY.cpp b/src/basic/DAXPY.cpp index 16782df2a..db504193e 100644 --- a/src/basic/DAXPY.cpp +++ b/src/basic/DAXPY.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/DAXPY.hpp b/src/basic/DAXPY.hpp index 9f0688d8a..e0e928c44 100644 --- a/src/basic/DAXPY.hpp +++ b/src/basic/DAXPY.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/DAXPY_ATOMIC-Cuda.cpp b/src/basic/DAXPY_ATOMIC-Cuda.cpp index 11f230177..9e8465659 100644 --- a/src/basic/DAXPY_ATOMIC-Cuda.cpp +++ b/src/basic/DAXPY_ATOMIC-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/DAXPY_ATOMIC-Hip.cpp b/src/basic/DAXPY_ATOMIC-Hip.cpp index 5035c51d9..ac1ab795b 100644 --- a/src/basic/DAXPY_ATOMIC-Hip.cpp +++ b/src/basic/DAXPY_ATOMIC-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/basic/DAXPY_ATOMIC-OMP.cpp b/src/basic/DAXPY_ATOMIC-OMP.cpp index d12bd99b9..d1e6c8d9d 100644 --- a/src/basic/DAXPY_ATOMIC-OMP.cpp +++ b/src/basic/DAXPY_ATOMIC-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/DAXPY_ATOMIC-OMPTarget.cpp b/src/basic/DAXPY_ATOMIC-OMPTarget.cpp index e4beb5920..81326074f 100644 --- a/src/basic/DAXPY_ATOMIC-OMPTarget.cpp +++ b/src/basic/DAXPY_ATOMIC-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/DAXPY_ATOMIC-Seq.cpp b/src/basic/DAXPY_ATOMIC-Seq.cpp index 01f56305a..8cc480307 100644 --- a/src/basic/DAXPY_ATOMIC-Seq.cpp +++ b/src/basic/DAXPY_ATOMIC-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/DAXPY_ATOMIC.cpp b/src/basic/DAXPY_ATOMIC.cpp index e5545ca8b..1d34fc230 100644 --- a/src/basic/DAXPY_ATOMIC.cpp +++ b/src/basic/DAXPY_ATOMIC.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/DAXPY_ATOMIC.hpp b/src/basic/DAXPY_ATOMIC.hpp index b557812e4..027d56e3a 100644 --- a/src/basic/DAXPY_ATOMIC.hpp +++ b/src/basic/DAXPY_ATOMIC.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/IF_QUAD-Cuda.cpp b/src/basic/IF_QUAD-Cuda.cpp index 8790bbcbb..d7ad66b22 100644 --- a/src/basic/IF_QUAD-Cuda.cpp +++ b/src/basic/IF_QUAD-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/IF_QUAD-Hip.cpp b/src/basic/IF_QUAD-Hip.cpp index 7e903e086..3037a6a4a 100644 --- a/src/basic/IF_QUAD-Hip.cpp +++ b/src/basic/IF_QUAD-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
// See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/IF_QUAD-OMP.cpp b/src/basic/IF_QUAD-OMP.cpp index 517814c6c..d65bad011 100644 --- a/src/basic/IF_QUAD-OMP.cpp +++ b/src/basic/IF_QUAD-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/IF_QUAD-OMPTarget.cpp b/src/basic/IF_QUAD-OMPTarget.cpp index e44711ecb..c33449684 100644 --- a/src/basic/IF_QUAD-OMPTarget.cpp +++ b/src/basic/IF_QUAD-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/IF_QUAD-Seq.cpp b/src/basic/IF_QUAD-Seq.cpp index a5e9a9c6c..720fa9326 100644 --- a/src/basic/IF_QUAD-Seq.cpp +++ b/src/basic/IF_QUAD-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/IF_QUAD.cpp b/src/basic/IF_QUAD.cpp index 2baff8244..79fe8bd7b 100644 --- a/src/basic/IF_QUAD.cpp +++ b/src/basic/IF_QUAD.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/IF_QUAD.hpp b/src/basic/IF_QUAD.hpp index dad204ce3..eb59f8739 100644 --- a/src/basic/IF_QUAD.hpp +++ b/src/basic/IF_QUAD.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/INIT3-Cuda.cpp b/src/basic/INIT3-Cuda.cpp index fd9ae6a35..f3dc8cdd7 100644 --- a/src/basic/INIT3-Cuda.cpp +++ b/src/basic/INIT3-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/INIT3-Hip.cpp b/src/basic/INIT3-Hip.cpp index 9c7ce1ad3..7f823d2ca 100644 --- a/src/basic/INIT3-Hip.cpp +++ b/src/basic/INIT3-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/basic/INIT3-OMP.cpp b/src/basic/INIT3-OMP.cpp index b5bd688e0..12a9a1dcf 100644 --- a/src/basic/INIT3-OMP.cpp +++ b/src/basic/INIT3-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/INIT3-OMPTarget.cpp b/src/basic/INIT3-OMPTarget.cpp index ff7d69e63..92fab64bf 100644 --- a/src/basic/INIT3-OMPTarget.cpp +++ b/src/basic/INIT3-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/INIT3-Seq.cpp b/src/basic/INIT3-Seq.cpp index 66b8f5a7c..c944d126c 100644 --- a/src/basic/INIT3-Seq.cpp +++ b/src/basic/INIT3-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/INIT3.cpp b/src/basic/INIT3.cpp index cb3c14132..3258b6db4 100644 --- a/src/basic/INIT3.cpp +++ b/src/basic/INIT3.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/INIT3.hpp b/src/basic/INIT3.hpp index 9d9de78da..62dcd1190 100644 --- a/src/basic/INIT3.hpp +++ b/src/basic/INIT3.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/INIT_VIEW1D-Cuda.cpp b/src/basic/INIT_VIEW1D-Cuda.cpp index 5612bbc79..b05e65793 100644 --- a/src/basic/INIT_VIEW1D-Cuda.cpp +++ b/src/basic/INIT_VIEW1D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/INIT_VIEW1D-Hip.cpp b/src/basic/INIT_VIEW1D-Hip.cpp index 316fe8fda..b47df0e26 100644 --- a/src/basic/INIT_VIEW1D-Hip.cpp +++ b/src/basic/INIT_VIEW1D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/basic/INIT_VIEW1D-OMP.cpp b/src/basic/INIT_VIEW1D-OMP.cpp index 42cc23c39..37baa27b2 100644 --- a/src/basic/INIT_VIEW1D-OMP.cpp +++ b/src/basic/INIT_VIEW1D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/INIT_VIEW1D-OMPTarget.cpp b/src/basic/INIT_VIEW1D-OMPTarget.cpp index 8ec0d5c13..0506075b6 100644 --- a/src/basic/INIT_VIEW1D-OMPTarget.cpp +++ b/src/basic/INIT_VIEW1D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/INIT_VIEW1D-Seq.cpp b/src/basic/INIT_VIEW1D-Seq.cpp index 419e1698b..c8803eaed 100644 --- a/src/basic/INIT_VIEW1D-Seq.cpp +++ b/src/basic/INIT_VIEW1D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/INIT_VIEW1D.cpp b/src/basic/INIT_VIEW1D.cpp index bad47eae8..e21835228 100644 --- a/src/basic/INIT_VIEW1D.cpp +++ b/src/basic/INIT_VIEW1D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/INIT_VIEW1D.hpp b/src/basic/INIT_VIEW1D.hpp index b215439dc..ccb43b392 100644 --- a/src/basic/INIT_VIEW1D.hpp +++ b/src/basic/INIT_VIEW1D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp b/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp index 8a5f62e2b..34d38521e 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp index d0353c15c..611646b07 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/INIT_VIEW1D_OFFSET-OMP.cpp b/src/basic/INIT_VIEW1D_OFFSET-OMP.cpp index 1e07407aa..9298a0513 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-OMP.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp b/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp index 9b3c3cdae..190fdb4a6 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/INIT_VIEW1D_OFFSET-Seq.cpp b/src/basic/INIT_VIEW1D_OFFSET-Seq.cpp index f53872d14..c66122d88 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Seq.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/INIT_VIEW1D_OFFSET.cpp b/src/basic/INIT_VIEW1D_OFFSET.cpp index 06519f61b..762f70a72 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/INIT_VIEW1D_OFFSET.hpp b/src/basic/INIT_VIEW1D_OFFSET.hpp index 333139909..e384e69e7 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.hpp +++ b/src/basic/INIT_VIEW1D_OFFSET.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/basic/MULADDSUB-Cuda.cpp b/src/basic/MULADDSUB-Cuda.cpp index 955ebbf67..751753307 100644 --- a/src/basic/MULADDSUB-Cuda.cpp +++ b/src/basic/MULADDSUB-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/MULADDSUB-Hip.cpp b/src/basic/MULADDSUB-Hip.cpp index 8f7794258..06eb3aa6a 100644 --- a/src/basic/MULADDSUB-Hip.cpp +++ b/src/basic/MULADDSUB-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/MULADDSUB-OMP.cpp b/src/basic/MULADDSUB-OMP.cpp index 79d441bcc..6e7406b77 100644 --- a/src/basic/MULADDSUB-OMP.cpp +++ b/src/basic/MULADDSUB-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/MULADDSUB-OMPTarget.cpp b/src/basic/MULADDSUB-OMPTarget.cpp index bca0164e3..22d03da73 100644 --- a/src/basic/MULADDSUB-OMPTarget.cpp +++ b/src/basic/MULADDSUB-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/MULADDSUB-Seq.cpp b/src/basic/MULADDSUB-Seq.cpp index e6fb9f913..6c632e8f3 100644 --- a/src/basic/MULADDSUB-Seq.cpp +++ b/src/basic/MULADDSUB-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/MULADDSUB.cpp b/src/basic/MULADDSUB.cpp index baa201dc1..ca813f096 100644 --- a/src/basic/MULADDSUB.cpp +++ b/src/basic/MULADDSUB.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/MULADDSUB.hpp b/src/basic/MULADDSUB.hpp index afb0a5f38..da62adb70 100644 --- a/src/basic/MULADDSUB.hpp +++ b/src/basic/MULADDSUB.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/basic/NESTED_INIT-Cuda.cpp b/src/basic/NESTED_INIT-Cuda.cpp index fd68d76cd..eea0d08b7 100644 --- a/src/basic/NESTED_INIT-Cuda.cpp +++ b/src/basic/NESTED_INIT-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/NESTED_INIT-Hip.cpp b/src/basic/NESTED_INIT-Hip.cpp index cc66556f7..2bf29b6d7 100644 --- a/src/basic/NESTED_INIT-Hip.cpp +++ b/src/basic/NESTED_INIT-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/NESTED_INIT-OMP.cpp b/src/basic/NESTED_INIT-OMP.cpp index a38521976..554739e9c 100644 --- a/src/basic/NESTED_INIT-OMP.cpp +++ b/src/basic/NESTED_INIT-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/NESTED_INIT-OMPTarget.cpp b/src/basic/NESTED_INIT-OMPTarget.cpp index b140ede6a..f269c12dc 100644 --- a/src/basic/NESTED_INIT-OMPTarget.cpp +++ b/src/basic/NESTED_INIT-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/NESTED_INIT-Seq.cpp b/src/basic/NESTED_INIT-Seq.cpp index 578a544e0..843970f8a 100644 --- a/src/basic/NESTED_INIT-Seq.cpp +++ b/src/basic/NESTED_INIT-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/NESTED_INIT.cpp b/src/basic/NESTED_INIT.cpp index 77d847691..20b667faf 100644 --- a/src/basic/NESTED_INIT.cpp +++ b/src/basic/NESTED_INIT.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/basic/NESTED_INIT.hpp b/src/basic/NESTED_INIT.hpp index 508ba8030..15f1a077b 100644 --- a/src/basic/NESTED_INIT.hpp +++ b/src/basic/NESTED_INIT.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/PI_ATOMIC-Cuda.cpp b/src/basic/PI_ATOMIC-Cuda.cpp index 889af65a8..cdf30a580 100644 --- a/src/basic/PI_ATOMIC-Cuda.cpp +++ b/src/basic/PI_ATOMIC-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/PI_ATOMIC-Hip.cpp b/src/basic/PI_ATOMIC-Hip.cpp index 9b5bead53..a366cfc48 100644 --- a/src/basic/PI_ATOMIC-Hip.cpp +++ b/src/basic/PI_ATOMIC-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/PI_ATOMIC-OMP.cpp b/src/basic/PI_ATOMIC-OMP.cpp index 555e22826..e779533a0 100644 --- a/src/basic/PI_ATOMIC-OMP.cpp +++ b/src/basic/PI_ATOMIC-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/PI_ATOMIC-OMPTarget.cpp b/src/basic/PI_ATOMIC-OMPTarget.cpp index 0d6443423..10e3fa20b 100644 --- a/src/basic/PI_ATOMIC-OMPTarget.cpp +++ b/src/basic/PI_ATOMIC-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/PI_ATOMIC-Seq.cpp b/src/basic/PI_ATOMIC-Seq.cpp index 0fee34737..9f00fa86e 100644 --- a/src/basic/PI_ATOMIC-Seq.cpp +++ b/src/basic/PI_ATOMIC-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/PI_ATOMIC.cpp b/src/basic/PI_ATOMIC.cpp index 94e29c8ae..fbbae3144 100644 --- a/src/basic/PI_ATOMIC.cpp +++ b/src/basic/PI_ATOMIC.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
// See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/PI_ATOMIC.hpp b/src/basic/PI_ATOMIC.hpp index 9c71d2d70..daec7c79f 100644 --- a/src/basic/PI_ATOMIC.hpp +++ b/src/basic/PI_ATOMIC.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/PI_REDUCE-Cuda.cpp b/src/basic/PI_REDUCE-Cuda.cpp index 1c8897fe8..4ef6517b8 100644 --- a/src/basic/PI_REDUCE-Cuda.cpp +++ b/src/basic/PI_REDUCE-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/PI_REDUCE-Hip.cpp b/src/basic/PI_REDUCE-Hip.cpp index a7a22482d..47c91afd3 100644 --- a/src/basic/PI_REDUCE-Hip.cpp +++ b/src/basic/PI_REDUCE-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/PI_REDUCE-OMP.cpp b/src/basic/PI_REDUCE-OMP.cpp index 56a7c59cd..7531b4b9e 100644 --- a/src/basic/PI_REDUCE-OMP.cpp +++ b/src/basic/PI_REDUCE-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/PI_REDUCE-OMPTarget.cpp b/src/basic/PI_REDUCE-OMPTarget.cpp index 4f4870b5c..32c023a03 100644 --- a/src/basic/PI_REDUCE-OMPTarget.cpp +++ b/src/basic/PI_REDUCE-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/PI_REDUCE-Seq.cpp b/src/basic/PI_REDUCE-Seq.cpp index 577ebfb6b..bad2c3a43 100644 --- a/src/basic/PI_REDUCE-Seq.cpp +++ b/src/basic/PI_REDUCE-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/PI_REDUCE.cpp b/src/basic/PI_REDUCE.cpp index b7032e61d..cab3876cd 100644 --- a/src/basic/PI_REDUCE.cpp +++ b/src/basic/PI_REDUCE.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
// See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/PI_REDUCE.hpp b/src/basic/PI_REDUCE.hpp index 59ea5321a..0dc89c521 100644 --- a/src/basic/PI_REDUCE.hpp +++ b/src/basic/PI_REDUCE.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/REDUCE3_INT-Cuda.cpp b/src/basic/REDUCE3_INT-Cuda.cpp index a4bba12c6..f328d3909 100644 --- a/src/basic/REDUCE3_INT-Cuda.cpp +++ b/src/basic/REDUCE3_INT-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/REDUCE3_INT-Hip.cpp b/src/basic/REDUCE3_INT-Hip.cpp index c8955fc18..656c8d637 100644 --- a/src/basic/REDUCE3_INT-Hip.cpp +++ b/src/basic/REDUCE3_INT-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/REDUCE3_INT-OMP.cpp b/src/basic/REDUCE3_INT-OMP.cpp index bb22a3ce8..707f0ae8b 100644 --- a/src/basic/REDUCE3_INT-OMP.cpp +++ b/src/basic/REDUCE3_INT-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/REDUCE3_INT-OMPTarget.cpp b/src/basic/REDUCE3_INT-OMPTarget.cpp index 08f184510..93e19d236 100644 --- a/src/basic/REDUCE3_INT-OMPTarget.cpp +++ b/src/basic/REDUCE3_INT-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/REDUCE3_INT-Seq.cpp b/src/basic/REDUCE3_INT-Seq.cpp index 7170cecd7..333e2236a 100644 --- a/src/basic/REDUCE3_INT-Seq.cpp +++ b/src/basic/REDUCE3_INT-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/basic/REDUCE3_INT.cpp b/src/basic/REDUCE3_INT.cpp index 821a4b7e3..4371bf352 100644 --- a/src/basic/REDUCE3_INT.cpp +++ b/src/basic/REDUCE3_INT.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/REDUCE3_INT.hpp b/src/basic/REDUCE3_INT.hpp index b3acc5004..7eec522a3 100644 --- a/src/basic/REDUCE3_INT.hpp +++ b/src/basic/REDUCE3_INT.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/TRAP_INT-Cuda.cpp b/src/basic/TRAP_INT-Cuda.cpp index 21b95fc81..5c245726f 100644 --- a/src/basic/TRAP_INT-Cuda.cpp +++ b/src/basic/TRAP_INT-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/TRAP_INT-Hip.cpp b/src/basic/TRAP_INT-Hip.cpp index b378a5504..b1e386cf6 100644 --- a/src/basic/TRAP_INT-Hip.cpp +++ b/src/basic/TRAP_INT-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/TRAP_INT-OMP.cpp b/src/basic/TRAP_INT-OMP.cpp index 19017fa76..4d84f0a33 100644 --- a/src/basic/TRAP_INT-OMP.cpp +++ b/src/basic/TRAP_INT-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/TRAP_INT-OMPTarget.cpp b/src/basic/TRAP_INT-OMPTarget.cpp index ad5717923..f0ec20119 100644 --- a/src/basic/TRAP_INT-OMPTarget.cpp +++ b/src/basic/TRAP_INT-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/TRAP_INT-Seq.cpp b/src/basic/TRAP_INT-Seq.cpp index ba411e513..55672f19b 100644 --- a/src/basic/TRAP_INT-Seq.cpp +++ b/src/basic/TRAP_INT-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/basic/TRAP_INT.cpp b/src/basic/TRAP_INT.cpp index e7483d9f4..f23a13012 100644 --- a/src/basic/TRAP_INT.cpp +++ b/src/basic/TRAP_INT.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/TRAP_INT.hpp b/src/basic/TRAP_INT.hpp index 171d72418..9f4af7a03 100644 --- a/src/basic/TRAP_INT.hpp +++ b/src/basic/TRAP_INT.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index a673d2e43..0e459fa62 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/src/common/CudaDataUtils.hpp b/src/common/CudaDataUtils.hpp index 0467d0f19..5dedd1c04 100644 --- a/src/common/CudaDataUtils.hpp +++ b/src/common/CudaDataUtils.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index 49da5c23d..a73b87340 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/common/DataUtils.hpp b/src/common/DataUtils.hpp index 7708eeb08..f42d3eb75 100644 --- a/src/common/DataUtils.hpp +++ b/src/common/DataUtils.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index 92fabcebc..665bed27a 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/common/Executor.hpp b/src/common/Executor.hpp index f2645a3c2..e236497ab 100644 --- a/src/common/Executor.hpp +++ b/src/common/Executor.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index 550563d2f..22d627059 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index ea21e5e68..2d808003e 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 6f2e40338..b007e8b34 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/common/OpenMPTargetDataUtils.hpp b/src/common/OpenMPTargetDataUtils.hpp index 88fa6d759..fc36cef1f 100644 --- a/src/common/OpenMPTargetDataUtils.hpp +++ b/src/common/OpenMPTargetDataUtils.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/common/OutputUtils.cpp b/src/common/OutputUtils.cpp index 5aea14855..6542b8952 100644 --- a/src/common/OutputUtils.cpp +++ b/src/common/OutputUtils.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/common/OutputUtils.hpp b/src/common/OutputUtils.hpp index fc034a147..6ba77a408 100644 --- a/src/common/OutputUtils.hpp +++ b/src/common/OutputUtils.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 84a85e311..826f00b22 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index e9391ad3c..4a822ba6e 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/common/RPTypes.hpp b/src/common/RPTypes.hpp index 31a49abf6..d9a2865b3 100644 --- a/src/common/RPTypes.hpp +++ b/src/common/RPTypes.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index b9fb49d26..fb1890451 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp index 0ea2296ad..75c115576 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/CMakeLists.txt b/src/lcals/CMakeLists.txt index 90cc8bc9a..5f88c8c69 100644 --- a/src/lcals/CMakeLists.txt +++ b/src/lcals/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/src/lcals/DIFF_PREDICT-Cuda.cpp b/src/lcals/DIFF_PREDICT-Cuda.cpp index 94303ca97..c61f751ab 100644 --- a/src/lcals/DIFF_PREDICT-Cuda.cpp +++ b/src/lcals/DIFF_PREDICT-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/lcals/DIFF_PREDICT-Hip.cpp b/src/lcals/DIFF_PREDICT-Hip.cpp index 850a3ab1f..66d884519 100644 --- a/src/lcals/DIFF_PREDICT-Hip.cpp +++ b/src/lcals/DIFF_PREDICT-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/DIFF_PREDICT-OMP.cpp b/src/lcals/DIFF_PREDICT-OMP.cpp index ebe91fe92..35c6c243e 100644 --- a/src/lcals/DIFF_PREDICT-OMP.cpp +++ b/src/lcals/DIFF_PREDICT-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/DIFF_PREDICT-OMPTarget.cpp b/src/lcals/DIFF_PREDICT-OMPTarget.cpp index 60e6c45e7..f7bb61fa9 100644 --- a/src/lcals/DIFF_PREDICT-OMPTarget.cpp +++ b/src/lcals/DIFF_PREDICT-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/DIFF_PREDICT-Seq.cpp b/src/lcals/DIFF_PREDICT-Seq.cpp index bf74477e7..01e619260 100644 --- a/src/lcals/DIFF_PREDICT-Seq.cpp +++ b/src/lcals/DIFF_PREDICT-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/DIFF_PREDICT.cpp b/src/lcals/DIFF_PREDICT.cpp index d1a96a101..89523eacf 100644 --- a/src/lcals/DIFF_PREDICT.cpp +++ b/src/lcals/DIFF_PREDICT.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/DIFF_PREDICT.hpp b/src/lcals/DIFF_PREDICT.hpp index 504dd8bd7..5204a9d3b 100644 --- a/src/lcals/DIFF_PREDICT.hpp +++ b/src/lcals/DIFF_PREDICT.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/EOS-Cuda.cpp b/src/lcals/EOS-Cuda.cpp index ac630c028..9305a34df 100644 --- a/src/lcals/EOS-Cuda.cpp +++ b/src/lcals/EOS-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
// See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/EOS-Hip.cpp b/src/lcals/EOS-Hip.cpp index 36e0daf88..b3008b171 100644 --- a/src/lcals/EOS-Hip.cpp +++ b/src/lcals/EOS-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/EOS-OMP.cpp b/src/lcals/EOS-OMP.cpp index c5a8c8490..52630e539 100644 --- a/src/lcals/EOS-OMP.cpp +++ b/src/lcals/EOS-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/EOS-OMPTarget.cpp b/src/lcals/EOS-OMPTarget.cpp index b0f2fe008..c8ccd8918 100644 --- a/src/lcals/EOS-OMPTarget.cpp +++ b/src/lcals/EOS-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/EOS-Seq.cpp b/src/lcals/EOS-Seq.cpp index 66f308c2b..2673d3741 100644 --- a/src/lcals/EOS-Seq.cpp +++ b/src/lcals/EOS-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/EOS.cpp b/src/lcals/EOS.cpp index 4a8671172..76f24529e 100644 --- a/src/lcals/EOS.cpp +++ b/src/lcals/EOS.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/EOS.hpp b/src/lcals/EOS.hpp index 82a779ac2..6a2cf5d98 100644 --- a/src/lcals/EOS.hpp +++ b/src/lcals/EOS.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/FIRST_DIFF-Cuda.cpp b/src/lcals/FIRST_DIFF-Cuda.cpp index d24afedc0..8f845f519 100644 --- a/src/lcals/FIRST_DIFF-Cuda.cpp +++ b/src/lcals/FIRST_DIFF-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/lcals/FIRST_DIFF-Hip.cpp b/src/lcals/FIRST_DIFF-Hip.cpp index 1cdf0cd15..008d4f54d 100644 --- a/src/lcals/FIRST_DIFF-Hip.cpp +++ b/src/lcals/FIRST_DIFF-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/FIRST_DIFF-OMP.cpp b/src/lcals/FIRST_DIFF-OMP.cpp index ae2a2e995..e77a2aa73 100644 --- a/src/lcals/FIRST_DIFF-OMP.cpp +++ b/src/lcals/FIRST_DIFF-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/FIRST_DIFF-OMPTarget.cpp b/src/lcals/FIRST_DIFF-OMPTarget.cpp index 0688731d4..1712f93c8 100644 --- a/src/lcals/FIRST_DIFF-OMPTarget.cpp +++ b/src/lcals/FIRST_DIFF-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/FIRST_DIFF-Seq.cpp b/src/lcals/FIRST_DIFF-Seq.cpp index 62b43af09..02aa579f4 100644 --- a/src/lcals/FIRST_DIFF-Seq.cpp +++ b/src/lcals/FIRST_DIFF-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/FIRST_DIFF.cpp b/src/lcals/FIRST_DIFF.cpp index c37c41aac..834ae3380 100644 --- a/src/lcals/FIRST_DIFF.cpp +++ b/src/lcals/FIRST_DIFF.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/FIRST_DIFF.hpp b/src/lcals/FIRST_DIFF.hpp index 21c279b89..5484f9a6b 100644 --- a/src/lcals/FIRST_DIFF.hpp +++ b/src/lcals/FIRST_DIFF.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/FIRST_MIN-Cuda.cpp b/src/lcals/FIRST_MIN-Cuda.cpp index 17ef21e59..a3c966b11 100644 --- a/src/lcals/FIRST_MIN-Cuda.cpp +++ b/src/lcals/FIRST_MIN-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
// See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/FIRST_MIN-Hip.cpp b/src/lcals/FIRST_MIN-Hip.cpp index 9024275d6..95236893d 100644 --- a/src/lcals/FIRST_MIN-Hip.cpp +++ b/src/lcals/FIRST_MIN-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/FIRST_MIN-OMP.cpp b/src/lcals/FIRST_MIN-OMP.cpp index 9ebc5f326..b59dc2744 100644 --- a/src/lcals/FIRST_MIN-OMP.cpp +++ b/src/lcals/FIRST_MIN-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/FIRST_MIN-OMPTarget.cpp b/src/lcals/FIRST_MIN-OMPTarget.cpp index 274e9affe..c382775e1 100644 --- a/src/lcals/FIRST_MIN-OMPTarget.cpp +++ b/src/lcals/FIRST_MIN-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/FIRST_MIN-Seq.cpp b/src/lcals/FIRST_MIN-Seq.cpp index fec75aadc..46d8902e8 100644 --- a/src/lcals/FIRST_MIN-Seq.cpp +++ b/src/lcals/FIRST_MIN-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/FIRST_MIN.cpp b/src/lcals/FIRST_MIN.cpp index c6138e46a..2275d5fa2 100644 --- a/src/lcals/FIRST_MIN.cpp +++ b/src/lcals/FIRST_MIN.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/FIRST_MIN.hpp b/src/lcals/FIRST_MIN.hpp index a9b48c1b3..b56b3e71e 100644 --- a/src/lcals/FIRST_MIN.hpp +++ b/src/lcals/FIRST_MIN.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/FIRST_SUM-Cuda.cpp b/src/lcals/FIRST_SUM-Cuda.cpp index a5198111c..aa63bf620 100644 --- a/src/lcals/FIRST_SUM-Cuda.cpp +++ b/src/lcals/FIRST_SUM-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
// See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/FIRST_SUM-Hip.cpp b/src/lcals/FIRST_SUM-Hip.cpp index d94a4b1e3..e4556a72e 100644 --- a/src/lcals/FIRST_SUM-Hip.cpp +++ b/src/lcals/FIRST_SUM-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/FIRST_SUM-OMP.cpp b/src/lcals/FIRST_SUM-OMP.cpp index b9905666a..6ec640dbb 100644 --- a/src/lcals/FIRST_SUM-OMP.cpp +++ b/src/lcals/FIRST_SUM-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/FIRST_SUM-OMPTarget.cpp b/src/lcals/FIRST_SUM-OMPTarget.cpp index 19344df4c..53b04691a 100644 --- a/src/lcals/FIRST_SUM-OMPTarget.cpp +++ b/src/lcals/FIRST_SUM-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/FIRST_SUM-Seq.cpp b/src/lcals/FIRST_SUM-Seq.cpp index cbb96a695..3b3ac7eb9 100644 --- a/src/lcals/FIRST_SUM-Seq.cpp +++ b/src/lcals/FIRST_SUM-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/FIRST_SUM.cpp b/src/lcals/FIRST_SUM.cpp index ceaa9bc8b..56d9d8a29 100644 --- a/src/lcals/FIRST_SUM.cpp +++ b/src/lcals/FIRST_SUM.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/FIRST_SUM.hpp b/src/lcals/FIRST_SUM.hpp index d828ac896..20a8db5b1 100644 --- a/src/lcals/FIRST_SUM.hpp +++ b/src/lcals/FIRST_SUM.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/lcals/GEN_LIN_RECUR-Cuda.cpp b/src/lcals/GEN_LIN_RECUR-Cuda.cpp index 40b5b3d15..dd456f15d 100644 --- a/src/lcals/GEN_LIN_RECUR-Cuda.cpp +++ b/src/lcals/GEN_LIN_RECUR-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/GEN_LIN_RECUR-Hip.cpp b/src/lcals/GEN_LIN_RECUR-Hip.cpp index bbe147f2e..163f21950 100644 --- a/src/lcals/GEN_LIN_RECUR-Hip.cpp +++ b/src/lcals/GEN_LIN_RECUR-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/GEN_LIN_RECUR-OMP.cpp b/src/lcals/GEN_LIN_RECUR-OMP.cpp index 087fc82f3..001dd9bad 100644 --- a/src/lcals/GEN_LIN_RECUR-OMP.cpp +++ b/src/lcals/GEN_LIN_RECUR-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/GEN_LIN_RECUR-OMPTarget.cpp b/src/lcals/GEN_LIN_RECUR-OMPTarget.cpp index 7cb0fd72b..ef3fcb000 100644 --- a/src/lcals/GEN_LIN_RECUR-OMPTarget.cpp +++ b/src/lcals/GEN_LIN_RECUR-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/GEN_LIN_RECUR-Seq.cpp b/src/lcals/GEN_LIN_RECUR-Seq.cpp index f7ed58734..b04491114 100644 --- a/src/lcals/GEN_LIN_RECUR-Seq.cpp +++ b/src/lcals/GEN_LIN_RECUR-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/GEN_LIN_RECUR.cpp b/src/lcals/GEN_LIN_RECUR.cpp index 6534633da..fadc3d838 100644 --- a/src/lcals/GEN_LIN_RECUR.cpp +++ b/src/lcals/GEN_LIN_RECUR.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/lcals/GEN_LIN_RECUR.hpp b/src/lcals/GEN_LIN_RECUR.hpp index 3fa49e69f..7341355e9 100644 --- a/src/lcals/GEN_LIN_RECUR.hpp +++ b/src/lcals/GEN_LIN_RECUR.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/HYDRO_1D-Cuda.cpp b/src/lcals/HYDRO_1D-Cuda.cpp index 4d71069e1..5a8fdbe30 100644 --- a/src/lcals/HYDRO_1D-Cuda.cpp +++ b/src/lcals/HYDRO_1D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/HYDRO_1D-Hip.cpp b/src/lcals/HYDRO_1D-Hip.cpp index 93f3f5042..305308392 100644 --- a/src/lcals/HYDRO_1D-Hip.cpp +++ b/src/lcals/HYDRO_1D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/HYDRO_1D-OMP.cpp b/src/lcals/HYDRO_1D-OMP.cpp index 94390485f..27815f672 100644 --- a/src/lcals/HYDRO_1D-OMP.cpp +++ b/src/lcals/HYDRO_1D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/HYDRO_1D-OMPTarget.cpp b/src/lcals/HYDRO_1D-OMPTarget.cpp index c313951b5..f92c0fb5f 100644 --- a/src/lcals/HYDRO_1D-OMPTarget.cpp +++ b/src/lcals/HYDRO_1D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/HYDRO_1D-Seq.cpp b/src/lcals/HYDRO_1D-Seq.cpp index a257b08bb..b74276d8a 100644 --- a/src/lcals/HYDRO_1D-Seq.cpp +++ b/src/lcals/HYDRO_1D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/HYDRO_1D.cpp b/src/lcals/HYDRO_1D.cpp index 08198ca0f..75cfe7807 100644 --- a/src/lcals/HYDRO_1D.cpp +++ b/src/lcals/HYDRO_1D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/lcals/HYDRO_1D.hpp b/src/lcals/HYDRO_1D.hpp index 029065be8..e292433fb 100644 --- a/src/lcals/HYDRO_1D.hpp +++ b/src/lcals/HYDRO_1D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/HYDRO_2D-Cuda.cpp b/src/lcals/HYDRO_2D-Cuda.cpp index d9afba50d..7e0028826 100644 --- a/src/lcals/HYDRO_2D-Cuda.cpp +++ b/src/lcals/HYDRO_2D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/HYDRO_2D-Hip.cpp b/src/lcals/HYDRO_2D-Hip.cpp index 785a94c09..dc9ee8e52 100644 --- a/src/lcals/HYDRO_2D-Hip.cpp +++ b/src/lcals/HYDRO_2D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/HYDRO_2D-OMP.cpp b/src/lcals/HYDRO_2D-OMP.cpp index d22502523..703e551ce 100644 --- a/src/lcals/HYDRO_2D-OMP.cpp +++ b/src/lcals/HYDRO_2D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/HYDRO_2D-OMPTarget.cpp b/src/lcals/HYDRO_2D-OMPTarget.cpp index ccac11396..09b81241a 100644 --- a/src/lcals/HYDRO_2D-OMPTarget.cpp +++ b/src/lcals/HYDRO_2D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/HYDRO_2D-Seq.cpp b/src/lcals/HYDRO_2D-Seq.cpp index 023819bec..d731b6ea0 100644 --- a/src/lcals/HYDRO_2D-Seq.cpp +++ b/src/lcals/HYDRO_2D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/HYDRO_2D.cpp b/src/lcals/HYDRO_2D.cpp index e51237f82..b01c483c7 100644 --- a/src/lcals/HYDRO_2D.cpp +++ b/src/lcals/HYDRO_2D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/lcals/HYDRO_2D.hpp b/src/lcals/HYDRO_2D.hpp index 2525c8c89..c4fde9b22 100644 --- a/src/lcals/HYDRO_2D.hpp +++ b/src/lcals/HYDRO_2D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/INT_PREDICT-Cuda.cpp b/src/lcals/INT_PREDICT-Cuda.cpp index 5772a8e9c..d380a2583 100644 --- a/src/lcals/INT_PREDICT-Cuda.cpp +++ b/src/lcals/INT_PREDICT-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/INT_PREDICT-Hip.cpp b/src/lcals/INT_PREDICT-Hip.cpp index 7a55f20ca..87074d6e5 100644 --- a/src/lcals/INT_PREDICT-Hip.cpp +++ b/src/lcals/INT_PREDICT-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/INT_PREDICT-OMP.cpp b/src/lcals/INT_PREDICT-OMP.cpp index 486d9178e..580a0730f 100644 --- a/src/lcals/INT_PREDICT-OMP.cpp +++ b/src/lcals/INT_PREDICT-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/INT_PREDICT-OMPTarget.cpp b/src/lcals/INT_PREDICT-OMPTarget.cpp index 4fcc54307..61d86a183 100644 --- a/src/lcals/INT_PREDICT-OMPTarget.cpp +++ b/src/lcals/INT_PREDICT-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/INT_PREDICT-Seq.cpp b/src/lcals/INT_PREDICT-Seq.cpp index 61032c6db..760af1689 100644 --- a/src/lcals/INT_PREDICT-Seq.cpp +++ b/src/lcals/INT_PREDICT-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/INT_PREDICT.cpp b/src/lcals/INT_PREDICT.cpp index 096a074ac..db4bc88d6 100644 --- a/src/lcals/INT_PREDICT.cpp +++ b/src/lcals/INT_PREDICT.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
// See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/INT_PREDICT.hpp b/src/lcals/INT_PREDICT.hpp index 1253e1a6e..af681d341 100644 --- a/src/lcals/INT_PREDICT.hpp +++ b/src/lcals/INT_PREDICT.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/PLANCKIAN-Cuda.cpp b/src/lcals/PLANCKIAN-Cuda.cpp index fe210a699..f41ecfdb9 100644 --- a/src/lcals/PLANCKIAN-Cuda.cpp +++ b/src/lcals/PLANCKIAN-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/PLANCKIAN-Hip.cpp b/src/lcals/PLANCKIAN-Hip.cpp index c2e566ad8..bea37012b 100644 --- a/src/lcals/PLANCKIAN-Hip.cpp +++ b/src/lcals/PLANCKIAN-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/PLANCKIAN-OMP.cpp b/src/lcals/PLANCKIAN-OMP.cpp index 8a890654e..2244a9515 100644 --- a/src/lcals/PLANCKIAN-OMP.cpp +++ b/src/lcals/PLANCKIAN-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/PLANCKIAN-OMPTarget.cpp b/src/lcals/PLANCKIAN-OMPTarget.cpp index f6471f7b6..d239ceaca 100644 --- a/src/lcals/PLANCKIAN-OMPTarget.cpp +++ b/src/lcals/PLANCKIAN-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/PLANCKIAN-Seq.cpp b/src/lcals/PLANCKIAN-Seq.cpp index 88bcc04e0..b514fc534 100644 --- a/src/lcals/PLANCKIAN-Seq.cpp +++ b/src/lcals/PLANCKIAN-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/lcals/PLANCKIAN.cpp b/src/lcals/PLANCKIAN.cpp index 564a71a7e..0ba212a6b 100644 --- a/src/lcals/PLANCKIAN.cpp +++ b/src/lcals/PLANCKIAN.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/PLANCKIAN.hpp b/src/lcals/PLANCKIAN.hpp index 1e5b744db..1fc3bdd45 100644 --- a/src/lcals/PLANCKIAN.hpp +++ b/src/lcals/PLANCKIAN.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/TRIDIAG_ELIM-Cuda.cpp b/src/lcals/TRIDIAG_ELIM-Cuda.cpp index 276ec54a0..9f9aa41a7 100644 --- a/src/lcals/TRIDIAG_ELIM-Cuda.cpp +++ b/src/lcals/TRIDIAG_ELIM-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/TRIDIAG_ELIM-Hip.cpp b/src/lcals/TRIDIAG_ELIM-Hip.cpp index 0a16664d0..113329b86 100644 --- a/src/lcals/TRIDIAG_ELIM-Hip.cpp +++ b/src/lcals/TRIDIAG_ELIM-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/TRIDIAG_ELIM-OMP.cpp b/src/lcals/TRIDIAG_ELIM-OMP.cpp index 0bd108fb1..259551418 100644 --- a/src/lcals/TRIDIAG_ELIM-OMP.cpp +++ b/src/lcals/TRIDIAG_ELIM-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp b/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp index 39cb585d5..b35e2c63f 100644 --- a/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp +++ b/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/lcals/TRIDIAG_ELIM-Seq.cpp b/src/lcals/TRIDIAG_ELIM-Seq.cpp index b3bf160ab..45bf3c136 100644 --- a/src/lcals/TRIDIAG_ELIM-Seq.cpp +++ b/src/lcals/TRIDIAG_ELIM-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/TRIDIAG_ELIM.cpp b/src/lcals/TRIDIAG_ELIM.cpp index d35c08a51..dd7f5e430 100644 --- a/src/lcals/TRIDIAG_ELIM.cpp +++ b/src/lcals/TRIDIAG_ELIM.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/TRIDIAG_ELIM.hpp b/src/lcals/TRIDIAG_ELIM.hpp index 73ffeb341..ecbdf6ec4 100644 --- a/src/lcals/TRIDIAG_ELIM.hpp +++ b/src/lcals/TRIDIAG_ELIM.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/CMakeLists.txt b/src/polybench/CMakeLists.txt index fec49e204..5805926f3 100644 --- a/src/polybench/CMakeLists.txt +++ b/src/polybench/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/src/polybench/POLYBENCH_2MM-Cuda.cpp b/src/polybench/POLYBENCH_2MM-Cuda.cpp index 8e546379e..95fa66bd3 100644 --- a/src/polybench/POLYBENCH_2MM-Cuda.cpp +++ b/src/polybench/POLYBENCH_2MM-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_2MM-Hip.cpp b/src/polybench/POLYBENCH_2MM-Hip.cpp index 455ab55b2..23b652fce 100644 --- a/src/polybench/POLYBENCH_2MM-Hip.cpp +++ b/src/polybench/POLYBENCH_2MM-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/polybench/POLYBENCH_2MM-OMP.cpp b/src/polybench/POLYBENCH_2MM-OMP.cpp index a7778840a..4eb6e1ae4 100644 --- a/src/polybench/POLYBENCH_2MM-OMP.cpp +++ b/src/polybench/POLYBENCH_2MM-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_2MM-OMPTarget.cpp b/src/polybench/POLYBENCH_2MM-OMPTarget.cpp index ce689e767..7ee738c23 100644 --- a/src/polybench/POLYBENCH_2MM-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_2MM-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_2MM-Seq.cpp b/src/polybench/POLYBENCH_2MM-Seq.cpp index 4eae8f13c..edd3b0dbf 100644 --- a/src/polybench/POLYBENCH_2MM-Seq.cpp +++ b/src/polybench/POLYBENCH_2MM-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_2MM.cpp b/src/polybench/POLYBENCH_2MM.cpp index 7e2083c50..dd9eab576 100644 --- a/src/polybench/POLYBENCH_2MM.cpp +++ b/src/polybench/POLYBENCH_2MM.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_2MM.hpp b/src/polybench/POLYBENCH_2MM.hpp index 897eb13a3..8f10b8ad2 100644 --- a/src/polybench/POLYBENCH_2MM.hpp +++ b/src/polybench/POLYBENCH_2MM.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_3MM-Cuda.cpp b/src/polybench/POLYBENCH_3MM-Cuda.cpp index e861ed010..23d4629e1 100644 --- a/src/polybench/POLYBENCH_3MM-Cuda.cpp +++ b/src/polybench/POLYBENCH_3MM-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/polybench/POLYBENCH_3MM-Hip.cpp b/src/polybench/POLYBENCH_3MM-Hip.cpp index 717a025da..faf02f3a0 100644 --- a/src/polybench/POLYBENCH_3MM-Hip.cpp +++ b/src/polybench/POLYBENCH_3MM-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_3MM-OMP.cpp b/src/polybench/POLYBENCH_3MM-OMP.cpp index 7fe78a498..161c15412 100644 --- a/src/polybench/POLYBENCH_3MM-OMP.cpp +++ b/src/polybench/POLYBENCH_3MM-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_3MM-OMPTarget.cpp b/src/polybench/POLYBENCH_3MM-OMPTarget.cpp index f7380fabd..617073c90 100644 --- a/src/polybench/POLYBENCH_3MM-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_3MM-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_3MM-Seq.cpp b/src/polybench/POLYBENCH_3MM-Seq.cpp index 0659026cc..928213534 100644 --- a/src/polybench/POLYBENCH_3MM-Seq.cpp +++ b/src/polybench/POLYBENCH_3MM-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_3MM.cpp b/src/polybench/POLYBENCH_3MM.cpp index 2c06a72ac..95b10612c 100644 --- a/src/polybench/POLYBENCH_3MM.cpp +++ b/src/polybench/POLYBENCH_3MM.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_3MM.hpp b/src/polybench/POLYBENCH_3MM.hpp index 80d0a2fe5..ed664da76 100644 --- a/src/polybench/POLYBENCH_3MM.hpp +++ b/src/polybench/POLYBENCH_3MM.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/polybench/POLYBENCH_ADI-Cuda.cpp b/src/polybench/POLYBENCH_ADI-Cuda.cpp index 2843c77f3..6e489008c 100644 --- a/src/polybench/POLYBENCH_ADI-Cuda.cpp +++ b/src/polybench/POLYBENCH_ADI-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_ADI-Hip.cpp b/src/polybench/POLYBENCH_ADI-Hip.cpp index 6c9874006..e9632a178 100644 --- a/src/polybench/POLYBENCH_ADI-Hip.cpp +++ b/src/polybench/POLYBENCH_ADI-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_ADI-OMP.cpp b/src/polybench/POLYBENCH_ADI-OMP.cpp index 42edab15b..b2b740e72 100644 --- a/src/polybench/POLYBENCH_ADI-OMP.cpp +++ b/src/polybench/POLYBENCH_ADI-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_ADI-OMPTarget.cpp b/src/polybench/POLYBENCH_ADI-OMPTarget.cpp index 0d04bb597..ff2894519 100644 --- a/src/polybench/POLYBENCH_ADI-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_ADI-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_ADI-Seq.cpp b/src/polybench/POLYBENCH_ADI-Seq.cpp index b4f8e82c3..7a50e6d51 100644 --- a/src/polybench/POLYBENCH_ADI-Seq.cpp +++ b/src/polybench/POLYBENCH_ADI-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_ADI.cpp b/src/polybench/POLYBENCH_ADI.cpp index c36b41050..f8bfd83c7 100644 --- a/src/polybench/POLYBENCH_ADI.cpp +++ b/src/polybench/POLYBENCH_ADI.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/polybench/POLYBENCH_ADI.hpp b/src/polybench/POLYBENCH_ADI.hpp index bec422925..655d42208 100644 --- a/src/polybench/POLYBENCH_ADI.hpp +++ b/src/polybench/POLYBENCH_ADI.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_ATAX-Cuda.cpp b/src/polybench/POLYBENCH_ATAX-Cuda.cpp index 69fcf77a3..32a4c0028 100644 --- a/src/polybench/POLYBENCH_ATAX-Cuda.cpp +++ b/src/polybench/POLYBENCH_ATAX-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_ATAX-Hip.cpp b/src/polybench/POLYBENCH_ATAX-Hip.cpp index f9d532e48..c433f5738 100644 --- a/src/polybench/POLYBENCH_ATAX-Hip.cpp +++ b/src/polybench/POLYBENCH_ATAX-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_ATAX-OMP.cpp b/src/polybench/POLYBENCH_ATAX-OMP.cpp index cb78dd1cc..a04c7a2c7 100644 --- a/src/polybench/POLYBENCH_ATAX-OMP.cpp +++ b/src/polybench/POLYBENCH_ATAX-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp b/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp index 7f9b96a75..ab7d5b8e5 100644 --- a/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_ATAX-Seq.cpp b/src/polybench/POLYBENCH_ATAX-Seq.cpp index 5f6d018b6..0250dd90e 100644 --- a/src/polybench/POLYBENCH_ATAX-Seq.cpp +++ b/src/polybench/POLYBENCH_ATAX-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/polybench/POLYBENCH_ATAX.cpp b/src/polybench/POLYBENCH_ATAX.cpp index e06917239..5b0db9530 100644 --- a/src/polybench/POLYBENCH_ATAX.cpp +++ b/src/polybench/POLYBENCH_ATAX.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_ATAX.hpp b/src/polybench/POLYBENCH_ATAX.hpp index d2c5ec63e..85a17b347 100644 --- a/src/polybench/POLYBENCH_ATAX.hpp +++ b/src/polybench/POLYBENCH_ATAX.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp b/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp index f26754de3..83e03c6b1 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp b/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp index 74488952e..03cb7eff6 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_FDTD_2D-OMP.cpp b/src/polybench/POLYBENCH_FDTD_2D-OMP.cpp index ebd027d6f..bae0887a5 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-OMP.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp b/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp index d260754bf..3e588aac9 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/polybench/POLYBENCH_FDTD_2D-Seq.cpp b/src/polybench/POLYBENCH_FDTD_2D-Seq.cpp index 58461dd8a..93d2ee3aa 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Seq.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_FDTD_2D.cpp b/src/polybench/POLYBENCH_FDTD_2D.cpp index 59e03721c..f218dc9fa 100644 --- a/src/polybench/POLYBENCH_FDTD_2D.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_FDTD_2D.hpp b/src/polybench/POLYBENCH_FDTD_2D.hpp index a1ead28b2..240224dcf 100644 --- a/src/polybench/POLYBENCH_FDTD_2D.hpp +++ b/src/polybench/POLYBENCH_FDTD_2D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp index 6a49073b8..cdcf2b391 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp index 853ab1023..a786552fc 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp index f3cfc0466..dcd63acf5 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp index c2e864b93..4818a8640 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp index cfe5ef88e..ef85aec52 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp index b3306a992..bf438030e 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp index ec2bcab9f..af3e72c74 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_GEMM-Cuda.cpp b/src/polybench/POLYBENCH_GEMM-Cuda.cpp index 994395c39..0db3e1c45 100644 --- a/src/polybench/POLYBENCH_GEMM-Cuda.cpp +++ b/src/polybench/POLYBENCH_GEMM-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_GEMM-Hip.cpp b/src/polybench/POLYBENCH_GEMM-Hip.cpp index a4044ccb2..a150c9f4d 100644 --- a/src/polybench/POLYBENCH_GEMM-Hip.cpp +++ b/src/polybench/POLYBENCH_GEMM-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/polybench/POLYBENCH_GEMM-OMP.cpp b/src/polybench/POLYBENCH_GEMM-OMP.cpp index 9195af832..7a1268498 100644 --- a/src/polybench/POLYBENCH_GEMM-OMP.cpp +++ b/src/polybench/POLYBENCH_GEMM-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp b/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp index 0570e3ad3..0079c7be1 100644 --- a/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_GEMM-Seq.cpp b/src/polybench/POLYBENCH_GEMM-Seq.cpp index e28973b2b..4e67adcf7 100644 --- a/src/polybench/POLYBENCH_GEMM-Seq.cpp +++ b/src/polybench/POLYBENCH_GEMM-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_GEMM.cpp b/src/polybench/POLYBENCH_GEMM.cpp index a50ac09da..845454f09 100644 --- a/src/polybench/POLYBENCH_GEMM.cpp +++ b/src/polybench/POLYBENCH_GEMM.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_GEMM.hpp b/src/polybench/POLYBENCH_GEMM.hpp index dd9e4a5a7..ed3a96e8e 100644 --- a/src/polybench/POLYBENCH_GEMM.hpp +++ b/src/polybench/POLYBENCH_GEMM.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_GEMVER-Cuda.cpp b/src/polybench/POLYBENCH_GEMVER-Cuda.cpp index 5e2bae18e..076f4b4e9 100644 --- a/src/polybench/POLYBENCH_GEMVER-Cuda.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/polybench/POLYBENCH_GEMVER-Hip.cpp b/src/polybench/POLYBENCH_GEMVER-Hip.cpp index 4ff4dafc9..50624e4ca 100644 --- a/src/polybench/POLYBENCH_GEMVER-Hip.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_GEMVER-OMP.cpp b/src/polybench/POLYBENCH_GEMVER-OMP.cpp index 3b5d911b9..3ceabd107 100644 --- a/src/polybench/POLYBENCH_GEMVER-OMP.cpp +++ b/src/polybench/POLYBENCH_GEMVER-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp b/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp index 5b256729e..b3aa2ca62 100644 --- a/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_GEMVER-Seq.cpp b/src/polybench/POLYBENCH_GEMVER-Seq.cpp index 096dd9d56..2ae4d55ff 100644 --- a/src/polybench/POLYBENCH_GEMVER-Seq.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_GEMVER.cpp b/src/polybench/POLYBENCH_GEMVER.cpp index fce83907a..8a02a341c 100644 --- a/src/polybench/POLYBENCH_GEMVER.cpp +++ b/src/polybench/POLYBENCH_GEMVER.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_GEMVER.hpp b/src/polybench/POLYBENCH_GEMVER.hpp index 919f18e5c..8970420cb 100644 --- a/src/polybench/POLYBENCH_GEMVER.hpp +++ b/src/polybench/POLYBENCH_GEMVER.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp b/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp index b9f29ce28..7f96e2d1f 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_GESUMMV-Hip.cpp b/src/polybench/POLYBENCH_GESUMMV-Hip.cpp index e4e30ce76..67fb13d0f 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Hip.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_GESUMMV-OMP.cpp b/src/polybench/POLYBENCH_GESUMMV-OMP.cpp index 4fc4896ed..5797b525a 100644 --- a/src/polybench/POLYBENCH_GESUMMV-OMP.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp b/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp index 299d4b347..b4c1848ec 100644 --- a/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_GESUMMV-Seq.cpp b/src/polybench/POLYBENCH_GESUMMV-Seq.cpp index 4a488029b..ae9b29a54 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Seq.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_GESUMMV.cpp b/src/polybench/POLYBENCH_GESUMMV.cpp index 39cb94510..d15388134 100644 --- a/src/polybench/POLYBENCH_GESUMMV.cpp +++ b/src/polybench/POLYBENCH_GESUMMV.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/polybench/POLYBENCH_GESUMMV.hpp b/src/polybench/POLYBENCH_GESUMMV.hpp index c8cc9e191..bf2374d96 100644 --- a/src/polybench/POLYBENCH_GESUMMV.hpp +++ b/src/polybench/POLYBENCH_GESUMMV.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp b/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp index 17eb3bb88..72d72db42 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp b/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp index 048d2ab43..12c7e0057 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp b/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp index 7bf354c65..75e1305df 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp b/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp index bc6fe97aa..d387bf483 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp b/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp index 2a93d39b8..449b23ff1 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/polybench/POLYBENCH_HEAT_3D.cpp b/src/polybench/POLYBENCH_HEAT_3D.cpp index 85fd0ce38..fe3f1d199 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_HEAT_3D.hpp b/src/polybench/POLYBENCH_HEAT_3D.hpp index b21b56576..a2ac7c807 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.hpp +++ b/src/polybench/POLYBENCH_HEAT_3D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp index c961478c5..75911ca69 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp index eb7d4e1bb..2d3484b69 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp b/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp index 683cea5ba..f0c035e96 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp b/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp index d281ff310..241a60e6b 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp index a022c2981..91342e448 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_JACOBI_1D.cpp b/src/polybench/POLYBENCH_JACOBI_1D.cpp index 48c064780..abc0f56fb 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_JACOBI_1D.hpp b/src/polybench/POLYBENCH_JACOBI_1D.hpp index 290e26ce0..9e772501d 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.hpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp index 83b476abf..d71b1b615 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp index e58d7ea27..bf89cb2f6 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp b/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp index 69e6e1d13..7ad1999bf 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp b/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp index 36afb8957..b372f98d7 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp index 87e8e8e15..5113ca8e2 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_JACOBI_2D.cpp b/src/polybench/POLYBENCH_JACOBI_2D.cpp index 9e204bdab..67354c8fa 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_JACOBI_2D.hpp b/src/polybench/POLYBENCH_JACOBI_2D.hpp index 9a57325a1..30bcf33e2 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.hpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_MVT-Cuda.cpp b/src/polybench/POLYBENCH_MVT-Cuda.cpp index 67c678be7..40256fab5 100644 --- a/src/polybench/POLYBENCH_MVT-Cuda.cpp +++ b/src/polybench/POLYBENCH_MVT-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_MVT-Hip.cpp b/src/polybench/POLYBENCH_MVT-Hip.cpp index 488bc8885..4d61cecb4 100644 --- a/src/polybench/POLYBENCH_MVT-Hip.cpp +++ b/src/polybench/POLYBENCH_MVT-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/polybench/POLYBENCH_MVT-OMP.cpp b/src/polybench/POLYBENCH_MVT-OMP.cpp index 0023d0684..f05766218 100644 --- a/src/polybench/POLYBENCH_MVT-OMP.cpp +++ b/src/polybench/POLYBENCH_MVT-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_MVT-OMPTarget.cpp b/src/polybench/POLYBENCH_MVT-OMPTarget.cpp index 8eb198ea5..63135dcff 100644 --- a/src/polybench/POLYBENCH_MVT-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_MVT-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_MVT-Seq.cpp b/src/polybench/POLYBENCH_MVT-Seq.cpp index 80847c383..5b54aa82c 100644 --- a/src/polybench/POLYBENCH_MVT-Seq.cpp +++ b/src/polybench/POLYBENCH_MVT-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_MVT.cpp b/src/polybench/POLYBENCH_MVT.cpp index ae2749ce5..daab8e4df 100644 --- a/src/polybench/POLYBENCH_MVT.cpp +++ b/src/polybench/POLYBENCH_MVT.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_MVT.hpp b/src/polybench/POLYBENCH_MVT.hpp index cb72784ed..3701a8595 100644 --- a/src/polybench/POLYBENCH_MVT.hpp +++ b/src/polybench/POLYBENCH_MVT.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/rajaperf_config.hpp.in b/src/rajaperf_config.hpp.in index 808993af4..8f8aaeaa8 100644 --- a/src/rajaperf_config.hpp.in +++ b/src/rajaperf_config.hpp.in @@ -9,7 +9,7 @@ */ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/stream/ADD-Cuda.cpp b/src/stream/ADD-Cuda.cpp index 8dab511cd..caa2cb737 100644 --- a/src/stream/ADD-Cuda.cpp +++ b/src/stream/ADD-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/ADD-Hip.cpp b/src/stream/ADD-Hip.cpp index 24f8dadf8..f15e06474 100644 --- a/src/stream/ADD-Hip.cpp +++ b/src/stream/ADD-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/ADD-OMP.cpp b/src/stream/ADD-OMP.cpp index 137ce77a6..322b2429e 100644 --- a/src/stream/ADD-OMP.cpp +++ b/src/stream/ADD-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/ADD-OMPTarget.cpp b/src/stream/ADD-OMPTarget.cpp index 9c367b1b0..ff00935dc 100644 --- a/src/stream/ADD-OMPTarget.cpp +++ b/src/stream/ADD-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/ADD-Seq.cpp b/src/stream/ADD-Seq.cpp index 89f989d95..657923d21 100644 --- a/src/stream/ADD-Seq.cpp +++ b/src/stream/ADD-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/ADD.cpp b/src/stream/ADD.cpp index 200172e60..b44ef64fd 100644 --- a/src/stream/ADD.cpp +++ b/src/stream/ADD.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/ADD.hpp b/src/stream/ADD.hpp index 0bf45b810..07b4aff67 100644 --- a/src/stream/ADD.hpp +++ b/src/stream/ADD.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/stream/CMakeLists.txt b/src/stream/CMakeLists.txt index e70ed685d..2122b7867 100644 --- a/src/stream/CMakeLists.txt +++ b/src/stream/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/src/stream/COPY-Cuda.cpp b/src/stream/COPY-Cuda.cpp index cb3da418b..19e529416 100644 --- a/src/stream/COPY-Cuda.cpp +++ b/src/stream/COPY-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/COPY-Hip.cpp b/src/stream/COPY-Hip.cpp index 5541a2339..4234470c9 100644 --- a/src/stream/COPY-Hip.cpp +++ b/src/stream/COPY-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/COPY-OMP.cpp b/src/stream/COPY-OMP.cpp index fe35d5288..794097691 100644 --- a/src/stream/COPY-OMP.cpp +++ b/src/stream/COPY-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/COPY-OMPTarget.cpp b/src/stream/COPY-OMPTarget.cpp index 010456eb0..1c718c8c4 100644 --- a/src/stream/COPY-OMPTarget.cpp +++ b/src/stream/COPY-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/COPY-Seq.cpp b/src/stream/COPY-Seq.cpp index 89f9cae33..13d100a80 100644 --- a/src/stream/COPY-Seq.cpp +++ b/src/stream/COPY-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/COPY.cpp b/src/stream/COPY.cpp index d8c7ec1d6..d90d9cf77 100644 --- a/src/stream/COPY.cpp +++ b/src/stream/COPY.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/stream/COPY.hpp b/src/stream/COPY.hpp index 010a391c8..4c7cfcf42 100644 --- a/src/stream/COPY.hpp +++ b/src/stream/COPY.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/DOT-Cuda.cpp b/src/stream/DOT-Cuda.cpp index f4bbb92cf..395843680 100644 --- a/src/stream/DOT-Cuda.cpp +++ b/src/stream/DOT-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/DOT-Hip.cpp b/src/stream/DOT-Hip.cpp index 45257f97a..3e1f0a308 100644 --- a/src/stream/DOT-Hip.cpp +++ b/src/stream/DOT-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/DOT-OMP.cpp b/src/stream/DOT-OMP.cpp index 24a29d9a0..9b88711ea 100644 --- a/src/stream/DOT-OMP.cpp +++ b/src/stream/DOT-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/DOT-OMPTarget.cpp b/src/stream/DOT-OMPTarget.cpp index 1b4cb85cf..b5ef76685 100644 --- a/src/stream/DOT-OMPTarget.cpp +++ b/src/stream/DOT-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/DOT-Seq.cpp b/src/stream/DOT-Seq.cpp index 81cff4c1b..2a1346d1f 100644 --- a/src/stream/DOT-Seq.cpp +++ b/src/stream/DOT-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/DOT.cpp b/src/stream/DOT.cpp index cca4aae4a..c968cf9cc 100644 --- a/src/stream/DOT.cpp +++ b/src/stream/DOT.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/stream/DOT.hpp b/src/stream/DOT.hpp index adb9309c4..3ef9cf9f8 100644 --- a/src/stream/DOT.hpp +++ b/src/stream/DOT.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/MUL-Cuda.cpp b/src/stream/MUL-Cuda.cpp index 2c38a5c7f..5bb8bf0bf 100644 --- a/src/stream/MUL-Cuda.cpp +++ b/src/stream/MUL-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/MUL-Hip.cpp b/src/stream/MUL-Hip.cpp index c2b65d8a0..a8fb0367c 100644 --- a/src/stream/MUL-Hip.cpp +++ b/src/stream/MUL-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/MUL-OMP.cpp b/src/stream/MUL-OMP.cpp index 0b7f3cd85..36a0343ea 100644 --- a/src/stream/MUL-OMP.cpp +++ b/src/stream/MUL-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/MUL-OMPTarget.cpp b/src/stream/MUL-OMPTarget.cpp index 8e5f52b35..dc35c5b69 100644 --- a/src/stream/MUL-OMPTarget.cpp +++ b/src/stream/MUL-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/MUL-Seq.cpp b/src/stream/MUL-Seq.cpp index 69b548e69..672f2f951 100644 --- a/src/stream/MUL-Seq.cpp +++ b/src/stream/MUL-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/MUL.cpp b/src/stream/MUL.cpp index 6b167de04..1fc9e9912 100644 --- a/src/stream/MUL.cpp +++ b/src/stream/MUL.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/stream/MUL.hpp b/src/stream/MUL.hpp index f8fcefbcb..0fe63f931 100644 --- a/src/stream/MUL.hpp +++ b/src/stream/MUL.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/TRIAD-Cuda.cpp b/src/stream/TRIAD-Cuda.cpp index d0908d6ff..d1c2c58ea 100644 --- a/src/stream/TRIAD-Cuda.cpp +++ b/src/stream/TRIAD-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/TRIAD-Hip.cpp b/src/stream/TRIAD-Hip.cpp index 21ed4478c..49c473134 100644 --- a/src/stream/TRIAD-Hip.cpp +++ b/src/stream/TRIAD-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/TRIAD-OMP.cpp b/src/stream/TRIAD-OMP.cpp index 9ce330c00..357300457 100644 --- a/src/stream/TRIAD-OMP.cpp +++ b/src/stream/TRIAD-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/TRIAD-OMPTarget.cpp b/src/stream/TRIAD-OMPTarget.cpp index 404444366..abb6389ac 100644 --- a/src/stream/TRIAD-OMPTarget.cpp +++ b/src/stream/TRIAD-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/TRIAD-Seq.cpp b/src/stream/TRIAD-Seq.cpp index 7d7800556..63310acd6 100644 --- a/src/stream/TRIAD-Seq.cpp +++ b/src/stream/TRIAD-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/TRIAD.cpp b/src/stream/TRIAD.cpp index dfa04eda0..1533135f1 100644 --- a/src/stream/TRIAD.cpp +++ b/src/stream/TRIAD.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/stream/TRIAD.hpp b/src/stream/TRIAD.hpp index 8d2f01236..897659e39 100644 --- a/src/stream/TRIAD.hpp +++ b/src/stream/TRIAD.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // From a0c9bc6f2d9e1b8d1dd6050c87efab4e03ea6aba Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 15 Mar 2022 12:50:43 -0700 Subject: [PATCH 263/392] Fixup run kernel prints --- src/common/Executor.cpp | 23 +++++++++++------------ src/common/Executor.hpp | 2 +- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index d2ac9117c..065cf0394 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -800,8 +800,7 @@ void Executor::runSuite() for (size_t ik = 0; ik < warmup_kernels.size(); ++ik) { KernelBase* warmup_kernel = warmup_kernels[ik]; - getCout() << "Kernel : " << warmup_kernel->getName() << endl; - runKernel(warmup_kernel); + runKernel(warmup_kernel, true); delete warmup_kernel; warmup_kernels[ik] = nullptr; } @@ -817,10 +816,7 @@ void Executor::runSuite() for (size_t ik = 0; ik < kernels.size(); ++ik) { KernelBase* kernel = kernels[ik]; - if ( run_params.showProgress() ) { - getCout() << "\nRun kernel -- " << kernel->getName() << "\n"; - } - runKernel(kernel); + runKernel(kernel, false); } // loop over kernels } // loop over passes through suite @@ -834,13 +830,16 @@ KernelBase* Executor::makeKernel() return kernel; } -void Executor::runKernel(KernelBase* kern) +void Executor::runKernel(KernelBase* kernel, bool print_kernel_name) { + if ( run_params.showProgress() || print_kernel_name) { + getCout() << endl << "Run kernel -- " << kernel->getName() << endl; + } for (size_t iv = 0; iv < variant_ids.size(); ++iv) { VariantID vid = variant_ids[iv]; if ( run_params.showProgress() ) { - if ( kern->hasVariantDefined(vid) ) { + if ( kernel->hasVariantDefined(vid) ) { getCout() << " Running "; } else { getCout() << " No "; @@ -848,15 +847,15 @@ void Executor::runKernel(KernelBase* kern) getCout() << getVariantName(vid) << " variant" << endl; } - for (size_t tid = 0; tid < kern->getNumVariantTunings(vid); ++tid) { + for (size_t tid = 0; tid < kernel->getNumVariantTunings(vid); ++tid) { if ( run_params.showProgress() ) { getCout() << " Running " - << kern->getVariantTuningName(vid, tid) << " tuning"; + << kernel->getVariantTuningName(vid, tid) << " tuning"; } - kern->execute(vid, tid); + kernel->execute(vid, tid); if ( run_params.showProgress() ) { - getCout() << " -- " << kern->getTotTime(vid) << " sec." << endl; + getCout() << " -- " << kernel->getTotTime(vid, tid) << " sec." 
<< endl; } } } // loop over variants diff --git a/src/common/Executor.hpp b/src/common/Executor.hpp index 39308e443..44c947f30 100644 --- a/src/common/Executor.hpp +++ b/src/common/Executor.hpp @@ -63,7 +63,7 @@ class Executor template < typename Kernel > KernelBase* makeKernel(); - void runKernel(KernelBase* kern); + void runKernel(KernelBase* kern, bool print_kernel_name); std::unique_ptr openOutputFile(const std::string& filename) const; From 088a1fb64cd5e3d789f37ae862645e4d72221c55 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 15 Mar 2022 12:52:09 -0700 Subject: [PATCH 264/392] Update DAXPY_ATOMIC and NODAL_ACCUMULATION_3D They are now fixed and support multiple gpu block sizes --- src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp | 37 ++++++++++++++---- src/apps/NODAL_ACCUMULATION_3D-Hip.cpp | 37 ++++++++++++++---- src/apps/NODAL_ACCUMULATION_3D-OMP.cpp | 2 +- src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp | 2 +- src/apps/NODAL_ACCUMULATION_3D-Seq.cpp | 2 +- src/apps/NODAL_ACCUMULATION_3D.cpp | 8 ++-- src/apps/NODAL_ACCUMULATION_3D.hpp | 28 +++++++++----- src/basic/DAXPY_ATOMIC-Cuda.cpp | 40 +++++++++++++++----- src/basic/DAXPY_ATOMIC-Hip.cpp | 39 ++++++++++++++----- src/basic/DAXPY_ATOMIC-OMP.cpp | 2 +- src/basic/DAXPY_ATOMIC-OMPTarget.cpp | 2 +- src/basic/DAXPY_ATOMIC-Seq.cpp | 2 +- src/basic/DAXPY_ATOMIC.cpp | 8 ++-- src/basic/DAXPY_ATOMIC.hpp | 28 +++++++++----- 14 files changed, 171 insertions(+), 66 deletions(-) diff --git a/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp b/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp index 3d29a5257..a55711021 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp @@ -23,12 +23,6 @@ namespace rajaperf namespace apps { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define NODAL_ACCUMULATION_3D_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(x, m_x, m_nodal_array_length); \ allocAndInitCudaDeviceData(vol, m_vol, m_zonal_array_length); \ @@ -40,6 +34,8 @@ namespace apps deallocCudaDeviceData(vol); \ deallocCudaDeviceData(real_zones); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void nodal_accumulation_3d(Real_ptr vol, Real_ptr x0, Real_ptr x1, Real_ptr x2, Real_ptr x3, @@ -57,7 +53,8 @@ __global__ void nodal_accumulation_3d(Real_ptr vol, } -void NODAL_ACCUMULATION_3D::runCudaVariant(VariantID vid) +template < size_t block_size > +void NODAL_ACCUMULATION_3D::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -76,7 +73,7 @@ void NODAL_ACCUMULATION_3D::runCudaVariant(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - nodal_accumulation_3d<<>>(vol, + nodal_accumulation_3d<<>>(vol, x0, x1, x2, x3, x4, x5, x6, x7, real_zones, ibegin, iend); @@ -116,6 +113,30 @@ void NODAL_ACCUMULATION_3D::runCudaVariant(VariantID vid) } } +void NODAL_ACCUMULATION_3D::runCudaVariant(VariantID vid, size_t tid) +{ + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void NODAL_ACCUMULATION_3D::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); +} + } // end namespace apps } // 
end namespace rajaperf diff --git a/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp b/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp index f9c2f5fe6..77b84d9f3 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp @@ -23,12 +23,6 @@ namespace rajaperf namespace apps { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define NODAL_ACCUMULATION_3D_DATA_SETUP_HIP \ allocAndInitHipDeviceData(x, m_x, m_nodal_array_length); \ allocAndInitHipDeviceData(vol, m_vol, m_zonal_array_length); \ @@ -40,6 +34,8 @@ namespace apps deallocHipDeviceData(vol); \ deallocHipDeviceData(real_zones); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void nodal_accumulation_3d(Real_ptr vol, Real_ptr x0, Real_ptr x1, Real_ptr x2, Real_ptr x3, @@ -57,7 +53,8 @@ __global__ void nodal_accumulation_3d(Real_ptr vol, } -void NODAL_ACCUMULATION_3D::runHipVariant(VariantID vid) +template < size_t block_size > +void NODAL_ACCUMULATION_3D::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -76,7 +73,7 @@ void NODAL_ACCUMULATION_3D::runHipVariant(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((nodal_accumulation_3d), dim3(grid_size), dim3(block_size), 0, 0, vol, + hipLaunchKernelGGL((nodal_accumulation_3d), dim3(grid_size), dim3(block_size), 0, 0, vol, x0, x1, x2, x3, x4, x5, x6, x7, real_zones, ibegin, iend); @@ -116,6 +113,30 @@ void NODAL_ACCUMULATION_3D::runHipVariant(VariantID vid) } } +void NODAL_ACCUMULATION_3D::runHipVariant(VariantID vid, size_t tid) +{ + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void NODAL_ACCUMULATION_3D::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp b/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp index 9676e1657..3a2025170 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp @@ -20,7 +20,7 @@ namespace apps { -void NODAL_ACCUMULATION_3D::runOpenMPVariant(VariantID vid) +void NODAL_ACCUMULATION_3D::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp b/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp index b53dad400..75e7c218e 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp @@ -43,7 +43,7 @@ namespace apps deallocOpenMPDeviceData(real_zones, did); -void NODAL_ACCUMULATION_3D::runOpenMPTargetVariant(VariantID vid) +void NODAL_ACCUMULATION_3D::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp b/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp index da26a1d68..17a6674a6 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp @@ -20,7 +20,7 @@ namespace apps { -void NODAL_ACCUMULATION_3D::runSeqVariant(VariantID vid) 
+void NODAL_ACCUMULATION_3D::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/apps/NODAL_ACCUMULATION_3D.cpp b/src/apps/NODAL_ACCUMULATION_3D.cpp index d0bf52eeb..c300a0cb4 100644 --- a/src/apps/NODAL_ACCUMULATION_3D.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D.cpp @@ -74,18 +74,18 @@ NODAL_ACCUMULATION_3D::~NODAL_ACCUMULATION_3D() delete m_domain; } -void NODAL_ACCUMULATION_3D::setUp(VariantID vid) +void NODAL_ACCUMULATION_3D::setUp(VariantID vid, size_t /*tid*/) { allocAndInitDataConst(m_x, m_nodal_array_length, 0.0, vid); allocAndInitDataConst(m_vol, m_zonal_array_length, 1.0, vid); } -void NODAL_ACCUMULATION_3D::updateChecksum(VariantID vid) +void NODAL_ACCUMULATION_3D::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_x, m_nodal_array_length, checksum_scale_factor ); + checksum[vid].at(tid) += calcChecksum(m_x, m_nodal_array_length, checksum_scale_factor ); } -void NODAL_ACCUMULATION_3D::tearDown(VariantID vid) +void NODAL_ACCUMULATION_3D::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; diff --git a/src/apps/NODAL_ACCUMULATION_3D.hpp b/src/apps/NODAL_ACCUMULATION_3D.hpp index 43ff25703..04314881c 100644 --- a/src/apps/NODAL_ACCUMULATION_3D.hpp +++ b/src/apps/NODAL_ACCUMULATION_3D.hpp @@ -86,17 +86,27 @@ class NODAL_ACCUMULATION_3D : public KernelBase ~NODAL_ACCUMULATION_3D(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Real_ptr m_x; Real_ptr m_vol; diff --git a/src/basic/DAXPY_ATOMIC-Cuda.cpp b/src/basic/DAXPY_ATOMIC-Cuda.cpp index 11f230177..2eab245ff 100644 --- a/src/basic/DAXPY_ATOMIC-Cuda.cpp +++ b/src/basic/DAXPY_ATOMIC-Cuda.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define DAXPY_ATOMIC_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(x, m_x, iend); \ allocAndInitCudaDeviceData(y, m_y, iend); @@ -36,6 +30,8 @@ namespace basic deallocCudaDeviceData(x); \ deallocCudaDeviceData(y); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void daxpy_atomic(Real_ptr y, Real_ptr x, Real_type a, Index_type iend) @@ -46,7 +42,9 @@ __global__ void daxpy_atomic(Real_ptr y, Real_ptr x, } } -void DAXPY_ATOMIC::runCudaVariant(VariantID vid) + +template < size_t block_size > +void DAXPY_ATOMIC::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -62,7 +60,7 @@ void 
DAXPY_ATOMIC::runCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - daxpy_atomic<<>>( y, x, a, + daxpy_atomic<<>>( y, x, a, iend ); cudaErrchk( cudaGetLastError() ); @@ -79,7 +77,7 @@ void DAXPY_ATOMIC::runCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - lambda_cuda_forall<<>>( + lambda_cuda_forall<<>>( ibegin, iend, [=] __device__ (Index_type i) { DAXPY_ATOMIC_RAJA_BODY(RAJA::cuda_atomic); }); @@ -112,6 +110,30 @@ void DAXPY_ATOMIC::runCudaVariant(VariantID vid) } } +void DAXPY_ATOMIC::runCudaVariant(VariantID vid, size_t tid) +{ + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); +} + +void DAXPY_ATOMIC::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/DAXPY_ATOMIC-Hip.cpp b/src/basic/DAXPY_ATOMIC-Hip.cpp index 5035c51d9..42e16869e 100644 --- a/src/basic/DAXPY_ATOMIC-Hip.cpp +++ b/src/basic/DAXPY_ATOMIC-Hip.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define DAXPY_ATOMIC_DATA_SETUP_HIP \ allocAndInitHipDeviceData(x, m_x, iend); \ allocAndInitHipDeviceData(y, m_y, iend); @@ -36,6 +30,8 @@ namespace basic deallocHipDeviceData(x); \ deallocHipDeviceData(y); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void daxpy_atomic(Real_ptr y, Real_ptr x, Real_type a, Index_type iend) @@ -47,7 +43,8 @@ __global__ void daxpy_atomic(Real_ptr y, Real_ptr x, } -void DAXPY_ATOMIC::runHipVariant(VariantID vid) +template < size_t block_size > +void DAXPY_ATOMIC::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -63,7 +60,7 @@ void DAXPY_ATOMIC::runHipVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((daxpy_atomic),dim3(grid_size), dim3(block_size), 0, 0, y, x, a, + hipLaunchKernelGGL((daxpy_atomic),dim3(grid_size), dim3(block_size), 0, 0, y, x, a, iend ); hipErrchk( hipGetLastError() ); @@ -84,7 +81,7 @@ void DAXPY_ATOMIC::runHipVariant(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL(lambda_hip_forall, + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, daxpy_atomic_lambda); hipErrchk( hipGetLastError() ); @@ -115,6 +112,30 @@ void DAXPY_ATOMIC::runHipVariant(VariantID vid) } } +void DAXPY_ATOMIC::runHipVariant(VariantID vid, size_t tid) +{ + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tid == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); +} + +void DAXPY_ATOMIC::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if 
(run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/DAXPY_ATOMIC-OMP.cpp b/src/basic/DAXPY_ATOMIC-OMP.cpp index d12bd99b9..0c58d6606 100644 --- a/src/basic/DAXPY_ATOMIC-OMP.cpp +++ b/src/basic/DAXPY_ATOMIC-OMP.cpp @@ -18,7 +18,7 @@ namespace basic { -void DAXPY_ATOMIC::runOpenMPVariant(VariantID vid) +void DAXPY_ATOMIC::runOpenMPVariant(VariantID vid, size_t /*tid*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/DAXPY_ATOMIC-OMPTarget.cpp b/src/basic/DAXPY_ATOMIC-OMPTarget.cpp index e4beb5920..efbb43585 100644 --- a/src/basic/DAXPY_ATOMIC-OMPTarget.cpp +++ b/src/basic/DAXPY_ATOMIC-OMPTarget.cpp @@ -39,7 +39,7 @@ namespace basic deallocOpenMPDeviceData(y, did); -void DAXPY_ATOMIC::runOpenMPTargetVariant(VariantID vid) +void DAXPY_ATOMIC::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/DAXPY_ATOMIC-Seq.cpp b/src/basic/DAXPY_ATOMIC-Seq.cpp index 01f56305a..168eba20f 100644 --- a/src/basic/DAXPY_ATOMIC-Seq.cpp +++ b/src/basic/DAXPY_ATOMIC-Seq.cpp @@ -18,7 +18,7 @@ namespace basic { -void DAXPY_ATOMIC::runSeqVariant(VariantID vid) +void DAXPY_ATOMIC::runSeqVariant(VariantID vid, size_t /*tid*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/DAXPY_ATOMIC.cpp b/src/basic/DAXPY_ATOMIC.cpp index e5545ca8b..853c6f320 100644 --- a/src/basic/DAXPY_ATOMIC.cpp +++ b/src/basic/DAXPY_ATOMIC.cpp @@ -57,19 +57,19 @@ DAXPY_ATOMIC::~DAXPY_ATOMIC() { } -void DAXPY_ATOMIC::setUp(VariantID vid) +void DAXPY_ATOMIC::setUp(VariantID vid, size_t /*tid*/) { allocAndInitDataConst(m_y, getActualProblemSize(), 0.0, vid); allocAndInitData(m_x, getActualProblemSize(), vid); initData(m_a); } -void DAXPY_ATOMIC::updateChecksum(VariantID vid) +void DAXPY_ATOMIC::updateChecksum(VariantID vid, size_t tid) { - checksum[vid] += calcChecksum(m_y, getActualProblemSize()); + checksum[vid].at(tid) += calcChecksum(m_y, getActualProblemSize()); } -void DAXPY_ATOMIC::tearDown(VariantID vid) +void DAXPY_ATOMIC::tearDown(VariantID vid, size_t /*tid*/) { (void) vid; deallocData(m_x); diff --git a/src/basic/DAXPY_ATOMIC.hpp b/src/basic/DAXPY_ATOMIC.hpp index b557812e4..5cf3865d2 100644 --- a/src/basic/DAXPY_ATOMIC.hpp +++ b/src/basic/DAXPY_ATOMIC.hpp @@ -46,17 +46,27 @@ class DAXPY_ATOMIC : public KernelBase ~DAXPY_ATOMIC(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void setUp(VariantID vid, size_t tid); + void updateChecksum(VariantID vid, size_t tid); + void tearDown(VariantID vid, size_t tid); + + void runSeqVariant(VariantID vid, size_t tid); + void runOpenMPVariant(VariantID vid, size_t tid); + void runCudaVariant(VariantID vid, size_t tid); + void runHipVariant(VariantID vid, size_t tid); + void runOpenMPTargetVariant(VariantID vid, size_t tid); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static 
const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Real_ptr m_x; Real_ptr m_y; Real_type m_a; From 4657e457ef09e2f4d4bba0a5707d2112e6351164 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 16 Mar 2022 08:49:44 -0700 Subject: [PATCH 265/392] push empty commit to re-trigger CI From f0d3169d2211aa51f32b8673bfc0f4d65b141fbf Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 16 Mar 2022 08:51:42 -0700 Subject: [PATCH 266/392] Fix comment in script --- scripts/update_copyright.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/update_copyright.sh b/scripts/update_copyright.sh index 8f462663d..31a658efa 100755 --- a/scripts/update_copyright.sh +++ b/scripts/update_copyright.sh @@ -10,7 +10,7 @@ #============================================================================= # Change the copyright date in all files that contain the text -# "the RAJAPerf/COPYRIGHT file", which is part of the copyright statement +# "the RAJAPerf/LICENSE file", which is part of the copyright statement # at the top of each RAJA file. We use this to distinguish RAJA files from # that we do not own (e.g., other repos included as submodules), which we do # not want to modify. Note that this file and *.git files are omitted From 8c39c728ea1604b40fa0dc17f4c99126440eee3b Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 16 Mar 2022 10:37:20 -0700 Subject: [PATCH 267/392] update copyrights --- src/common/GPUUtils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp index af892051b..41db9e3fb 100644 --- a/src/common/GPUUtils.hpp +++ b/src/common/GPUUtils.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // From c5bcc0297be837486f820dfa6abf219e23b310f6 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 16 Mar 2022 14:39:20 -0700 Subject: [PATCH 268/392] Output run time when showing progress properly Previously showed the total time instead of just the time for the last execution. --- src/common/Executor.cpp | 2 +- src/common/KernelBase.hpp | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index a1ce5e93e..bc600bd69 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -855,7 +855,7 @@ void Executor::runKernel(KernelBase* kernel, bool print_kernel_name) } kernel->execute(vid, tid); if ( run_params.showProgress() ) { - getCout() << " -- " << kernel->getTotTime(vid, tid) << " sec." << endl; + getCout() << " -- " << kernel->getLastTime() << " sec." 
<< endl; } } } // loop over variants diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 900648830..b194a35f2 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -155,6 +155,10 @@ class KernelBase return false; } + // get runtime of executed variant/tuning + double getLastTime() const { return timer.elapsed(); } + + // get timers accumulated over npasses double getMinTime(VariantID vid, size_t tid) const { return min_time[vid].at(tid); } double getMaxTime(VariantID vid, size_t tid) const { return max_time[vid].at(tid); } double getTotTime(VariantID vid, size_t tid) { return tot_time[vid].at(tid); } From a7f646d92e519743bb1231630be7b2aff893f725 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 17 Mar 2022 10:16:28 -0700 Subject: [PATCH 269/392] Update RAJA to v2022.03.0 release --- tpl/RAJA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpl/RAJA b/tpl/RAJA index 648dc73b2..4351fe6a5 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit 648dc73b26221f859d03b34fafa1ab7d6b3b661b +Subproject commit 4351fe6a50bd579511a625b017c9e054885e7fd2 From 93192f622878898bfaa5bf72bbd96f2bb6cb8027 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 17 Mar 2022 10:30:34 -0700 Subject: [PATCH 270/392] Remove uberenv, radiuss-spack-configs submodules and attempt to use RAJA versions of these in Gitlab CI --- .gitmodules | 6 ------ .uberenv_config.json | 4 ++-- scripts/gitlab/build_and_test.sh | 2 +- scripts/radiuss-spack-configs | 1 - scripts/uberenv | 1 - 5 files changed, 3 insertions(+), 11 deletions(-) delete mode 160000 scripts/radiuss-spack-configs delete mode 160000 scripts/uberenv diff --git a/.gitmodules b/.gitmodules index 047c42160..13f05ecd3 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,9 +4,3 @@ [submodule "tpl/RAJA"] path = tpl/RAJA url = https://github.com/LLNL/RAJA.git -[submodule "scripts/radiuss-spack-configs"] - path = scripts/radiuss-spack-configs - url = https://github.com/LLNL/radiuss-spack-configs.git -[submodule "scripts/uberenv"] - path = scripts/uberenv - url = https://github.com/LLNL/uberenv.git diff --git a/.uberenv_config.json b/.uberenv_config.json index c60de2b4b..1e59c48f4 100644 --- a/.uberenv_config.json +++ b/.uberenv_config.json @@ -7,6 +7,6 @@ "spack_branch": "feature/allow-untested-cuda-versions", "spack_commit": "46b22d0f6227f6b12bab712bda5b916a53cfc67d", "spack_activate" : {}, -"spack_configs_path": "scripts/radiuss-spack-configs", -"spack_packages_path": "scripts/spack_packages" +"spack_configs_path": "tpl/RAJA/scripts/radiuss-spack-configs", +"spack_packages_path": "tpl/RAJA/scripts/spack_packages" } diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index 04d7d0aed..493262dd1 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -56,7 +56,7 @@ then prefix_opt="--prefix=${prefix}" fi - python3 scripts/uberenv/uberenv.py --spec="${spec}" ${prefix_opt} + python3 tpl/RAJA/scripts/uberenv/uberenv.py --spec="${spec}" ${prefix_opt} fi date diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs deleted file mode 160000 index 7759bb0f9..000000000 --- a/scripts/radiuss-spack-configs +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 7759bb0f9f04fee0b85e9afc065cf2b5445c849e diff --git a/scripts/uberenv b/scripts/uberenv deleted file mode 160000 index 105e384f5..000000000 --- a/scripts/uberenv +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 105e384f585e2391c42b2def93124a6580319c1c From
923c97c3c885a7cf7d26930fd1d306c314794959 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 17 Mar 2022 10:40:25 -0700 Subject: [PATCH 271/392] Fix path to spack_packages --- .uberenv_config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.uberenv_config.json b/.uberenv_config.json index 1e59c48f4..6b9b7ca68 100644 --- a/.uberenv_config.json +++ b/.uberenv_config.json @@ -8,5 +8,5 @@ "spack_commit": "46b22d0f6227f6b12bab712bda5b916a53cfc67d", "spack_activate" : {}, "spack_configs_path": "tpl/RAJA/scripts/radiuss-spack-configs", -"spack_packages_path": "tpl/RAJA/scripts/spack_packages" +"spack_packages_path": "scripts/spack_packages" } From 6c405bd76b98f1e46c13f6b682387ded0a18381a Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 17 Mar 2022 11:04:38 -0700 Subject: [PATCH 272/392] Try fixing path to uberenv_config.json file. --- scripts/gitlab/build_and_test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index 493262dd1..41f6f52fa 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -56,7 +56,7 @@ then prefix_opt="--prefix=${prefix}" fi - python3 tpl/RAJA/scripts/uberenv/uberenv.py --spec="${spec}" ${prefix_opt} + python3 tpl/RAJA/scripts/uberenv/uberenv.py --project-json=".uberenv_config.json" --spec="${spec}" ${prefix_opt} fi date From b30190285d4d869a91452e1652eea004ce3275d9 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 17 Mar 2022 13:17:45 -0700 Subject: [PATCH 273/392] Change path to spack-generated host-config file. --- scripts/gitlab/build_and_test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index 41f6f52fa..5ee6815a8 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -66,7 +66,7 @@ if [[ -z ${hostconfig} ]] then # If no host config file was provided, we assume it was generated. # This means we are looking of a unique one in project dir. - hostconfigs=( $( ls "${project_dir}/"hc-*.cmake ) ) + hostconfigs=( $( ls "${project_dir}/tpl/RAJA/"hc-*.cmake ) ) if [[ ${#hostconfigs[@]} == 1 ]] then hostconfig_path=${hostconfigs[0]} From 16fab4270a4cd4b679549d3d1af400eea7e0e881 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 17 Mar 2022 15:20:02 -0700 Subject: [PATCH 274/392] mv host-config to top-level project directory --- scripts/gitlab/build_and_test.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index 5ee6815a8..722297930 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -58,6 +58,8 @@ then python3 tpl/RAJA/scripts/uberenv/uberenv.py --project-json=".uberenv_config.json" --spec="${spec}" ${prefix_opt} + mv ${project_dir}/tpl/RAJA/hc-*.cmake ${project_dir}/. + fi date @@ -66,7 +68,7 @@ if [[ -z ${hostconfig} ]] then # If no host config file was provided, we assume it was generated. # This means we are looking of a unique one in project dir. 
- hostconfigs=( $( ls "${project_dir}/tpl/RAJA/"hc-*.cmake ) ) + hostconfigs=( $( ls "${project_dir}/"hc-*.cmake ) ) if [[ ${#hostconfigs[@]} == 1 ]] then hostconfig_path=${hostconfigs[0]} From 1f13a7109cc71802ec621b19d63936409df52573 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 17 Mar 2022 15:21:03 -0700 Subject: [PATCH 275/392] Update amdclang build script --- scripts/lc-builds/toss4_amdclang.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/lc-builds/toss4_amdclang.sh b/scripts/lc-builds/toss4_amdclang.sh index 31de03725..655990fa1 100755 --- a/scripts/lc-builds/toss4_amdclang.sh +++ b/scripts/lc-builds/toss4_amdclang.sh @@ -23,12 +23,11 @@ COMP_VER=$1 COMP_ARCH=$2 shift 2 -MY_HIP_ARCH_FLAGS="--offload-arch=${COMP_ARCH}" HOSTCONFIG="hip_3_X" if [[ ${COMP_VER} == 4.* ]] then -##HIP_CLANG_FLAGS="${MY_HIP_ARCH_FLAGS} -mllvm -amdgpu-fixed-function-abi=1" +##HIP_CLANG_FLAGS="-mllvm -amdgpu-fixed-function-abi=1" HOSTCONFIG="hip_4_link_X" elif [[ ${COMP_VER} == 3.* ]] then @@ -64,7 +63,7 @@ cmake \ -DHIP_PATH=/opt/rocm-${COMP_VER}/llvm/bin \ -DCMAKE_C_COMPILER=/opt/rocm-${COMP_VER}/llvm/bin/amdclang \ -DCMAKE_CXX_COMPILER=/opt/rocm-${COMP_VER}/llvm/bin/amdclang++ \ - -DCMAKE_HIP_ARCHITECTURES="${MY_HIP_ARCH_FLAGS}" \ + -DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}" \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_HIP=ON \ -DENABLE_OPENMP=OFF \ From 2e7b8c90946d7482d4ec40675f6c0cc87aab8d98 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 17 Mar 2022 16:30:17 -0700 Subject: [PATCH 276/392] Rename tid -> tune_idx --- src/algorithm/SORT-Cuda.cpp | 2 +- src/algorithm/SORT-Hip.cpp | 2 +- src/algorithm/SORT-OMP.cpp | 2 +- src/algorithm/SORT-Seq.cpp | 2 +- src/algorithm/SORT.cpp | 8 +- src/algorithm/SORT.hpp | 18 +-- src/algorithm/SORTPAIRS-Cuda.cpp | 2 +- src/algorithm/SORTPAIRS-Hip.cpp | 2 +- src/algorithm/SORTPAIRS-OMP.cpp | 2 +- src/algorithm/SORTPAIRS-Seq.cpp | 2 +- src/algorithm/SORTPAIRS.cpp | 10 +- src/algorithm/SORTPAIRS.hpp | 18 +-- src/apps/DEL_DOT_VEC_2D-Cuda.cpp | 4 +- src/apps/DEL_DOT_VEC_2D-Hip.cpp | 4 +- src/apps/DEL_DOT_VEC_2D-OMP.cpp | 2 +- src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp | 2 +- src/apps/DEL_DOT_VEC_2D-Seq.cpp | 2 +- src/apps/DEL_DOT_VEC_2D.cpp | 8 +- src/apps/DEL_DOT_VEC_2D.hpp | 18 +-- src/apps/DIFFUSION3DPA-Cuda.cpp | 2 +- src/apps/DIFFUSION3DPA-Hip.cpp | 2 +- src/apps/DIFFUSION3DPA-OMP.cpp | 2 +- src/apps/DIFFUSION3DPA-OMPTarget.cpp | 2 +- src/apps/DIFFUSION3DPA-Seq.cpp | 2 +- src/apps/DIFFUSION3DPA.cpp | 8 +- src/apps/DIFFUSION3DPA.hpp | 18 +-- src/apps/ENERGY-Cuda.cpp | 4 +- src/apps/ENERGY-Hip.cpp | 4 +- src/apps/ENERGY-OMP.cpp | 2 +- src/apps/ENERGY-OMPTarget.cpp | 2 +- src/apps/ENERGY-Seq.cpp | 2 +- src/apps/ENERGY.cpp | 10 +- src/apps/ENERGY.hpp | 18 +-- src/apps/FIR-Cuda.cpp | 4 +- src/apps/FIR-Hip.cpp | 4 +- src/apps/FIR-OMP.cpp | 2 +- src/apps/FIR-OMPTarget.cpp | 2 +- src/apps/FIR-Seq.cpp | 2 +- src/apps/FIR.cpp | 8 +- src/apps/FIR.hpp | 18 +-- src/apps/HALOEXCHANGE-Cuda.cpp | 4 +- src/apps/HALOEXCHANGE-Hip.cpp | 4 +- src/apps/HALOEXCHANGE-OMP.cpp | 2 +- src/apps/HALOEXCHANGE-OMPTarget.cpp | 2 +- src/apps/HALOEXCHANGE-Seq.cpp | 2 +- src/apps/HALOEXCHANGE.cpp | 8 +- src/apps/HALOEXCHANGE.hpp | 18 +-- src/apps/HALOEXCHANGE_FUSED-Cuda.cpp | 4 +- src/apps/HALOEXCHANGE_FUSED-Hip.cpp | 4 +- src/apps/HALOEXCHANGE_FUSED-OMP.cpp | 2 +- src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp | 2 +- src/apps/HALOEXCHANGE_FUSED-Seq.cpp | 2 +- src/apps/HALOEXCHANGE_FUSED.cpp | 8 +- src/apps/HALOEXCHANGE_FUSED.hpp | 18 +-- src/apps/LTIMES-Cuda.cpp | 4 +- 
src/apps/LTIMES-Hip.cpp | 4 +- src/apps/LTIMES-OMP.cpp | 2 +- src/apps/LTIMES-OMPTarget.cpp | 2 +- src/apps/LTIMES-Seq.cpp | 2 +- src/apps/LTIMES.cpp | 8 +- src/apps/LTIMES.hpp | 18 +-- src/apps/LTIMES_NOVIEW-Cuda.cpp | 4 +- src/apps/LTIMES_NOVIEW-Hip.cpp | 4 +- src/apps/LTIMES_NOVIEW-OMP.cpp | 2 +- src/apps/LTIMES_NOVIEW-OMPTarget.cpp | 2 +- src/apps/LTIMES_NOVIEW-Seq.cpp | 2 +- src/apps/LTIMES_NOVIEW.cpp | 8 +- src/apps/LTIMES_NOVIEW.hpp | 18 +-- src/apps/MASS3DPA-Cuda.cpp | 2 +- src/apps/MASS3DPA-Hip.cpp | 2 +- src/apps/MASS3DPA-OMP.cpp | 2 +- src/apps/MASS3DPA-OMPTarget.cpp | 2 +- src/apps/MASS3DPA-Seq.cpp | 2 +- src/apps/MASS3DPA.cpp | 8 +- src/apps/MASS3DPA.hpp | 18 +-- src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp | 4 +- src/apps/NODAL_ACCUMULATION_3D-Hip.cpp | 4 +- src/apps/NODAL_ACCUMULATION_3D-OMP.cpp | 2 +- src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp | 2 +- src/apps/NODAL_ACCUMULATION_3D-Seq.cpp | 2 +- src/apps/NODAL_ACCUMULATION_3D.cpp | 8 +- src/apps/NODAL_ACCUMULATION_3D.hpp | 18 +-- src/apps/PRESSURE-Cuda.cpp | 4 +- src/apps/PRESSURE-Hip.cpp | 4 +- src/apps/PRESSURE-OMP.cpp | 2 +- src/apps/PRESSURE-OMPTarget.cpp | 2 +- src/apps/PRESSURE-Seq.cpp | 2 +- src/apps/PRESSURE.cpp | 8 +- src/apps/PRESSURE.hpp | 18 +-- src/apps/VOL3D-Cuda.cpp | 4 +- src/apps/VOL3D-Hip.cpp | 4 +- src/apps/VOL3D-OMP.cpp | 2 +- src/apps/VOL3D-OMPTarget.cpp | 2 +- src/apps/VOL3D-Seq.cpp | 2 +- src/apps/VOL3D.cpp | 8 +- src/apps/VOL3D.hpp | 18 +-- src/apps/WIP-COUPLE.cpp | 20 ++-- src/apps/WIP-COUPLE.hpp | 20 ++-- src/basic/DAXPY-Cuda.cpp | 4 +- src/basic/DAXPY-Hip.cpp | 4 +- src/basic/DAXPY-OMP.cpp | 2 +- src/basic/DAXPY-OMPTarget.cpp | 2 +- src/basic/DAXPY-Seq.cpp | 2 +- src/basic/DAXPY.cpp | 8 +- src/basic/DAXPY.hpp | 18 +-- src/basic/DAXPY_ATOMIC-Cuda.cpp | 4 +- src/basic/DAXPY_ATOMIC-Hip.cpp | 4 +- src/basic/DAXPY_ATOMIC-OMP.cpp | 2 +- src/basic/DAXPY_ATOMIC-OMPTarget.cpp | 2 +- src/basic/DAXPY_ATOMIC-Seq.cpp | 2 +- src/basic/DAXPY_ATOMIC.cpp | 8 +- src/basic/DAXPY_ATOMIC.hpp | 18 +-- src/basic/IF_QUAD-Cuda.cpp | 4 +- src/basic/IF_QUAD-Hip.cpp | 4 +- src/basic/IF_QUAD-OMP.cpp | 2 +- src/basic/IF_QUAD-OMPTarget.cpp | 2 +- src/basic/IF_QUAD-Seq.cpp | 2 +- src/basic/IF_QUAD.cpp | 10 +- src/basic/IF_QUAD.hpp | 18 +-- src/basic/INIT3-Cuda.cpp | 4 +- src/basic/INIT3-Hip.cpp | 4 +- src/basic/INIT3-OMP.cpp | 2 +- src/basic/INIT3-OMPTarget.cpp | 2 +- src/basic/INIT3-Seq.cpp | 2 +- src/basic/INIT3.cpp | 12 +- src/basic/INIT3.hpp | 18 +-- src/basic/INIT_VIEW1D-Cuda.cpp | 4 +- src/basic/INIT_VIEW1D-Hip.cpp | 4 +- src/basic/INIT_VIEW1D-OMP.cpp | 2 +- src/basic/INIT_VIEW1D-OMPTarget.cpp | 2 +- src/basic/INIT_VIEW1D-Seq.cpp | 2 +- src/basic/INIT_VIEW1D.cpp | 8 +- src/basic/INIT_VIEW1D.hpp | 18 +-- src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp | 4 +- src/basic/INIT_VIEW1D_OFFSET-Hip.cpp | 4 +- src/basic/INIT_VIEW1D_OFFSET-OMP.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET-Seq.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET.cpp | 8 +- src/basic/INIT_VIEW1D_OFFSET.hpp | 18 +-- src/basic/MAT_MAT_SHARED-Cuda.cpp | 4 +- src/basic/MAT_MAT_SHARED-Hip.cpp | 4 +- src/basic/MAT_MAT_SHARED-OMP.cpp | 2 +- src/basic/MAT_MAT_SHARED-OMPTarget.cpp | 2 +- src/basic/MAT_MAT_SHARED-Seq.cpp | 2 +- src/basic/MAT_MAT_SHARED.cpp | 8 +- src/basic/MAT_MAT_SHARED.hpp | 18 +-- src/basic/MULADDSUB-Cuda.cpp | 4 +- src/basic/MULADDSUB-Hip.cpp | 4 +- src/basic/MULADDSUB-OMP.cpp | 2 +- src/basic/MULADDSUB-OMPTarget.cpp | 2 +- src/basic/MULADDSUB-Seq.cpp | 2 +- src/basic/MULADDSUB.cpp | 12 +- src/basic/MULADDSUB.hpp | 18 +-- 
src/basic/NESTED_INIT-Cuda.cpp | 4 +- src/basic/NESTED_INIT-Hip.cpp | 4 +- src/basic/NESTED_INIT-OMP.cpp | 2 +- src/basic/NESTED_INIT-OMPTarget.cpp | 2 +- src/basic/NESTED_INIT-Seq.cpp | 2 +- src/basic/NESTED_INIT.cpp | 8 +- src/basic/NESTED_INIT.hpp | 18 +-- src/basic/PI_ATOMIC-Cuda.cpp | 4 +- src/basic/PI_ATOMIC-Hip.cpp | 4 +- src/basic/PI_ATOMIC-OMP.cpp | 2 +- src/basic/PI_ATOMIC-OMPTarget.cpp | 2 +- src/basic/PI_ATOMIC-Seq.cpp | 2 +- src/basic/PI_ATOMIC.cpp | 8 +- src/basic/PI_ATOMIC.hpp | 18 +-- src/basic/PI_REDUCE-Cuda.cpp | 4 +- src/basic/PI_REDUCE-Hip.cpp | 4 +- src/basic/PI_REDUCE-OMP.cpp | 2 +- src/basic/PI_REDUCE-OMPTarget.cpp | 2 +- src/basic/PI_REDUCE-Seq.cpp | 2 +- src/basic/PI_REDUCE.cpp | 8 +- src/basic/PI_REDUCE.hpp | 18 +-- src/basic/REDUCE3_INT-Cuda.cpp | 4 +- src/basic/REDUCE3_INT-Hip.cpp | 4 +- src/basic/REDUCE3_INT-OMP.cpp | 2 +- src/basic/REDUCE3_INT-OMPTarget.cpp | 2 +- src/basic/REDUCE3_INT-Seq.cpp | 2 +- src/basic/REDUCE3_INT.cpp | 12 +- src/basic/REDUCE3_INT.hpp | 18 +-- src/basic/TRAP_INT-Cuda.cpp | 4 +- src/basic/TRAP_INT-Hip.cpp | 4 +- src/basic/TRAP_INT-OMP.cpp | 2 +- src/basic/TRAP_INT-OMPTarget.cpp | 2 +- src/basic/TRAP_INT-Seq.cpp | 2 +- src/basic/TRAP_INT.cpp | 8 +- src/basic/TRAP_INT.hpp | 18 +-- src/common/Executor.cpp | 112 +++++++++--------- src/common/Executor.hpp | 4 +- src/common/KernelBase.cpp | 26 ++-- src/common/KernelBase.hpp | 42 +++---- src/lcals/DIFF_PREDICT-Cuda.cpp | 4 +- src/lcals/DIFF_PREDICT-Hip.cpp | 4 +- src/lcals/DIFF_PREDICT-OMP.cpp | 2 +- src/lcals/DIFF_PREDICT-OMPTarget.cpp | 2 +- src/lcals/DIFF_PREDICT-Seq.cpp | 2 +- src/lcals/DIFF_PREDICT.cpp | 8 +- src/lcals/DIFF_PREDICT.hpp | 18 +-- src/lcals/EOS-Cuda.cpp | 4 +- src/lcals/EOS-Hip.cpp | 4 +- src/lcals/EOS-OMP.cpp | 2 +- src/lcals/EOS-OMPTarget.cpp | 2 +- src/lcals/EOS-Seq.cpp | 2 +- src/lcals/EOS.cpp | 8 +- src/lcals/EOS.hpp | 18 +-- src/lcals/FIRST_DIFF-Cuda.cpp | 4 +- src/lcals/FIRST_DIFF-Hip.cpp | 4 +- src/lcals/FIRST_DIFF-OMP.cpp | 2 +- src/lcals/FIRST_DIFF-OMPTarget.cpp | 2 +- src/lcals/FIRST_DIFF-Seq.cpp | 2 +- src/lcals/FIRST_DIFF.cpp | 8 +- src/lcals/FIRST_DIFF.hpp | 18 +-- src/lcals/FIRST_MIN-Cuda.cpp | 4 +- src/lcals/FIRST_MIN-Hip.cpp | 4 +- src/lcals/FIRST_MIN-OMP.cpp | 2 +- src/lcals/FIRST_MIN-OMPTarget.cpp | 2 +- src/lcals/FIRST_MIN-Seq.cpp | 2 +- src/lcals/FIRST_MIN.cpp | 8 +- src/lcals/FIRST_MIN.hpp | 18 +-- src/lcals/FIRST_SUM-Cuda.cpp | 4 +- src/lcals/FIRST_SUM-Hip.cpp | 4 +- src/lcals/FIRST_SUM-OMP.cpp | 2 +- src/lcals/FIRST_SUM-OMPTarget.cpp | 2 +- src/lcals/FIRST_SUM-Seq.cpp | 2 +- src/lcals/FIRST_SUM.cpp | 8 +- src/lcals/FIRST_SUM.hpp | 18 +-- src/lcals/GEN_LIN_RECUR-Cuda.cpp | 4 +- src/lcals/GEN_LIN_RECUR-Hip.cpp | 4 +- src/lcals/GEN_LIN_RECUR-OMP.cpp | 2 +- src/lcals/GEN_LIN_RECUR-OMPTarget.cpp | 2 +- src/lcals/GEN_LIN_RECUR-Seq.cpp | 2 +- src/lcals/GEN_LIN_RECUR.cpp | 8 +- src/lcals/GEN_LIN_RECUR.hpp | 18 +-- src/lcals/HYDRO_1D-Cuda.cpp | 4 +- src/lcals/HYDRO_1D-Hip.cpp | 4 +- src/lcals/HYDRO_1D-OMP.cpp | 2 +- src/lcals/HYDRO_1D-OMPTarget.cpp | 2 +- src/lcals/HYDRO_1D-Seq.cpp | 2 +- src/lcals/HYDRO_1D.cpp | 8 +- src/lcals/HYDRO_1D.hpp | 18 +-- src/lcals/HYDRO_2D-Cuda.cpp | 4 +- src/lcals/HYDRO_2D-Hip.cpp | 4 +- src/lcals/HYDRO_2D-OMP.cpp | 2 +- src/lcals/HYDRO_2D-OMPTarget.cpp | 2 +- src/lcals/HYDRO_2D-Seq.cpp | 2 +- src/lcals/HYDRO_2D.cpp | 10 +- src/lcals/HYDRO_2D.hpp | 18 +-- src/lcals/INT_PREDICT-Cuda.cpp | 4 +- src/lcals/INT_PREDICT-Hip.cpp | 4 +- src/lcals/INT_PREDICT-OMP.cpp | 2 +- src/lcals/INT_PREDICT-OMPTarget.cpp | 2 +- 
src/lcals/INT_PREDICT-Seq.cpp | 2 +- src/lcals/INT_PREDICT.cpp | 8 +- src/lcals/INT_PREDICT.hpp | 18 +-- src/lcals/PLANCKIAN-Cuda.cpp | 4 +- src/lcals/PLANCKIAN-Hip.cpp | 4 +- src/lcals/PLANCKIAN-OMP.cpp | 2 +- src/lcals/PLANCKIAN-OMPTarget.cpp | 2 +- src/lcals/PLANCKIAN-Seq.cpp | 2 +- src/lcals/PLANCKIAN.cpp | 8 +- src/lcals/PLANCKIAN.hpp | 18 +-- src/lcals/TRIDIAG_ELIM-Cuda.cpp | 4 +- src/lcals/TRIDIAG_ELIM-Hip.cpp | 4 +- src/lcals/TRIDIAG_ELIM-OMP.cpp | 2 +- src/lcals/TRIDIAG_ELIM-OMPTarget.cpp | 2 +- src/lcals/TRIDIAG_ELIM-Seq.cpp | 2 +- src/lcals/TRIDIAG_ELIM.cpp | 8 +- src/lcals/TRIDIAG_ELIM.hpp | 18 +-- src/polybench/POLYBENCH_2MM-Cuda.cpp | 4 +- src/polybench/POLYBENCH_2MM-Hip.cpp | 4 +- src/polybench/POLYBENCH_2MM-OMP.cpp | 2 +- src/polybench/POLYBENCH_2MM-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_2MM-Seq.cpp | 2 +- src/polybench/POLYBENCH_2MM.cpp | 8 +- src/polybench/POLYBENCH_2MM.hpp | 18 +-- src/polybench/POLYBENCH_3MM-Cuda.cpp | 4 +- src/polybench/POLYBENCH_3MM-Hip.cpp | 4 +- src/polybench/POLYBENCH_3MM-OMP.cpp | 2 +- src/polybench/POLYBENCH_3MM-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_3MM-Seq.cpp | 2 +- src/polybench/POLYBENCH_3MM.cpp | 8 +- src/polybench/POLYBENCH_3MM.hpp | 18 +-- src/polybench/POLYBENCH_ADI-Cuda.cpp | 4 +- src/polybench/POLYBENCH_ADI-Hip.cpp | 4 +- src/polybench/POLYBENCH_ADI-OMP.cpp | 2 +- src/polybench/POLYBENCH_ADI-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_ADI-Seq.cpp | 2 +- src/polybench/POLYBENCH_ADI.cpp | 8 +- src/polybench/POLYBENCH_ADI.hpp | 18 +-- src/polybench/POLYBENCH_ATAX-Cuda.cpp | 4 +- src/polybench/POLYBENCH_ATAX-Hip.cpp | 4 +- src/polybench/POLYBENCH_ATAX-OMP.cpp | 2 +- src/polybench/POLYBENCH_ATAX-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_ATAX-Seq.cpp | 2 +- src/polybench/POLYBENCH_ATAX.cpp | 8 +- src/polybench/POLYBENCH_ATAX.hpp | 18 +-- src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp | 4 +- src/polybench/POLYBENCH_FDTD_2D-Hip.cpp | 4 +- src/polybench/POLYBENCH_FDTD_2D-OMP.cpp | 2 +- src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_FDTD_2D-Seq.cpp | 2 +- src/polybench/POLYBENCH_FDTD_2D.cpp | 8 +- src/polybench/POLYBENCH_FDTD_2D.hpp | 18 +-- .../POLYBENCH_FLOYD_WARSHALL-Cuda.cpp | 4 +- .../POLYBENCH_FLOYD_WARSHALL-Hip.cpp | 4 +- .../POLYBENCH_FLOYD_WARSHALL-OMP.cpp | 2 +- .../POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp | 2 +- .../POLYBENCH_FLOYD_WARSHALL-Seq.cpp | 2 +- src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp | 8 +- src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp | 18 +-- src/polybench/POLYBENCH_GEMM-Cuda.cpp | 4 +- src/polybench/POLYBENCH_GEMM-Hip.cpp | 4 +- src/polybench/POLYBENCH_GEMM-OMP.cpp | 2 +- src/polybench/POLYBENCH_GEMM-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_GEMM-Seq.cpp | 2 +- src/polybench/POLYBENCH_GEMM.cpp | 8 +- src/polybench/POLYBENCH_GEMM.hpp | 18 +-- src/polybench/POLYBENCH_GEMVER-Cuda.cpp | 4 +- src/polybench/POLYBENCH_GEMVER-Hip.cpp | 4 +- src/polybench/POLYBENCH_GEMVER-OMP.cpp | 2 +- src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_GEMVER-Seq.cpp | 2 +- src/polybench/POLYBENCH_GEMVER.cpp | 8 +- src/polybench/POLYBENCH_GEMVER.hpp | 18 +-- src/polybench/POLYBENCH_GESUMMV-Cuda.cpp | 4 +- src/polybench/POLYBENCH_GESUMMV-Hip.cpp | 4 +- src/polybench/POLYBENCH_GESUMMV-OMP.cpp | 2 +- src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_GESUMMV-Seq.cpp | 2 +- src/polybench/POLYBENCH_GESUMMV.cpp | 8 +- src/polybench/POLYBENCH_GESUMMV.hpp | 18 +-- src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp | 4 +- src/polybench/POLYBENCH_HEAT_3D-Hip.cpp | 4 +- 
src/polybench/POLYBENCH_HEAT_3D-OMP.cpp | 2 +- src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_HEAT_3D-Seq.cpp | 2 +- src/polybench/POLYBENCH_HEAT_3D.cpp | 10 +- src/polybench/POLYBENCH_HEAT_3D.hpp | 18 +-- src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp | 4 +- src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp | 4 +- src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp | 2 +- .../POLYBENCH_JACOBI_1D-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_1D.cpp | 10 +- src/polybench/POLYBENCH_JACOBI_1D.hpp | 18 +-- src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp | 4 +- src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp | 4 +- src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp | 2 +- .../POLYBENCH_JACOBI_2D-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_2D.cpp | 10 +- src/polybench/POLYBENCH_JACOBI_2D.hpp | 18 +-- src/polybench/POLYBENCH_MVT-Cuda.cpp | 4 +- src/polybench/POLYBENCH_MVT-Hip.cpp | 4 +- src/polybench/POLYBENCH_MVT-OMP.cpp | 2 +- src/polybench/POLYBENCH_MVT-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_MVT-Seq.cpp | 2 +- src/polybench/POLYBENCH_MVT.cpp | 10 +- src/polybench/POLYBENCH_MVT.hpp | 18 +-- src/stream/ADD-Cuda.cpp | 4 +- src/stream/ADD-Hip.cpp | 4 +- src/stream/ADD-OMP.cpp | 2 +- src/stream/ADD-OMPTarget.cpp | 2 +- src/stream/ADD-Seq.cpp | 2 +- src/stream/ADD.cpp | 8 +- src/stream/ADD.hpp | 18 +-- src/stream/COPY-Cuda.cpp | 4 +- src/stream/COPY-Hip.cpp | 4 +- src/stream/COPY-OMP.cpp | 2 +- src/stream/COPY-OMPTarget.cpp | 2 +- src/stream/COPY-Seq.cpp | 2 +- src/stream/COPY.cpp | 8 +- src/stream/COPY.hpp | 18 +-- src/stream/DOT-Cuda.cpp | 4 +- src/stream/DOT-Hip.cpp | 4 +- src/stream/DOT-OMP.cpp | 2 +- src/stream/DOT-OMPTarget.cpp | 2 +- src/stream/DOT-Seq.cpp | 2 +- src/stream/DOT.cpp | 8 +- src/stream/DOT.hpp | 18 +-- src/stream/MUL-Cuda.cpp | 4 +- src/stream/MUL-Hip.cpp | 4 +- src/stream/MUL-OMP.cpp | 2 +- src/stream/MUL-OMPTarget.cpp | 2 +- src/stream/MUL-Seq.cpp | 2 +- src/stream/MUL.cpp | 8 +- src/stream/MUL.hpp | 18 +-- src/stream/TRIAD-Cuda.cpp | 4 +- src/stream/TRIAD-Hip.cpp | 4 +- src/stream/TRIAD-OMP.cpp | 2 +- src/stream/TRIAD-OMPTarget.cpp | 2 +- src/stream/TRIAD-Seq.cpp | 2 +- src/stream/TRIAD.cpp | 8 +- src/stream/TRIAD.hpp | 18 +-- 396 files changed, 1236 insertions(+), 1236 deletions(-) diff --git a/src/algorithm/SORT-Cuda.cpp b/src/algorithm/SORT-Cuda.cpp index eb7ff15a5..abba6336f 100644 --- a/src/algorithm/SORT-Cuda.cpp +++ b/src/algorithm/SORT-Cuda.cpp @@ -35,7 +35,7 @@ namespace algorithm deallocCudaDeviceData(x); -void SORT::runCudaVariant(VariantID vid, size_t /*tid*/) +void SORT::runCudaVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/algorithm/SORT-Hip.cpp b/src/algorithm/SORT-Hip.cpp index b4bcb0281..aa6ab1a03 100644 --- a/src/algorithm/SORT-Hip.cpp +++ b/src/algorithm/SORT-Hip.cpp @@ -35,7 +35,7 @@ namespace algorithm deallocHipDeviceData(x); -void SORT::runHipVariant(VariantID vid, size_t /*tid*/) +void SORT::runHipVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/algorithm/SORT-OMP.cpp b/src/algorithm/SORT-OMP.cpp index 0330926ad..0528bcbce 100644 --- a/src/algorithm/SORT-OMP.cpp +++ b/src/algorithm/SORT-OMP.cpp @@ -18,7 +18,7 @@ namespace algorithm { -void SORT::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void SORT::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if 
defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/algorithm/SORT-Seq.cpp b/src/algorithm/SORT-Seq.cpp index 4da7b1fbf..0091efdfc 100644 --- a/src/algorithm/SORT-Seq.cpp +++ b/src/algorithm/SORT-Seq.cpp @@ -18,7 +18,7 @@ namespace algorithm { -void SORT::runSeqVariant(VariantID vid, size_t /*tid*/) +void SORT::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/algorithm/SORT.cpp b/src/algorithm/SORT.cpp index 70b76fb4f..6a199b53a 100644 --- a/src/algorithm/SORT.cpp +++ b/src/algorithm/SORT.cpp @@ -47,17 +47,17 @@ SORT::~SORT() { } -void SORT::setUp(VariantID vid, size_t /*tid*/) +void SORT::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitDataRandValue(m_x, getActualProblemSize()*getRunReps(), vid); } -void SORT::updateChecksum(VariantID vid, size_t tid) +void SORT::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_x, getActualProblemSize()*getRunReps()); + checksum[vid][tune_idx] += calcChecksum(m_x, getActualProblemSize()*getRunReps()); } -void SORT::tearDown(VariantID vid, size_t /*tid*/) +void SORT::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_x); diff --git a/src/algorithm/SORT.hpp b/src/algorithm/SORT.hpp index 1cd7dc837..5999ea637 100644 --- a/src/algorithm/SORT.hpp +++ b/src/algorithm/SORT.hpp @@ -42,15 +42,15 @@ class SORT : public KernelBase ~SORT(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { getCout() << "\n SORT : Unknown OMP Target variant id = " << vid << std::endl; } diff --git a/src/algorithm/SORTPAIRS-Cuda.cpp b/src/algorithm/SORTPAIRS-Cuda.cpp index a551bf7c7..c44186ff2 100644 --- a/src/algorithm/SORTPAIRS-Cuda.cpp +++ b/src/algorithm/SORTPAIRS-Cuda.cpp @@ -38,7 +38,7 @@ namespace algorithm deallocCudaDeviceData(i); -void SORTPAIRS::runCudaVariant(VariantID vid, size_t /*tid*/) +void SORTPAIRS::runCudaVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/algorithm/SORTPAIRS-Hip.cpp b/src/algorithm/SORTPAIRS-Hip.cpp index 13e6d5a89..884b401e0 100644 --- a/src/algorithm/SORTPAIRS-Hip.cpp +++ b/src/algorithm/SORTPAIRS-Hip.cpp @@ -38,7 +38,7 @@ namespace algorithm deallocHipDeviceData(i); -void SORTPAIRS::runHipVariant(VariantID vid, size_t /*tid*/) +void SORTPAIRS::runHipVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/algorithm/SORTPAIRS-OMP.cpp b/src/algorithm/SORTPAIRS-OMP.cpp index e5285fd80..e36ec1466 100644 --- a/src/algorithm/SORTPAIRS-OMP.cpp +++ b/src/algorithm/SORTPAIRS-OMP.cpp @@ -18,7 +18,7 @@ namespace algorithm { -void 
SORTPAIRS::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void SORTPAIRS::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/algorithm/SORTPAIRS-Seq.cpp b/src/algorithm/SORTPAIRS-Seq.cpp index d152ad9b2..109bc201c 100644 --- a/src/algorithm/SORTPAIRS-Seq.cpp +++ b/src/algorithm/SORTPAIRS-Seq.cpp @@ -21,7 +21,7 @@ namespace algorithm { -void SORTPAIRS::runSeqVariant(VariantID vid, size_t /*tid*/) +void SORTPAIRS::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/algorithm/SORTPAIRS.cpp b/src/algorithm/SORTPAIRS.cpp index 8ed8aaccf..335ff6e28 100644 --- a/src/algorithm/SORTPAIRS.cpp +++ b/src/algorithm/SORTPAIRS.cpp @@ -47,19 +47,19 @@ SORTPAIRS::~SORTPAIRS() { } -void SORTPAIRS::setUp(VariantID vid, size_t /*tid*/) +void SORTPAIRS::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitDataRandValue(m_x, getActualProblemSize()*getRunReps(), vid); allocAndInitDataRandValue(m_i, getActualProblemSize()*getRunReps(), vid); } -void SORTPAIRS::updateChecksum(VariantID vid, size_t tid) +void SORTPAIRS::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_x, getActualProblemSize()*getRunReps()); - checksum[vid][tid] += calcChecksum(m_i, getActualProblemSize()*getRunReps()); + checksum[vid][tune_idx] += calcChecksum(m_x, getActualProblemSize()*getRunReps()); + checksum[vid][tune_idx] += calcChecksum(m_i, getActualProblemSize()*getRunReps()); } -void SORTPAIRS::tearDown(VariantID vid, size_t /*tid*/) +void SORTPAIRS::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_x); diff --git a/src/algorithm/SORTPAIRS.hpp b/src/algorithm/SORTPAIRS.hpp index 428cdaf32..b4a2aa02d 100644 --- a/src/algorithm/SORTPAIRS.hpp +++ b/src/algorithm/SORTPAIRS.hpp @@ -41,15 +41,15 @@ class SORTPAIRS : public KernelBase ~SORTPAIRS(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { getCout() << "\n SORTPAIRS : Unknown OMP Target variant id = " << vid << std::endl; } diff --git a/src/apps/DEL_DOT_VEC_2D-Cuda.cpp b/src/apps/DEL_DOT_VEC_2D-Cuda.cpp index fb6c3c336..cf059127a 100644 --- a/src/apps/DEL_DOT_VEC_2D-Cuda.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Cuda.cpp @@ -162,13 +162,13 @@ void DEL_DOT_VEC_2D::runCudaVariantImpl(VariantID vid) } } -void DEL_DOT_VEC_2D::runCudaVariant(VariantID vid, size_t tid) +void DEL_DOT_VEC_2D::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git 
a/src/apps/DEL_DOT_VEC_2D-Hip.cpp b/src/apps/DEL_DOT_VEC_2D-Hip.cpp index 65c8ea657..24bdb0345 100644 --- a/src/apps/DEL_DOT_VEC_2D-Hip.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Hip.cpp @@ -164,13 +164,13 @@ void DEL_DOT_VEC_2D::runHipVariantImpl(VariantID vid) } } -void DEL_DOT_VEC_2D::runHipVariant(VariantID vid, size_t tid) +void DEL_DOT_VEC_2D::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/apps/DEL_DOT_VEC_2D-OMP.cpp b/src/apps/DEL_DOT_VEC_2D-OMP.cpp index 5b74c289f..79ffce156 100644 --- a/src/apps/DEL_DOT_VEC_2D-OMP.cpp +++ b/src/apps/DEL_DOT_VEC_2D-OMP.cpp @@ -22,7 +22,7 @@ namespace apps { -void DEL_DOT_VEC_2D::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void DEL_DOT_VEC_2D::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp b/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp index 4c324b1d8..742647fef 100644 --- a/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp +++ b/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp @@ -51,7 +51,7 @@ namespace apps deallocOpenMPDeviceData(real_zones, did); -void DEL_DOT_VEC_2D::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void DEL_DOT_VEC_2D::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/apps/DEL_DOT_VEC_2D-Seq.cpp b/src/apps/DEL_DOT_VEC_2D-Seq.cpp index d3bc1360f..23e4803c6 100644 --- a/src/apps/DEL_DOT_VEC_2D-Seq.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Seq.cpp @@ -22,7 +22,7 @@ namespace apps { -void DEL_DOT_VEC_2D::runSeqVariant(VariantID vid, size_t /*tid*/) +void DEL_DOT_VEC_2D::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/apps/DEL_DOT_VEC_2D.cpp b/src/apps/DEL_DOT_VEC_2D.cpp index 6dd77bc3c..b48de0fbc 100644 --- a/src/apps/DEL_DOT_VEC_2D.cpp +++ b/src/apps/DEL_DOT_VEC_2D.cpp @@ -69,7 +69,7 @@ DEL_DOT_VEC_2D::~DEL_DOT_VEC_2D() delete m_domain; } -void DEL_DOT_VEC_2D::setUp(VariantID vid, size_t /*tid*/) +void DEL_DOT_VEC_2D::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitDataConst(m_x, m_array_length, 0.0, vid); allocAndInitDataConst(m_y, m_array_length, 0.0, vid); @@ -87,12 +87,12 @@ void DEL_DOT_VEC_2D::setUp(VariantID vid, size_t /*tid*/) m_half = 0.5; } -void DEL_DOT_VEC_2D::updateChecksum(VariantID vid, size_t tid) +void DEL_DOT_VEC_2D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_div, m_array_length); + checksum[vid][tune_idx] += calcChecksum(m_div, m_array_length); } -void DEL_DOT_VEC_2D::tearDown(VariantID vid, size_t /*tid*/) +void DEL_DOT_VEC_2D::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; diff --git a/src/apps/DEL_DOT_VEC_2D.hpp b/src/apps/DEL_DOT_VEC_2D.hpp index f5038be83..f75491d63 100644 --- a/src/apps/DEL_DOT_VEC_2D.hpp +++ b/src/apps/DEL_DOT_VEC_2D.hpp @@ -104,15 +104,15 @@ class DEL_DOT_VEC_2D : public KernelBase ~DEL_DOT_VEC_2D(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void 
runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/apps/DIFFUSION3DPA-Cuda.cpp b/src/apps/DIFFUSION3DPA-Cuda.cpp index 614428eed..2990a63a5 100644 --- a/src/apps/DIFFUSION3DPA-Cuda.cpp +++ b/src/apps/DIFFUSION3DPA-Cuda.cpp @@ -120,7 +120,7 @@ __global__ void Diffusion3DPA(Index_type NE, const Real_ptr Basis, } } -void DIFFUSION3DPA::runCudaVariant(VariantID vid, size_t tid) { +void DIFFUSION3DPA::runCudaVariant(VariantID vid, size_t tune_idx) { const Index_type run_reps = getRunReps(); DIFFUSION3DPA_DATA_SETUP; diff --git a/src/apps/DIFFUSION3DPA-Hip.cpp b/src/apps/DIFFUSION3DPA-Hip.cpp index b403023e7..ad962a52d 100644 --- a/src/apps/DIFFUSION3DPA-Hip.cpp +++ b/src/apps/DIFFUSION3DPA-Hip.cpp @@ -120,7 +120,7 @@ __global__ void Diffusion3DPA(Index_type NE, const Real_ptr Basis, } } -void DIFFUSION3DPA::runHipVariant(VariantID vid, size_t tid) { +void DIFFUSION3DPA::runHipVariant(VariantID vid, size_t tune_idx) { const Index_type run_reps = getRunReps(); DIFFUSION3DPA_DATA_SETUP; diff --git a/src/apps/DIFFUSION3DPA-OMP.cpp b/src/apps/DIFFUSION3DPA-OMP.cpp index 74b732690..eb07e62e2 100644 --- a/src/apps/DIFFUSION3DPA-OMP.cpp +++ b/src/apps/DIFFUSION3DPA-OMP.cpp @@ -18,7 +18,7 @@ namespace rajaperf { namespace apps { -void DIFFUSION3DPA::runOpenMPVariant(VariantID vid, size_t /*tid*/) { +void DIFFUSION3DPA::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/apps/DIFFUSION3DPA-OMPTarget.cpp b/src/apps/DIFFUSION3DPA-OMPTarget.cpp index 4b5452c6f..7f4273747 100644 --- a/src/apps/DIFFUSION3DPA-OMPTarget.cpp +++ b/src/apps/DIFFUSION3DPA-OMPTarget.cpp @@ -19,7 +19,7 @@ namespace rajaperf { namespace apps { -void DIFFUSION3DPA::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { +void DIFFUSION3DPA::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); switch (vid) { diff --git a/src/apps/DIFFUSION3DPA-Seq.cpp b/src/apps/DIFFUSION3DPA-Seq.cpp index a0380b9cc..3d3b040df 100644 --- a/src/apps/DIFFUSION3DPA-Seq.cpp +++ b/src/apps/DIFFUSION3DPA-Seq.cpp @@ -18,7 +18,7 @@ namespace rajaperf { namespace apps { -void DIFFUSION3DPA::runSeqVariant(VariantID vid, size_t /*tid*/) { +void DIFFUSION3DPA::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); DIFFUSION3DPA_DATA_SETUP; diff --git a/src/apps/DIFFUSION3DPA.cpp b/src/apps/DIFFUSION3DPA.cpp index 6a6a575cc..915f324c1 100644 --- a/src/apps/DIFFUSION3DPA.cpp +++ b/src/apps/DIFFUSION3DPA.cpp @@ -71,7 +71,7 @@ DIFFUSION3DPA::~DIFFUSION3DPA() { } -void DIFFUSION3DPA::setUp(VariantID vid, size_t /*tid*/) +void DIFFUSION3DPA::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitDataConst(m_B, int(DPA_Q1D*DPA_D1D), Real_type(1.0), vid); @@ -81,12 +81,12 @@ void DIFFUSION3DPA::setUp(VariantID vid, size_t /*tid*/) allocAndInitDataConst(m_Y, int(DPA_D1D*DPA_D1D*DPA_D1D*m_NE), Real_type(0.0), vid); } -void 
DIFFUSION3DPA::updateChecksum(VariantID vid, size_t tid) +void DIFFUSION3DPA::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_Y, DPA_D1D*DPA_D1D*DPA_D1D*m_NE); + checksum[vid][tune_idx] += calcChecksum(m_Y, DPA_D1D*DPA_D1D*DPA_D1D*m_NE); } -void DIFFUSION3DPA::tearDown(VariantID vid, size_t /*tid*/) +void DIFFUSION3DPA::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; diff --git a/src/apps/DIFFUSION3DPA.hpp b/src/apps/DIFFUSION3DPA.hpp index 2426f2f0b..59f4c656f 100644 --- a/src/apps/DIFFUSION3DPA.hpp +++ b/src/apps/DIFFUSION3DPA.hpp @@ -488,15 +488,15 @@ class DIFFUSION3DPA : public KernelBase ~DIFFUSION3DPA(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); private: static const size_t default_gpu_block_size = DPA_Q1D * DPA_Q1D * DPA_Q1D; diff --git a/src/apps/ENERGY-Cuda.cpp b/src/apps/ENERGY-Cuda.cpp index 36f8d75b3..2d334e4f1 100644 --- a/src/apps/ENERGY-Cuda.cpp +++ b/src/apps/ENERGY-Cuda.cpp @@ -268,13 +268,13 @@ void ENERGY::runCudaVariantImpl(VariantID vid) } } -void ENERGY::runCudaVariant(VariantID vid, size_t tid) +void ENERGY::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/apps/ENERGY-Hip.cpp b/src/apps/ENERGY-Hip.cpp index 8154a4fde..1c3fcd2d3 100644 --- a/src/apps/ENERGY-Hip.cpp +++ b/src/apps/ENERGY-Hip.cpp @@ -262,13 +262,13 @@ void ENERGY::runHipVariantImpl(VariantID vid) } } -void ENERGY::runHipVariant(VariantID vid, size_t tid) +void ENERGY::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/apps/ENERGY-OMP.cpp b/src/apps/ENERGY-OMP.cpp index 1bc2dab05..c617a6e0d 100644 --- a/src/apps/ENERGY-OMP.cpp +++ b/src/apps/ENERGY-OMP.cpp @@ -18,7 +18,7 @@ namespace apps { -void ENERGY::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void ENERGY::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/apps/ENERGY-OMPTarget.cpp b/src/apps/ENERGY-OMPTarget.cpp index b55a89180..97b53d6d4 100644 --- a/src/apps/ENERGY-OMPTarget.cpp +++ b/src/apps/ENERGY-OMPTarget.cpp @@ -65,7 +65,7 @@ namespace apps deallocOpenMPDeviceData(qq_old, did); \ deallocOpenMPDeviceData(vnewc, did); -void ENERGY::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void ENERGY::runOpenMPTargetVariant(VariantID vid, size_t 
/*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/apps/ENERGY-Seq.cpp b/src/apps/ENERGY-Seq.cpp index f08fd1c1d..b24f08a9b 100644 --- a/src/apps/ENERGY-Seq.cpp +++ b/src/apps/ENERGY-Seq.cpp @@ -18,7 +18,7 @@ namespace apps { -void ENERGY::runSeqVariant(VariantID vid, size_t /*tid*/) +void ENERGY::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/apps/ENERGY.cpp b/src/apps/ENERGY.cpp index 449258794..303a6e8b8 100644 --- a/src/apps/ENERGY.cpp +++ b/src/apps/ENERGY.cpp @@ -68,7 +68,7 @@ ENERGY::~ENERGY() { } -void ENERGY::setUp(VariantID vid, size_t /*tid*/) +void ENERGY::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitDataConst(m_e_new, getActualProblemSize(), 0.0, vid); allocAndInitData(m_e_old, getActualProblemSize(), vid); @@ -92,13 +92,13 @@ void ENERGY::setUp(VariantID vid, size_t /*tid*/) initData(m_q_cut); } -void ENERGY::updateChecksum(VariantID vid, size_t tid) +void ENERGY::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_e_new, getActualProblemSize()); - checksum[vid][tid] += calcChecksum(m_q_new, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_e_new, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_q_new, getActualProblemSize()); } -void ENERGY::tearDown(VariantID vid, size_t /*tid*/) +void ENERGY::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; diff --git a/src/apps/ENERGY.hpp b/src/apps/ENERGY.hpp index 0e7cbcdeb..d2fcc9d88 100644 --- a/src/apps/ENERGY.hpp +++ b/src/apps/ENERGY.hpp @@ -194,15 +194,15 @@ class ENERGY : public KernelBase ~ENERGY(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/apps/FIR-Cuda.cpp b/src/apps/FIR-Cuda.cpp index 803ade76d..bba80e60a 100644 --- a/src/apps/FIR-Cuda.cpp +++ b/src/apps/FIR-Cuda.cpp @@ -147,13 +147,13 @@ void FIR::runCudaVariantImpl(VariantID vid) } } -void FIR::runCudaVariant(VariantID vid, size_t tid) +void FIR::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/apps/FIR-Hip.cpp b/src/apps/FIR-Hip.cpp index a38e4c428..32d65bdf1 100644 --- a/src/apps/FIR-Hip.cpp +++ b/src/apps/FIR-Hip.cpp @@ -147,13 +147,13 @@ void FIR::runHipVariantImpl(VariantID vid) } } -void FIR::runHipVariant(VariantID vid, size_t tid) +void FIR::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; 
seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/apps/FIR-OMP.cpp b/src/apps/FIR-OMP.cpp index c99e755c9..195cb22d3 100644 --- a/src/apps/FIR-OMP.cpp +++ b/src/apps/FIR-OMP.cpp @@ -19,7 +19,7 @@ namespace apps { -void FIR::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void FIR::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/apps/FIR-OMPTarget.cpp b/src/apps/FIR-OMPTarget.cpp index 8de6e5b25..7f2f04265 100644 --- a/src/apps/FIR-OMPTarget.cpp +++ b/src/apps/FIR-OMPTarget.cpp @@ -46,7 +46,7 @@ namespace apps deallocOpenMPDeviceData(coeff, did); -void FIR::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void FIR::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/apps/FIR-Seq.cpp b/src/apps/FIR-Seq.cpp index bd5a5bda1..69b58c9c8 100644 --- a/src/apps/FIR-Seq.cpp +++ b/src/apps/FIR-Seq.cpp @@ -19,7 +19,7 @@ namespace apps { -void FIR::runSeqVariant(VariantID vid, size_t /*tid*/) +void FIR::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/apps/FIR.cpp b/src/apps/FIR.cpp index ce5904dcf..3589debaf 100644 --- a/src/apps/FIR.cpp +++ b/src/apps/FIR.cpp @@ -62,18 +62,18 @@ FIR::~FIR() { } -void FIR::setUp(VariantID vid, size_t /*tid*/) +void FIR::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitData(m_in, getActualProblemSize(), vid); allocAndInitDataConst(m_out, getActualProblemSize(), 0.0, vid); } -void FIR::updateChecksum(VariantID vid, size_t tid) +void FIR::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_out, getActualProblemSize(), checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_out, getActualProblemSize(), checksum_scale_factor ); } -void FIR::tearDown(VariantID vid, size_t /*tid*/) +void FIR::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; diff --git a/src/apps/FIR.hpp b/src/apps/FIR.hpp index 21e915705..0d643582b 100644 --- a/src/apps/FIR.hpp +++ b/src/apps/FIR.hpp @@ -69,15 +69,15 @@ class FIR : public KernelBase ~FIR(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/apps/HALOEXCHANGE-Cuda.cpp b/src/apps/HALOEXCHANGE-Cuda.cpp index 5ab04cb8f..a297d1885 100644 --- a/src/apps/HALOEXCHANGE-Cuda.cpp +++ b/src/apps/HALOEXCHANGE-Cuda.cpp @@ -169,13 +169,13 @@ void 
HALOEXCHANGE::runCudaVariantImpl(VariantID vid) } } -void HALOEXCHANGE::runCudaVariant(VariantID vid, size_t tid) +void HALOEXCHANGE::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/apps/HALOEXCHANGE-Hip.cpp b/src/apps/HALOEXCHANGE-Hip.cpp index fdd851716..a13f36619 100644 --- a/src/apps/HALOEXCHANGE-Hip.cpp +++ b/src/apps/HALOEXCHANGE-Hip.cpp @@ -171,13 +171,13 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid) } } -void HALOEXCHANGE::runHipVariant(VariantID vid, size_t tid) +void HALOEXCHANGE::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/apps/HALOEXCHANGE-OMP.cpp b/src/apps/HALOEXCHANGE-OMP.cpp index be757cf4c..e24d5c294 100644 --- a/src/apps/HALOEXCHANGE-OMP.cpp +++ b/src/apps/HALOEXCHANGE-OMP.cpp @@ -18,7 +18,7 @@ namespace apps { -void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/apps/HALOEXCHANGE-OMPTarget.cpp b/src/apps/HALOEXCHANGE-OMPTarget.cpp index df1453bb7..5bab9e060 100644 --- a/src/apps/HALOEXCHANGE-OMPTarget.cpp +++ b/src/apps/HALOEXCHANGE-OMPTarget.cpp @@ -51,7 +51,7 @@ namespace apps } -void HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); diff --git a/src/apps/HALOEXCHANGE-Seq.cpp b/src/apps/HALOEXCHANGE-Seq.cpp index 16a40dc56..65405741a 100644 --- a/src/apps/HALOEXCHANGE-Seq.cpp +++ b/src/apps/HALOEXCHANGE-Seq.cpp @@ -18,7 +18,7 @@ namespace apps { -void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t /*tid*/) +void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); diff --git a/src/apps/HALOEXCHANGE.cpp b/src/apps/HALOEXCHANGE.cpp index 1d4456657..3f96ee337 100644 --- a/src/apps/HALOEXCHANGE.cpp +++ b/src/apps/HALOEXCHANGE.cpp @@ -104,7 +104,7 @@ HALOEXCHANGE::~HALOEXCHANGE() { } -void HALOEXCHANGE::setUp(VariantID vid, size_t /*tid*/) +void HALOEXCHANGE::setUp(VariantID vid, size_t /*tune_idx*/) { m_vars.resize(m_num_vars, nullptr); for (Index_type v = 0; v < m_num_vars; ++v) { @@ -132,14 +132,14 @@ void HALOEXCHANGE::setUp(VariantID vid, size_t /*tid*/) } } -void HALOEXCHANGE::updateChecksum(VariantID vid, size_t tid) +void HALOEXCHANGE::updateChecksum(VariantID vid, size_t tune_idx) { for (Real_ptr var : m_vars) { - checksum[vid][tid] += calcChecksum(var, m_var_size); + checksum[vid][tune_idx] += calcChecksum(var, m_var_size); } } -void HALOEXCHANGE::tearDown(VariantID vid, size_t /*tid*/) +void HALOEXCHANGE::tearDown(VariantID vid, size_t /*tune_idx*/) { for (int l = 0; l < s_num_neighbors; ++l) { deallocData(m_buffers[l]); diff --git a/src/apps/HALOEXCHANGE.hpp b/src/apps/HALOEXCHANGE.hpp index 194674df7..705eed88c 100644 --- a/src/apps/HALOEXCHANGE.hpp +++ b/src/apps/HALOEXCHANGE.hpp @@ -84,15 +84,15 @@ class HALOEXCHANGE : public KernelBase ~HALOEXCHANGE(); - void setUp(VariantID vid, size_t tid); - 
void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp index 82b29c966..0aad4f3d0 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp @@ -270,13 +270,13 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) } } -void HALOEXCHANGE_FUSED::runCudaVariant(VariantID vid, size_t tid) +void HALOEXCHANGE_FUSED::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp index 28dfa404c..d9809d37d 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp @@ -273,13 +273,13 @@ void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) } } -void HALOEXCHANGE_FUSED::runHipVariant(VariantID vid, size_t tid) +void HALOEXCHANGE_FUSED::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/apps/HALOEXCHANGE_FUSED-OMP.cpp b/src/apps/HALOEXCHANGE_FUSED-OMP.cpp index 6652eca52..626074e02 100644 --- a/src/apps/HALOEXCHANGE_FUSED-OMP.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-OMP.cpp @@ -18,7 +18,7 @@ namespace apps { -void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp b/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp index a0b2a2cb5..674cc73d7 100644 --- a/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp @@ -87,7 +87,7 @@ namespace apps delete[] h_unpack_ptrs; -void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); diff --git a/src/apps/HALOEXCHANGE_FUSED-Seq.cpp b/src/apps/HALOEXCHANGE_FUSED-Seq.cpp index 8d0cf31bc..fc1213b44 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Seq.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Seq.cpp @@ -18,7 +18,7 @@ namespace apps { -void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t /*tid*/) +void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { 
const Index_type run_reps = getRunReps(); diff --git a/src/apps/HALOEXCHANGE_FUSED.cpp b/src/apps/HALOEXCHANGE_FUSED.cpp index 2be36b469..f95be24fb 100644 --- a/src/apps/HALOEXCHANGE_FUSED.cpp +++ b/src/apps/HALOEXCHANGE_FUSED.cpp @@ -104,7 +104,7 @@ HALOEXCHANGE_FUSED::~HALOEXCHANGE_FUSED() { } -void HALOEXCHANGE_FUSED::setUp(VariantID vid, size_t /*tid*/) +void HALOEXCHANGE_FUSED::setUp(VariantID vid, size_t /*tune_idx*/) { m_vars.resize(m_num_vars, nullptr); for (Index_type v = 0; v < m_num_vars; ++v) { @@ -132,14 +132,14 @@ void HALOEXCHANGE_FUSED::setUp(VariantID vid, size_t /*tid*/) } } -void HALOEXCHANGE_FUSED::updateChecksum(VariantID vid, size_t tid) +void HALOEXCHANGE_FUSED::updateChecksum(VariantID vid, size_t tune_idx) { for (Real_ptr var : m_vars) { - checksum[vid][tid] += calcChecksum(var, m_var_size); + checksum[vid][tune_idx] += calcChecksum(var, m_var_size); } } -void HALOEXCHANGE_FUSED::tearDown(VariantID vid, size_t /*tid*/) +void HALOEXCHANGE_FUSED::tearDown(VariantID vid, size_t /*tune_idx*/) { for (int l = 0; l < s_num_neighbors; ++l) { deallocData(m_buffers[l]); diff --git a/src/apps/HALOEXCHANGE_FUSED.hpp b/src/apps/HALOEXCHANGE_FUSED.hpp index ab8471502..dcb8a701a 100644 --- a/src/apps/HALOEXCHANGE_FUSED.hpp +++ b/src/apps/HALOEXCHANGE_FUSED.hpp @@ -128,15 +128,15 @@ class HALOEXCHANGE_FUSED : public KernelBase ~HALOEXCHANGE_FUSED(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/apps/LTIMES-Cuda.cpp b/src/apps/LTIMES-Cuda.cpp index 3144df34b..e54d76a13 100644 --- a/src/apps/LTIMES-Cuda.cpp +++ b/src/apps/LTIMES-Cuda.cpp @@ -188,13 +188,13 @@ void LTIMES::runCudaVariantImpl(VariantID vid) } } -void LTIMES::runCudaVariant(VariantID vid, size_t tid) +void LTIMES::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/apps/LTIMES-Hip.cpp b/src/apps/LTIMES-Hip.cpp index 1eccc49b6..6a8c72917 100644 --- a/src/apps/LTIMES-Hip.cpp +++ b/src/apps/LTIMES-Hip.cpp @@ -190,13 +190,13 @@ void LTIMES::runHipVariantImpl(VariantID vid) } } -void LTIMES::runHipVariant(VariantID vid, size_t tid) +void LTIMES::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/apps/LTIMES-OMP.cpp b/src/apps/LTIMES-OMP.cpp index 
37c4d6dc3..995351ab4 100644 --- a/src/apps/LTIMES-OMP.cpp +++ b/src/apps/LTIMES-OMP.cpp @@ -18,7 +18,7 @@ namespace apps { -void LTIMES::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void LTIMES::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/apps/LTIMES-OMPTarget.cpp b/src/apps/LTIMES-OMPTarget.cpp index b9031ca29..f641082b4 100644 --- a/src/apps/LTIMES-OMPTarget.cpp +++ b/src/apps/LTIMES-OMPTarget.cpp @@ -36,7 +36,7 @@ namespace apps deallocOpenMPDeviceData(psidat, did); -void LTIMES::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void LTIMES::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); diff --git a/src/apps/LTIMES-Seq.cpp b/src/apps/LTIMES-Seq.cpp index 5c6ba4fb9..2a27cadf5 100644 --- a/src/apps/LTIMES-Seq.cpp +++ b/src/apps/LTIMES-Seq.cpp @@ -18,7 +18,7 @@ namespace apps { -void LTIMES::runSeqVariant(VariantID vid, size_t /*tid*/) +void LTIMES::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); diff --git a/src/apps/LTIMES.cpp b/src/apps/LTIMES.cpp index 5354ab7fc..29f26896c 100644 --- a/src/apps/LTIMES.cpp +++ b/src/apps/LTIMES.cpp @@ -83,19 +83,19 @@ LTIMES::~LTIMES() { } -void LTIMES::setUp(VariantID vid, size_t /*tid*/) +void LTIMES::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitDataConst(m_phidat, int(m_philen), Real_type(0.0), vid); allocAndInitData(m_elldat, int(m_elllen), vid); allocAndInitData(m_psidat, int(m_psilen), vid); } -void LTIMES::updateChecksum(VariantID vid, size_t tid) +void LTIMES::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_phidat, m_philen, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_phidat, m_philen, checksum_scale_factor ); } -void LTIMES::tearDown(VariantID vid, size_t /*tid*/) +void LTIMES::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; diff --git a/src/apps/LTIMES.hpp b/src/apps/LTIMES.hpp index c625fa53a..a8d488a93 100644 --- a/src/apps/LTIMES.hpp +++ b/src/apps/LTIMES.hpp @@ -107,15 +107,15 @@ class LTIMES : public KernelBase ~LTIMES(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/apps/LTIMES_NOVIEW-Cuda.cpp b/src/apps/LTIMES_NOVIEW-Cuda.cpp index 04cd8b09b..769ede624 100644 --- a/src/apps/LTIMES_NOVIEW-Cuda.cpp +++ b/src/apps/LTIMES_NOVIEW-Cuda.cpp @@ -186,13 +186,13 @@ void LTIMES_NOVIEW::runCudaVariantImpl(VariantID vid) } } -void LTIMES_NOVIEW::runCudaVariant(VariantID vid, size_t tid) +void LTIMES_NOVIEW::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; 
seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/apps/LTIMES_NOVIEW-Hip.cpp b/src/apps/LTIMES_NOVIEW-Hip.cpp index e27e7f953..ec9256ae0 100644 --- a/src/apps/LTIMES_NOVIEW-Hip.cpp +++ b/src/apps/LTIMES_NOVIEW-Hip.cpp @@ -189,13 +189,13 @@ void LTIMES_NOVIEW::runHipVariantImpl(VariantID vid) } } -void LTIMES_NOVIEW::runHipVariant(VariantID vid, size_t tid) +void LTIMES_NOVIEW::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/apps/LTIMES_NOVIEW-OMP.cpp b/src/apps/LTIMES_NOVIEW-OMP.cpp index e654d2a59..cc1f95168 100644 --- a/src/apps/LTIMES_NOVIEW-OMP.cpp +++ b/src/apps/LTIMES_NOVIEW-OMP.cpp @@ -18,7 +18,7 @@ namespace apps { -void LTIMES_NOVIEW::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void LTIMES_NOVIEW::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/apps/LTIMES_NOVIEW-OMPTarget.cpp b/src/apps/LTIMES_NOVIEW-OMPTarget.cpp index 15e60d346..85ae2662e 100644 --- a/src/apps/LTIMES_NOVIEW-OMPTarget.cpp +++ b/src/apps/LTIMES_NOVIEW-OMPTarget.cpp @@ -36,7 +36,7 @@ namespace apps deallocOpenMPDeviceData(psidat, did); -void LTIMES_NOVIEW::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void LTIMES_NOVIEW::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); diff --git a/src/apps/LTIMES_NOVIEW-Seq.cpp b/src/apps/LTIMES_NOVIEW-Seq.cpp index 4739c271c..c1bff6c2a 100644 --- a/src/apps/LTIMES_NOVIEW-Seq.cpp +++ b/src/apps/LTIMES_NOVIEW-Seq.cpp @@ -18,7 +18,7 @@ namespace apps { -void LTIMES_NOVIEW::runSeqVariant(VariantID vid, size_t /*tid*/) +void LTIMES_NOVIEW::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); diff --git a/src/apps/LTIMES_NOVIEW.cpp b/src/apps/LTIMES_NOVIEW.cpp index c2f224492..db4c99ee4 100644 --- a/src/apps/LTIMES_NOVIEW.cpp +++ b/src/apps/LTIMES_NOVIEW.cpp @@ -82,19 +82,19 @@ LTIMES_NOVIEW::~LTIMES_NOVIEW() { } -void LTIMES_NOVIEW::setUp(VariantID vid, size_t /*tid*/) +void LTIMES_NOVIEW::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitDataConst(m_phidat, int(m_philen), Real_type(0.0), vid); allocAndInitData(m_elldat, int(m_elllen), vid); allocAndInitData(m_psidat, int(m_psilen), vid); } -void LTIMES_NOVIEW::updateChecksum(VariantID vid, size_t tid) +void LTIMES_NOVIEW::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_phidat, m_philen, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_phidat, m_philen, checksum_scale_factor ); } -void LTIMES_NOVIEW::tearDown(VariantID vid, size_t /*tid*/) +void LTIMES_NOVIEW::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; diff --git a/src/apps/LTIMES_NOVIEW.hpp b/src/apps/LTIMES_NOVIEW.hpp index 403e0fdee..98f5733f6 100644 --- a/src/apps/LTIMES_NOVIEW.hpp +++ b/src/apps/LTIMES_NOVIEW.hpp @@ -57,15 +57,15 @@ class LTIMES_NOVIEW : public KernelBase ~LTIMES_NOVIEW(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void 
runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/apps/MASS3DPA-Cuda.cpp b/src/apps/MASS3DPA-Cuda.cpp index b93c32f81..69069d780 100644 --- a/src/apps/MASS3DPA-Cuda.cpp +++ b/src/apps/MASS3DPA-Cuda.cpp @@ -102,7 +102,7 @@ __global__ void Mass3DPA(Index_type NE, const Real_ptr B, const Real_ptr Bt, } } -void MASS3DPA::runCudaVariant(VariantID vid, size_t tid) { +void MASS3DPA::runCudaVariant(VariantID vid, size_t tune_idx) { const Index_type run_reps = getRunReps(); MASS3DPA_DATA_SETUP; diff --git a/src/apps/MASS3DPA-Hip.cpp b/src/apps/MASS3DPA-Hip.cpp index 6946d201a..09ce997ec 100644 --- a/src/apps/MASS3DPA-Hip.cpp +++ b/src/apps/MASS3DPA-Hip.cpp @@ -102,7 +102,7 @@ __global__ void Mass3DPA(Index_type NE, const Real_ptr B, const Real_ptr Bt, } } -void MASS3DPA::runHipVariant(VariantID vid, size_t tid) { +void MASS3DPA::runHipVariant(VariantID vid, size_t tune_idx) { const Index_type run_reps = getRunReps(); MASS3DPA_DATA_SETUP; diff --git a/src/apps/MASS3DPA-OMP.cpp b/src/apps/MASS3DPA-OMP.cpp index 0db2017ec..f7660f52c 100644 --- a/src/apps/MASS3DPA-OMP.cpp +++ b/src/apps/MASS3DPA-OMP.cpp @@ -19,7 +19,7 @@ namespace rajaperf { namespace apps { -void MASS3DPA::runOpenMPVariant(VariantID vid, size_t /*tid*/) { +void MASS3DPA::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/apps/MASS3DPA-OMPTarget.cpp b/src/apps/MASS3DPA-OMPTarget.cpp index 11f227500..674bee300 100644 --- a/src/apps/MASS3DPA-OMPTarget.cpp +++ b/src/apps/MASS3DPA-OMPTarget.cpp @@ -20,7 +20,7 @@ namespace rajaperf { namespace apps { -void MASS3DPA::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { +void MASS3DPA::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); switch (vid) { diff --git a/src/apps/MASS3DPA-Seq.cpp b/src/apps/MASS3DPA-Seq.cpp index 53fe83b2a..16e75ec61 100644 --- a/src/apps/MASS3DPA-Seq.cpp +++ b/src/apps/MASS3DPA-Seq.cpp @@ -19,7 +19,7 @@ namespace rajaperf { namespace apps { -void MASS3DPA::runSeqVariant(VariantID vid, size_t /*tid*/) { +void MASS3DPA::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); MASS3DPA_DATA_SETUP; diff --git a/src/apps/MASS3DPA.cpp b/src/apps/MASS3DPA.cpp index 97aeabf24..9e3069e62 100644 --- a/src/apps/MASS3DPA.cpp +++ b/src/apps/MASS3DPA.cpp @@ -67,7 +67,7 @@ MASS3DPA::~MASS3DPA() { } -void MASS3DPA::setUp(VariantID vid, size_t /*tid*/) +void MASS3DPA::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitDataConst(m_B, int(MPA_Q1D*MPA_D1D), Real_type(1.0), vid); @@ -77,12 +77,12 @@ void MASS3DPA::setUp(VariantID vid, size_t /*tid*/) allocAndInitDataConst(m_Y, int(MPA_D1D*MPA_D1D*MPA_D1D*m_NE), Real_type(0.0), vid); } -void MASS3DPA::updateChecksum(VariantID vid, size_t tid) +void MASS3DPA::updateChecksum(VariantID 
vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_Y, MPA_D1D*MPA_D1D*MPA_D1D*m_NE); + checksum[vid][tune_idx] += calcChecksum(m_Y, MPA_D1D*MPA_D1D*MPA_D1D*m_NE); } -void MASS3DPA::tearDown(VariantID vid, size_t /*tid*/) +void MASS3DPA::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; diff --git a/src/apps/MASS3DPA.hpp b/src/apps/MASS3DPA.hpp index d3b68492b..fa429a510 100644 --- a/src/apps/MASS3DPA.hpp +++ b/src/apps/MASS3DPA.hpp @@ -368,15 +368,15 @@ class MASS3DPA : public KernelBase ~MASS3DPA(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); private: static const size_t default_gpu_block_size = MPA_Q1D * MPA_Q1D; diff --git a/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp b/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp index ff6db297c..b4a2fc771 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp @@ -113,13 +113,13 @@ void NODAL_ACCUMULATION_3D::runCudaVariantImpl(VariantID vid) } } -void NODAL_ACCUMULATION_3D::runCudaVariant(VariantID vid, size_t tid) +void NODAL_ACCUMULATION_3D::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp b/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp index fded41cc3..c6d8a14a9 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp @@ -113,13 +113,13 @@ void NODAL_ACCUMULATION_3D::runHipVariantImpl(VariantID vid) } } -void NODAL_ACCUMULATION_3D::runHipVariant(VariantID vid, size_t tid) +void NODAL_ACCUMULATION_3D::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp b/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp index 20501faeb..94c690742 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp @@ -20,7 +20,7 @@ namespace apps { -void NODAL_ACCUMULATION_3D::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void NODAL_ACCUMULATION_3D::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp b/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp index 6f8a97a3d..45c84c096 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp @@ -43,7 +43,7 @@ namespace apps 
deallocOpenMPDeviceData(real_zones, did); -void NODAL_ACCUMULATION_3D::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void NODAL_ACCUMULATION_3D::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp b/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp index 297bf0140..2fdfa0c64 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp @@ -20,7 +20,7 @@ namespace apps { -void NODAL_ACCUMULATION_3D::runSeqVariant(VariantID vid, size_t /*tid*/) +void NODAL_ACCUMULATION_3D::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/apps/NODAL_ACCUMULATION_3D.cpp b/src/apps/NODAL_ACCUMULATION_3D.cpp index 87a3b3639..895e1e7ba 100644 --- a/src/apps/NODAL_ACCUMULATION_3D.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D.cpp @@ -74,18 +74,18 @@ NODAL_ACCUMULATION_3D::~NODAL_ACCUMULATION_3D() delete m_domain; } -void NODAL_ACCUMULATION_3D::setUp(VariantID vid, size_t /*tid*/) +void NODAL_ACCUMULATION_3D::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitDataConst(m_x, m_nodal_array_length, 0.0, vid); allocAndInitDataConst(m_vol, m_zonal_array_length, 1.0, vid); } -void NODAL_ACCUMULATION_3D::updateChecksum(VariantID vid, size_t tid) +void NODAL_ACCUMULATION_3D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid].at(tid) += calcChecksum(m_x, m_nodal_array_length, checksum_scale_factor ); + checksum[vid].at(tune_idx) += calcChecksum(m_x, m_nodal_array_length, checksum_scale_factor ); } -void NODAL_ACCUMULATION_3D::tearDown(VariantID vid, size_t /*tid*/) +void NODAL_ACCUMULATION_3D::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; diff --git a/src/apps/NODAL_ACCUMULATION_3D.hpp b/src/apps/NODAL_ACCUMULATION_3D.hpp index 482c16b10..049ca1e78 100644 --- a/src/apps/NODAL_ACCUMULATION_3D.hpp +++ b/src/apps/NODAL_ACCUMULATION_3D.hpp @@ -86,15 +86,15 @@ class NODAL_ACCUMULATION_3D : public KernelBase ~NODAL_ACCUMULATION_3D(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/apps/PRESSURE-Cuda.cpp b/src/apps/PRESSURE-Cuda.cpp index 5f946ea8b..7fc9fc33a 100644 --- a/src/apps/PRESSURE-Cuda.cpp +++ b/src/apps/PRESSURE-Cuda.cpp @@ -136,13 +136,13 @@ void PRESSURE::runCudaVariantImpl(VariantID vid) } } -void PRESSURE::runCudaVariant(VariantID vid, size_t tid) +void PRESSURE::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { 
- if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/apps/PRESSURE-Hip.cpp b/src/apps/PRESSURE-Hip.cpp index a5b870539..8eb9abbf1 100644 --- a/src/apps/PRESSURE-Hip.cpp +++ b/src/apps/PRESSURE-Hip.cpp @@ -129,13 +129,13 @@ void PRESSURE::runHipVariantImpl(VariantID vid) } } -void PRESSURE::runHipVariant(VariantID vid, size_t tid) +void PRESSURE::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/apps/PRESSURE-OMP.cpp b/src/apps/PRESSURE-OMP.cpp index 81a3a918c..9f9a10047 100644 --- a/src/apps/PRESSURE-OMP.cpp +++ b/src/apps/PRESSURE-OMP.cpp @@ -18,7 +18,7 @@ namespace apps { -void PRESSURE::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void PRESSURE::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/apps/PRESSURE-OMPTarget.cpp b/src/apps/PRESSURE-OMPTarget.cpp index 41a3df1b8..10d367e99 100644 --- a/src/apps/PRESSURE-OMPTarget.cpp +++ b/src/apps/PRESSURE-OMPTarget.cpp @@ -45,7 +45,7 @@ namespace apps deallocOpenMPDeviceData(vnewc, did); -void PRESSURE::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void PRESSURE::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/apps/PRESSURE-Seq.cpp b/src/apps/PRESSURE-Seq.cpp index 096054107..64a5ac85e 100644 --- a/src/apps/PRESSURE-Seq.cpp +++ b/src/apps/PRESSURE-Seq.cpp @@ -18,7 +18,7 @@ namespace apps { -void PRESSURE::runSeqVariant(VariantID vid, size_t /*tid*/) +void PRESSURE::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/apps/PRESSURE.cpp b/src/apps/PRESSURE.cpp index fdd2d8736..e47e061cd 100644 --- a/src/apps/PRESSURE.cpp +++ b/src/apps/PRESSURE.cpp @@ -58,7 +58,7 @@ PRESSURE::~PRESSURE() { } -void PRESSURE::setUp(VariantID vid, size_t /*tid*/) +void PRESSURE::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitData(m_compression, getActualProblemSize(), vid); allocAndInitData(m_bvc, getActualProblemSize(), vid); @@ -72,12 +72,12 @@ void PRESSURE::setUp(VariantID vid, size_t /*tid*/) initData(m_eosvmax); } -void PRESSURE::updateChecksum(VariantID vid, size_t tid) +void PRESSURE::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_p_new, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_p_new, getActualProblemSize()); } -void PRESSURE::tearDown(VariantID vid, size_t /*tid*/) +void PRESSURE::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; diff --git a/src/apps/PRESSURE.hpp b/src/apps/PRESSURE.hpp index 442df7cc8..129972d46 100644 --- a/src/apps/PRESSURE.hpp +++ b/src/apps/PRESSURE.hpp @@ -63,15 +63,15 @@ class PRESSURE : public KernelBase ~PRESSURE(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, 
size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/apps/VOL3D-Cuda.cpp b/src/apps/VOL3D-Cuda.cpp index 1ab187d7a..8e0ddf01d 100644 --- a/src/apps/VOL3D-Cuda.cpp +++ b/src/apps/VOL3D-Cuda.cpp @@ -123,13 +123,13 @@ void VOL3D::runCudaVariantImpl(VariantID vid) } } -void VOL3D::runCudaVariant(VariantID vid, size_t tid) +void VOL3D::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/apps/VOL3D-Hip.cpp b/src/apps/VOL3D-Hip.cpp index c4e0dc21f..551a472db 100644 --- a/src/apps/VOL3D-Hip.cpp +++ b/src/apps/VOL3D-Hip.cpp @@ -123,13 +123,13 @@ void VOL3D::runHipVariantImpl(VariantID vid) } } -void VOL3D::runHipVariant(VariantID vid, size_t tid) +void VOL3D::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/apps/VOL3D-OMP.cpp b/src/apps/VOL3D-OMP.cpp index 47211d8fe..2be10a75b 100644 --- a/src/apps/VOL3D-OMP.cpp +++ b/src/apps/VOL3D-OMP.cpp @@ -20,7 +20,7 @@ namespace apps { -void VOL3D::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void VOL3D::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/apps/VOL3D-OMPTarget.cpp b/src/apps/VOL3D-OMPTarget.cpp index aaecb3b1b..8c9a20cb3 100644 --- a/src/apps/VOL3D-OMPTarget.cpp +++ b/src/apps/VOL3D-OMPTarget.cpp @@ -45,7 +45,7 @@ namespace apps deallocOpenMPDeviceData(vol, did); -void VOL3D::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void VOL3D::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = m_domain->fpz; diff --git a/src/apps/VOL3D-Seq.cpp b/src/apps/VOL3D-Seq.cpp index 4649833c8..2435e3030 100644 --- a/src/apps/VOL3D-Seq.cpp +++ b/src/apps/VOL3D-Seq.cpp @@ -20,7 +20,7 @@ namespace apps { -void VOL3D::runSeqVariant(VariantID vid, size_t /*tid*/) +void VOL3D::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = m_domain->fpz; diff --git a/src/apps/VOL3D.cpp b/src/apps/VOL3D.cpp index 7e1a0a825..4555e261c 100644 --- a/src/apps/VOL3D.cpp +++ b/src/apps/VOL3D.cpp @@ -71,7 +71,7 @@ VOL3D::~VOL3D() delete m_domain; } -void VOL3D::setUp(VariantID vid, size_t /*tid*/) +void VOL3D::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitDataConst(m_x, m_array_length, 0.0, vid); allocAndInitDataConst(m_y, m_array_length, 0.0, vid); @@ -87,12 +87,12 @@ void VOL3D::setUp(VariantID vid, size_t /*tid*/) m_vnormq = 0.083333333333333333; /* vnormq = 1/12 */ } -void VOL3D::updateChecksum(VariantID vid, size_t tid) +void VOL3D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_vol, m_array_length, 
checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_vol, m_array_length, checksum_scale_factor ); } -void VOL3D::tearDown(VariantID vid, size_t /*tid*/) +void VOL3D::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; diff --git a/src/apps/VOL3D.hpp b/src/apps/VOL3D.hpp index 73934f66c..e11b3caf1 100644 --- a/src/apps/VOL3D.hpp +++ b/src/apps/VOL3D.hpp @@ -160,15 +160,15 @@ class VOL3D : public KernelBase ~VOL3D(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/apps/WIP-COUPLE.cpp b/src/apps/WIP-COUPLE.cpp index a91c772d4..3083bca33 100644 --- a/src/apps/WIP-COUPLE.cpp +++ b/src/apps/WIP-COUPLE.cpp @@ -58,7 +58,7 @@ COUPLE::~COUPLE() delete m_domain; } -void COUPLE::setUp(VariantID vid, size_t /*tid*/) +void COUPLE::setUp(VariantID vid, size_t /*tune_idx*/) { Index_type max_loop_index = m_domain->lrn; @@ -80,9 +80,9 @@ void COUPLE::setUp(VariantID vid, size_t /*tid*/) m_ireal = Complex_type(0.0, 1.0); } -void COUPLE::runKernel(VariantID vid, size_t tid) +void COUPLE::runKernel(VariantID vid, size_t tune_idx) { - RAJA_UNUSED_VAR(tid); + RAJA_UNUSED_VAR(tune_idx); const Index_type run_reps = getRunReps(); COUPLE_DATA_SETUP; @@ -159,7 +159,7 @@ void COUPLE::runKernel(VariantID vid, size_t tid) case Base_OpenMPTarget : case RAJA_OpenMPTarget : { - runOpenMPTargetVariant(vid, tid); + runOpenMPTargetVariant(vid, tune_idx); break; } #endif @@ -168,7 +168,7 @@ void COUPLE::runKernel(VariantID vid, size_t tid) case Base_CUDA : case RAJA_CUDA : { - runCudaVariant(vid, tid); + runCudaVariant(vid, tune_idx); break; } #endif @@ -180,16 +180,16 @@ void COUPLE::runKernel(VariantID vid, size_t tid) } } -void COUPLE::updateChecksum(VariantID vid, size_t tid) +void COUPLE::updateChecksum(VariantID vid, size_t tune_idx) { Index_type max_loop_index = m_domain->lrn; - checksum[vid][tid] += calcChecksum(m_t0, max_loop_index); - checksum[vid][tid] += calcChecksum(m_t1, max_loop_index); - checksum[vid][tid] += calcChecksum(m_t2, max_loop_index); + checksum[vid][tune_idx] += calcChecksum(m_t0, max_loop_index); + checksum[vid][tune_idx] += calcChecksum(m_t1, max_loop_index); + checksum[vid][tune_idx] += calcChecksum(m_t2, max_loop_index); } -void COUPLE::tearDown(VariantID vid, size_t /*tid*/) +void COUPLE::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; diff --git a/src/apps/WIP-COUPLE.hpp b/src/apps/WIP-COUPLE.hpp index 5e96fb81f..0c7509096 100644 --- a/src/apps/WIP-COUPLE.hpp +++ b/src/apps/WIP-COUPLE.hpp @@ -161,16 +161,16 @@ class COUPLE : public KernelBase ~COUPLE(); - void setUp(VariantID vid, size_t tid); - void runKernel(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); 
- void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t /*tid*/) {(void) vid;} - void runOpenMPVariant(VariantID vid, size_t /*tid*/) {(void) vid;} - void runCudaVariant(VariantID vid, size_t /*tid*/) {(void) vid;} - void runHipVariant(VariantID vid, size_t /*tid*/) {(void) vid;} - void runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) {(void) vid;} + void setUp(VariantID vid, size_t tune_idx); + void runKernel(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t /*tune_idx*/) {(void) vid;} + void runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) {(void) vid;} + void runCudaVariant(VariantID vid, size_t /*tune_idx*/) {(void) vid;} + void runHipVariant(VariantID vid, size_t /*tune_idx*/) {(void) vid;} + void runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) {(void) vid;} private: Complex_ptr m_t0; diff --git a/src/basic/DAXPY-Cuda.cpp b/src/basic/DAXPY-Cuda.cpp index 2903384f1..ae0f72712 100644 --- a/src/basic/DAXPY-Cuda.cpp +++ b/src/basic/DAXPY-Cuda.cpp @@ -110,13 +110,13 @@ void DAXPY::runCudaVariantImpl(VariantID vid) } } -void DAXPY::runCudaVariant(VariantID vid, size_t tid) +void DAXPY::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/basic/DAXPY-Hip.cpp b/src/basic/DAXPY-Hip.cpp index de8612a09..71f80fba3 100644 --- a/src/basic/DAXPY-Hip.cpp +++ b/src/basic/DAXPY-Hip.cpp @@ -113,13 +113,13 @@ void DAXPY::runHipVariantImpl(VariantID vid) } } -void DAXPY::runHipVariant(VariantID vid, size_t tid) +void DAXPY::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/basic/DAXPY-OMP.cpp b/src/basic/DAXPY-OMP.cpp index 1b6c4fa72..c72f08d43 100644 --- a/src/basic/DAXPY-OMP.cpp +++ b/src/basic/DAXPY-OMP.cpp @@ -18,7 +18,7 @@ namespace basic { -void DAXPY::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void DAXPY::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/DAXPY-OMPTarget.cpp b/src/basic/DAXPY-OMPTarget.cpp index e3610f387..479dcca4e 100644 --- a/src/basic/DAXPY-OMPTarget.cpp +++ b/src/basic/DAXPY-OMPTarget.cpp @@ -39,7 +39,7 @@ namespace basic deallocOpenMPDeviceData(y, did); -void DAXPY::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void DAXPY::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/DAXPY-Seq.cpp b/src/basic/DAXPY-Seq.cpp index 6a1882a61..70f0710cb 100644 --- a/src/basic/DAXPY-Seq.cpp +++ b/src/basic/DAXPY-Seq.cpp @@ -18,7 +18,7 @@ namespace basic { -void DAXPY::runSeqVariant(VariantID vid, size_t /*tid*/) +void DAXPY::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/DAXPY.cpp b/src/basic/DAXPY.cpp index 46f6b6210..e6caab998 100644 --- a/src/basic/DAXPY.cpp +++ b/src/basic/DAXPY.cpp @@ -57,19 +57,19 @@ DAXPY::~DAXPY() { } 
-void DAXPY::setUp(VariantID vid, size_t /*tid*/) +void DAXPY::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitDataConst(m_y, getActualProblemSize(), 0.0, vid); allocAndInitData(m_x, getActualProblemSize(), vid); initData(m_a); } -void DAXPY::updateChecksum(VariantID vid, size_t tid) +void DAXPY::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid].at(tid) += calcChecksum(m_y, getActualProblemSize()); + checksum[vid].at(tune_idx) += calcChecksum(m_y, getActualProblemSize()); } -void DAXPY::tearDown(VariantID vid, size_t /*tid*/) +void DAXPY::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_x); diff --git a/src/basic/DAXPY.hpp b/src/basic/DAXPY.hpp index 8296c5336..1d6e3b61b 100644 --- a/src/basic/DAXPY.hpp +++ b/src/basic/DAXPY.hpp @@ -43,15 +43,15 @@ class DAXPY : public KernelBase ~DAXPY(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/basic/DAXPY_ATOMIC-Cuda.cpp b/src/basic/DAXPY_ATOMIC-Cuda.cpp index 9e74116ae..7cbf371c2 100644 --- a/src/basic/DAXPY_ATOMIC-Cuda.cpp +++ b/src/basic/DAXPY_ATOMIC-Cuda.cpp @@ -110,13 +110,13 @@ void DAXPY_ATOMIC::runCudaVariantImpl(VariantID vid) } } -void DAXPY_ATOMIC::runCudaVariant(VariantID vid, size_t tid) +void DAXPY_ATOMIC::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/basic/DAXPY_ATOMIC-Hip.cpp b/src/basic/DAXPY_ATOMIC-Hip.cpp index f29b52217..ff914387b 100644 --- a/src/basic/DAXPY_ATOMIC-Hip.cpp +++ b/src/basic/DAXPY_ATOMIC-Hip.cpp @@ -112,13 +112,13 @@ void DAXPY_ATOMIC::runHipVariantImpl(VariantID vid) } } -void DAXPY_ATOMIC::runHipVariant(VariantID vid, size_t tid) +void DAXPY_ATOMIC::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/basic/DAXPY_ATOMIC-OMP.cpp b/src/basic/DAXPY_ATOMIC-OMP.cpp index eb10a57af..0e57d24d3 100644 --- a/src/basic/DAXPY_ATOMIC-OMP.cpp +++ b/src/basic/DAXPY_ATOMIC-OMP.cpp @@ -18,7 +18,7 @@ namespace basic { -void DAXPY_ATOMIC::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void DAXPY_ATOMIC::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/DAXPY_ATOMIC-OMPTarget.cpp b/src/basic/DAXPY_ATOMIC-OMPTarget.cpp index 
2ac0270dd..adb96dca0 100644 --- a/src/basic/DAXPY_ATOMIC-OMPTarget.cpp +++ b/src/basic/DAXPY_ATOMIC-OMPTarget.cpp @@ -39,7 +39,7 @@ namespace basic deallocOpenMPDeviceData(y, did); -void DAXPY_ATOMIC::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void DAXPY_ATOMIC::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/DAXPY_ATOMIC-Seq.cpp b/src/basic/DAXPY_ATOMIC-Seq.cpp index cf30444f3..40aaac3ff 100644 --- a/src/basic/DAXPY_ATOMIC-Seq.cpp +++ b/src/basic/DAXPY_ATOMIC-Seq.cpp @@ -18,7 +18,7 @@ namespace basic { -void DAXPY_ATOMIC::runSeqVariant(VariantID vid, size_t /*tid*/) +void DAXPY_ATOMIC::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/DAXPY_ATOMIC.cpp b/src/basic/DAXPY_ATOMIC.cpp index 66b22d1b6..7e5d40c71 100644 --- a/src/basic/DAXPY_ATOMIC.cpp +++ b/src/basic/DAXPY_ATOMIC.cpp @@ -57,19 +57,19 @@ DAXPY_ATOMIC::~DAXPY_ATOMIC() { } -void DAXPY_ATOMIC::setUp(VariantID vid, size_t /*tid*/) +void DAXPY_ATOMIC::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitDataConst(m_y, getActualProblemSize(), 0.0, vid); allocAndInitData(m_x, getActualProblemSize(), vid); initData(m_a); } -void DAXPY_ATOMIC::updateChecksum(VariantID vid, size_t tid) +void DAXPY_ATOMIC::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid].at(tid) += calcChecksum(m_y, getActualProblemSize()); + checksum[vid].at(tune_idx) += calcChecksum(m_y, getActualProblemSize()); } -void DAXPY_ATOMIC::tearDown(VariantID vid, size_t /*tid*/) +void DAXPY_ATOMIC::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_x); diff --git a/src/basic/DAXPY_ATOMIC.hpp b/src/basic/DAXPY_ATOMIC.hpp index d56ca577b..4ea51cb48 100644 --- a/src/basic/DAXPY_ATOMIC.hpp +++ b/src/basic/DAXPY_ATOMIC.hpp @@ -46,15 +46,15 @@ class DAXPY_ATOMIC : public KernelBase ~DAXPY_ATOMIC(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/basic/IF_QUAD-Cuda.cpp b/src/basic/IF_QUAD-Cuda.cpp index 335987645..d2724913d 100644 --- a/src/basic/IF_QUAD-Cuda.cpp +++ b/src/basic/IF_QUAD-Cuda.cpp @@ -117,13 +117,13 @@ void IF_QUAD::runCudaVariantImpl(VariantID vid) } } -void IF_QUAD::runCudaVariant(VariantID vid, size_t tid) +void IF_QUAD::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/basic/IF_QUAD-Hip.cpp 
b/src/basic/IF_QUAD-Hip.cpp index 142cffc3e..1d0015a17 100644 --- a/src/basic/IF_QUAD-Hip.cpp +++ b/src/basic/IF_QUAD-Hip.cpp @@ -120,13 +120,13 @@ void IF_QUAD::runHipVariantImpl(VariantID vid) } } -void IF_QUAD::runHipVariant(VariantID vid, size_t tid) +void IF_QUAD::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/basic/IF_QUAD-OMP.cpp b/src/basic/IF_QUAD-OMP.cpp index bab43d4a2..d41fb3553 100644 --- a/src/basic/IF_QUAD-OMP.cpp +++ b/src/basic/IF_QUAD-OMP.cpp @@ -18,7 +18,7 @@ namespace basic { -void IF_QUAD::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void IF_QUAD::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/IF_QUAD-OMPTarget.cpp b/src/basic/IF_QUAD-OMPTarget.cpp index 7e4b878f6..a75c8ae1f 100644 --- a/src/basic/IF_QUAD-OMPTarget.cpp +++ b/src/basic/IF_QUAD-OMPTarget.cpp @@ -45,7 +45,7 @@ namespace basic deallocOpenMPDeviceData(x1, did); \ deallocOpenMPDeviceData(x2, did); -void IF_QUAD::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void IF_QUAD::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/IF_QUAD-Seq.cpp b/src/basic/IF_QUAD-Seq.cpp index 186af6356..1bb733bc9 100644 --- a/src/basic/IF_QUAD-Seq.cpp +++ b/src/basic/IF_QUAD-Seq.cpp @@ -18,7 +18,7 @@ namespace basic { -void IF_QUAD::runSeqVariant(VariantID vid, size_t /*tid*/) +void IF_QUAD::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/IF_QUAD.cpp b/src/basic/IF_QUAD.cpp index dcb74bca5..789881577 100644 --- a/src/basic/IF_QUAD.cpp +++ b/src/basic/IF_QUAD.cpp @@ -61,7 +61,7 @@ IF_QUAD::~IF_QUAD() { } -void IF_QUAD::setUp(VariantID vid, size_t /*tid*/) +void IF_QUAD::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitDataRandSign(m_a, getActualProblemSize(), vid); allocAndInitData(m_b, getActualProblemSize(), vid); @@ -70,13 +70,13 @@ void IF_QUAD::setUp(VariantID vid, size_t /*tid*/) allocAndInitDataConst(m_x2, getActualProblemSize(), 0.0, vid); } -void IF_QUAD::updateChecksum(VariantID vid, size_t tid) +void IF_QUAD::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_x1, getActualProblemSize(), checksum_scale_factor ); - checksum[vid][tid] += calcChecksum(m_x2, getActualProblemSize(), checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_x1, getActualProblemSize(), checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_x2, getActualProblemSize(), checksum_scale_factor ); } -void IF_QUAD::tearDown(VariantID vid, size_t /*tid*/) +void IF_QUAD::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_a); diff --git a/src/basic/IF_QUAD.hpp b/src/basic/IF_QUAD.hpp index ad5a571ba..eb949b510 100644 --- a/src/basic/IF_QUAD.hpp +++ b/src/basic/IF_QUAD.hpp @@ -60,15 +60,15 @@ class IF_QUAD : public KernelBase ~IF_QUAD(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - 
void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/basic/INIT3-Cuda.cpp b/src/basic/INIT3-Cuda.cpp index 1a5c9b0f3..3faa015fb 100644 --- a/src/basic/INIT3-Cuda.cpp +++ b/src/basic/INIT3-Cuda.cpp @@ -119,13 +119,13 @@ void INIT3::runCudaVariantImpl(VariantID vid) } } -void INIT3::runCudaVariant(VariantID vid, size_t tid) +void INIT3::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/basic/INIT3-Hip.cpp b/src/basic/INIT3-Hip.cpp index d6e80dfcc..2a575e9ec 100644 --- a/src/basic/INIT3-Hip.cpp +++ b/src/basic/INIT3-Hip.cpp @@ -121,13 +121,13 @@ void INIT3::runHipVariantImpl(VariantID vid) } } -void INIT3::runHipVariant(VariantID vid, size_t tid) +void INIT3::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/basic/INIT3-OMP.cpp b/src/basic/INIT3-OMP.cpp index f8daa6169..70cd4df1e 100644 --- a/src/basic/INIT3-OMP.cpp +++ b/src/basic/INIT3-OMP.cpp @@ -18,7 +18,7 @@ namespace basic { -void INIT3::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void INIT3::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/INIT3-OMPTarget.cpp b/src/basic/INIT3-OMPTarget.cpp index b82544965..10271a859 100644 --- a/src/basic/INIT3-OMPTarget.cpp +++ b/src/basic/INIT3-OMPTarget.cpp @@ -47,7 +47,7 @@ namespace basic deallocOpenMPDeviceData(in2, did); -void INIT3::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void INIT3::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/INIT3-Seq.cpp b/src/basic/INIT3-Seq.cpp index 40afdfeec..429997569 100644 --- a/src/basic/INIT3-Seq.cpp +++ b/src/basic/INIT3-Seq.cpp @@ -18,7 +18,7 @@ namespace basic { -void INIT3::runSeqVariant(VariantID vid, size_t /*tid*/) +void INIT3::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/INIT3.cpp b/src/basic/INIT3.cpp index ac58d5b8f..567ded16f 100644 --- a/src/basic/INIT3.cpp +++ b/src/basic/INIT3.cpp @@ -57,7 +57,7 @@ INIT3::~INIT3() { } -void INIT3::setUp(VariantID vid, size_t /*tid*/) +void INIT3::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitDataConst(m_out1, getActualProblemSize(), 0.0, vid); allocAndInitDataConst(m_out2, getActualProblemSize(), 0.0, vid); @@ -66,14 +66,14 @@ void INIT3::setUp(VariantID vid, size_t /*tid*/) allocAndInitData(m_in2, getActualProblemSize(), 
vid); } -void INIT3::updateChecksum(VariantID vid, size_t tid) +void INIT3::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_out1, getActualProblemSize()); - checksum[vid][tid] += calcChecksum(m_out2, getActualProblemSize()); - checksum[vid][tid] += calcChecksum(m_out3, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_out1, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_out2, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_out3, getActualProblemSize()); } -void INIT3::tearDown(VariantID vid, size_t /*tid*/) +void INIT3::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_out1); diff --git a/src/basic/INIT3.hpp b/src/basic/INIT3.hpp index 56caab7e4..a53f105a9 100644 --- a/src/basic/INIT3.hpp +++ b/src/basic/INIT3.hpp @@ -46,15 +46,15 @@ class INIT3 : public KernelBase ~INIT3(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/basic/INIT_VIEW1D-Cuda.cpp b/src/basic/INIT_VIEW1D-Cuda.cpp index 2ec82665c..6e8665ea2 100644 --- a/src/basic/INIT_VIEW1D-Cuda.cpp +++ b/src/basic/INIT_VIEW1D-Cuda.cpp @@ -110,13 +110,13 @@ void INIT_VIEW1D::runCudaVariantImpl(VariantID vid) } } -void INIT_VIEW1D::runCudaVariant(VariantID vid, size_t tid) +void INIT_VIEW1D::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/basic/INIT_VIEW1D-Hip.cpp b/src/basic/INIT_VIEW1D-Hip.cpp index e3f37e6a3..ea46befeb 100644 --- a/src/basic/INIT_VIEW1D-Hip.cpp +++ b/src/basic/INIT_VIEW1D-Hip.cpp @@ -113,13 +113,13 @@ void INIT_VIEW1D::runHipVariantImpl(VariantID vid) } } -void INIT_VIEW1D::runHipVariant(VariantID vid, size_t tid) +void INIT_VIEW1D::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/basic/INIT_VIEW1D-OMP.cpp b/src/basic/INIT_VIEW1D-OMP.cpp index ffcfe131e..b09cd2656 100644 --- a/src/basic/INIT_VIEW1D-OMP.cpp +++ b/src/basic/INIT_VIEW1D-OMP.cpp @@ -18,7 +18,7 @@ namespace basic { -void INIT_VIEW1D::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void INIT_VIEW1D::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/INIT_VIEW1D-OMPTarget.cpp 
b/src/basic/INIT_VIEW1D-OMPTarget.cpp index 19f161f88..bedd30737 100644 --- a/src/basic/INIT_VIEW1D-OMPTarget.cpp +++ b/src/basic/INIT_VIEW1D-OMPTarget.cpp @@ -37,7 +37,7 @@ namespace basic deallocOpenMPDeviceData(a, did); -void INIT_VIEW1D::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void INIT_VIEW1D::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/INIT_VIEW1D-Seq.cpp b/src/basic/INIT_VIEW1D-Seq.cpp index ac1e4c868..483786c8f 100644 --- a/src/basic/INIT_VIEW1D-Seq.cpp +++ b/src/basic/INIT_VIEW1D-Seq.cpp @@ -18,7 +18,7 @@ namespace basic { -void INIT_VIEW1D::runSeqVariant(VariantID vid, size_t /*tid*/) +void INIT_VIEW1D::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/INIT_VIEW1D.cpp b/src/basic/INIT_VIEW1D.cpp index ab63fbd71..49545a622 100644 --- a/src/basic/INIT_VIEW1D.cpp +++ b/src/basic/INIT_VIEW1D.cpp @@ -58,18 +58,18 @@ INIT_VIEW1D::~INIT_VIEW1D() { } -void INIT_VIEW1D::setUp(VariantID vid, size_t /*tid*/) +void INIT_VIEW1D::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitDataConst(m_a, getActualProblemSize(), 0.0, vid); m_val = 0.00000123; } -void INIT_VIEW1D::updateChecksum(VariantID vid, size_t tid) +void INIT_VIEW1D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_a, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_a, getActualProblemSize()); } -void INIT_VIEW1D::tearDown(VariantID vid, size_t /*tid*/) +void INIT_VIEW1D::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_a); diff --git a/src/basic/INIT_VIEW1D.hpp b/src/basic/INIT_VIEW1D.hpp index 937641a3a..00b9653ab 100644 --- a/src/basic/INIT_VIEW1D.hpp +++ b/src/basic/INIT_VIEW1D.hpp @@ -57,15 +57,15 @@ class INIT_VIEW1D : public KernelBase ~INIT_VIEW1D(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp b/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp index 59e8d9e71..c64c1f391 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp @@ -113,13 +113,13 @@ void INIT_VIEW1D_OFFSET::runCudaVariantImpl(VariantID vid) } } -void INIT_VIEW1D_OFFSET::runCudaVariant(VariantID vid, size_t tid) +void INIT_VIEW1D_OFFSET::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; 
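[Editor's note, not part of the patch] The GPU hunks above all repeat one dispatch pattern: the run*Variant method walks the compile-time list of block sizes, counts only the sizes allowed by the run parameters, and runs the implementation whose position matches the renamed tune_idx argument. The sketch below illustrates that mapping only; it uses simplified runtime types as stand-ins for gpu_block_sizes_type, seq_for, and the RunParams block-size filter, so names here are illustrative rather than the suite's API.

    // Sketch: runtime loop standing in for the suite's compile-time seq_for dispatch.
    #include <cstddef>
    #include <vector>

    using BlockSizeList = std::vector<std::size_t>;  // stand-in for gpu_block_sizes_type

    void runGpuVariant(std::size_t tune_idx,
                       const BlockSizeList& block_sizes,
                       bool (*isValidBlockSize)(std::size_t))
    {
      std::size_t t = 0;
      for (std::size_t block_size : block_sizes) {
        // The real code also accepts every size when no block-size filter is given.
        if (isValidBlockSize(block_size)) {
          if (tune_idx == t) {
            // runCudaVariantImpl<block_size>(vid) / runHipVariantImpl<block_size>(vid)
            // would execute here for the selected tuning.
          }
          t += 1;  // each valid block size occupies one tuning index
        }
      }
    }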
diff --git a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp index 91af9cadb..299c293e6 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp @@ -114,13 +114,13 @@ void INIT_VIEW1D_OFFSET::runHipVariantImpl(VariantID vid) } } -void INIT_VIEW1D_OFFSET::runHipVariant(VariantID vid, size_t tid) +void INIT_VIEW1D_OFFSET::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/basic/INIT_VIEW1D_OFFSET-OMP.cpp b/src/basic/INIT_VIEW1D_OFFSET-OMP.cpp index d32832f86..783cfbcd1 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-OMP.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-OMP.cpp @@ -18,7 +18,7 @@ namespace basic { -void INIT_VIEW1D_OFFSET::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void INIT_VIEW1D_OFFSET::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp b/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp index 43b2587d1..75663f3bd 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp @@ -37,7 +37,7 @@ namespace basic deallocOpenMPDeviceData(a, did); -void INIT_VIEW1D_OFFSET::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void INIT_VIEW1D_OFFSET::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; diff --git a/src/basic/INIT_VIEW1D_OFFSET-Seq.cpp b/src/basic/INIT_VIEW1D_OFFSET-Seq.cpp index 5731c30c0..435b54f3a 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Seq.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Seq.cpp @@ -18,7 +18,7 @@ namespace basic { -void INIT_VIEW1D_OFFSET::runSeqVariant(VariantID vid, size_t /*tid*/) +void INIT_VIEW1D_OFFSET::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; diff --git a/src/basic/INIT_VIEW1D_OFFSET.cpp b/src/basic/INIT_VIEW1D_OFFSET.cpp index 7e79d7449..ae880290e 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET.cpp @@ -58,18 +58,18 @@ INIT_VIEW1D_OFFSET::~INIT_VIEW1D_OFFSET() { } -void INIT_VIEW1D_OFFSET::setUp(VariantID vid, size_t /*tid*/) +void INIT_VIEW1D_OFFSET::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitDataConst(m_a, getActualProblemSize(), 0.0, vid); m_val = 0.00000123; } -void INIT_VIEW1D_OFFSET::updateChecksum(VariantID vid, size_t tid) +void INIT_VIEW1D_OFFSET::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_a, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_a, getActualProblemSize()); } -void INIT_VIEW1D_OFFSET::tearDown(VariantID vid, size_t /*tid*/) +void INIT_VIEW1D_OFFSET::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_a); diff --git a/src/basic/INIT_VIEW1D_OFFSET.hpp b/src/basic/INIT_VIEW1D_OFFSET.hpp index 82e0a15e1..e7af48895 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.hpp +++ b/src/basic/INIT_VIEW1D_OFFSET.hpp @@ -56,15 +56,15 @@ class INIT_VIEW1D_OFFSET : public KernelBase ~INIT_VIEW1D_OFFSET(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void 
runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/basic/MAT_MAT_SHARED-Cuda.cpp b/src/basic/MAT_MAT_SHARED-Cuda.cpp index 221b3f400..84fdb9a76 100644 --- a/src/basic/MAT_MAT_SHARED-Cuda.cpp +++ b/src/basic/MAT_MAT_SHARED-Cuda.cpp @@ -305,13 +305,13 @@ void MAT_MAT_SHARED::runCudaVariantImpl(VariantID vid) } } -void MAT_MAT_SHARED::runCudaVariant(VariantID vid, size_t tid) +void MAT_MAT_SHARED::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/basic/MAT_MAT_SHARED-Hip.cpp b/src/basic/MAT_MAT_SHARED-Hip.cpp index 4f82e3fb5..efddcab60 100644 --- a/src/basic/MAT_MAT_SHARED-Hip.cpp +++ b/src/basic/MAT_MAT_SHARED-Hip.cpp @@ -308,13 +308,13 @@ void MAT_MAT_SHARED::runHipVariantImpl(VariantID vid) } } -void MAT_MAT_SHARED::runHipVariant(VariantID vid, size_t tid) +void MAT_MAT_SHARED::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/basic/MAT_MAT_SHARED-OMP.cpp b/src/basic/MAT_MAT_SHARED-OMP.cpp index e9d232f37..7bded7169 100644 --- a/src/basic/MAT_MAT_SHARED-OMP.cpp +++ b/src/basic/MAT_MAT_SHARED-OMP.cpp @@ -15,7 +15,7 @@ namespace rajaperf { namespace basic { -void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid, size_t /*tid*/) { +void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) const Index_type run_reps = getRunReps(); diff --git a/src/basic/MAT_MAT_SHARED-OMPTarget.cpp b/src/basic/MAT_MAT_SHARED-OMPTarget.cpp index 9b65dc204..fdaab0d00 100644 --- a/src/basic/MAT_MAT_SHARED-OMPTarget.cpp +++ b/src/basic/MAT_MAT_SHARED-OMPTarget.cpp @@ -20,7 +20,7 @@ namespace rajaperf { namespace basic { - void MAT_MAT_SHARED::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) { + void MAT_MAT_SHARED::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); switch (vid) { diff --git a/src/basic/MAT_MAT_SHARED-Seq.cpp b/src/basic/MAT_MAT_SHARED-Seq.cpp index dd46ea46d..039def8c7 100644 --- a/src/basic/MAT_MAT_SHARED-Seq.cpp +++ b/src/basic/MAT_MAT_SHARED-Seq.cpp @@ -13,7 +13,7 @@ namespace rajaperf { namespace basic { -void MAT_MAT_SHARED::runSeqVariant(VariantID vid, size_t /*tid*/) { +void MAT_MAT_SHARED::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type N = m_N; diff --git a/src/basic/MAT_MAT_SHARED.cpp b/src/basic/MAT_MAT_SHARED.cpp index 
fa92bdb13..09be79506 100644 --- a/src/basic/MAT_MAT_SHARED.cpp +++ b/src/basic/MAT_MAT_SHARED.cpp @@ -64,7 +64,7 @@ MAT_MAT_SHARED::MAT_MAT_SHARED(const RunParams ¶ms) MAT_MAT_SHARED::~MAT_MAT_SHARED() {} -void MAT_MAT_SHARED::setUp(VariantID vid, size_t /*tid*/) { +void MAT_MAT_SHARED::setUp(VariantID vid, size_t /*tune_idx*/) { const Index_type NN = m_N * m_N; allocAndInitDataConst(m_A, NN, 1.0, vid); @@ -72,11 +72,11 @@ void MAT_MAT_SHARED::setUp(VariantID vid, size_t /*tid*/) { allocAndInitDataConst(m_C, NN, 0.0, vid); } -void MAT_MAT_SHARED::updateChecksum(VariantID vid, size_t tid) { - checksum[vid][tid] += calcChecksum(m_C, m_N*m_N, checksum_scale_factor ); +void MAT_MAT_SHARED::updateChecksum(VariantID vid, size_t tune_idx) { + checksum[vid][tune_idx] += calcChecksum(m_C, m_N*m_N, checksum_scale_factor ); } -void MAT_MAT_SHARED::tearDown(VariantID vid, size_t /*tid*/) { +void MAT_MAT_SHARED::tearDown(VariantID vid, size_t /*tune_idx*/) { (void)vid; deallocData(m_A); deallocData(m_B); diff --git a/src/basic/MAT_MAT_SHARED.hpp b/src/basic/MAT_MAT_SHARED.hpp index b689113dc..772de050c 100644 --- a/src/basic/MAT_MAT_SHARED.hpp +++ b/src/basic/MAT_MAT_SHARED.hpp @@ -146,15 +146,15 @@ class MAT_MAT_SHARED : public KernelBase { ~MAT_MAT_SHARED(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/basic/MULADDSUB-Cuda.cpp b/src/basic/MULADDSUB-Cuda.cpp index 2c2c90009..a61585806 100644 --- a/src/basic/MULADDSUB-Cuda.cpp +++ b/src/basic/MULADDSUB-Cuda.cpp @@ -119,13 +119,13 @@ void MULADDSUB::runCudaVariantImpl(VariantID vid) } } -void MULADDSUB::runCudaVariant(VariantID vid, size_t tid) +void MULADDSUB::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/basic/MULADDSUB-Hip.cpp b/src/basic/MULADDSUB-Hip.cpp index 6b0dc24ee..4cc0bbfa7 100644 --- a/src/basic/MULADDSUB-Hip.cpp +++ b/src/basic/MULADDSUB-Hip.cpp @@ -121,13 +121,13 @@ void MULADDSUB::runHipVariantImpl(VariantID vid) } } -void MULADDSUB::runHipVariant(VariantID vid, size_t tid) +void MULADDSUB::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/basic/MULADDSUB-OMP.cpp b/src/basic/MULADDSUB-OMP.cpp index 53ec11408..313c07d2c 100644 --- a/src/basic/MULADDSUB-OMP.cpp +++ 
b/src/basic/MULADDSUB-OMP.cpp @@ -18,7 +18,7 @@ namespace basic { -void MULADDSUB::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void MULADDSUB::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/MULADDSUB-OMPTarget.cpp b/src/basic/MULADDSUB-OMPTarget.cpp index 84eb482c7..cf4909a26 100644 --- a/src/basic/MULADDSUB-OMPTarget.cpp +++ b/src/basic/MULADDSUB-OMPTarget.cpp @@ -47,7 +47,7 @@ namespace basic deallocOpenMPDeviceData(in2, did); -void MULADDSUB::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void MULADDSUB::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/MULADDSUB-Seq.cpp b/src/basic/MULADDSUB-Seq.cpp index 0e896ffdf..d67809ffb 100644 --- a/src/basic/MULADDSUB-Seq.cpp +++ b/src/basic/MULADDSUB-Seq.cpp @@ -18,7 +18,7 @@ namespace basic { -void MULADDSUB::runSeqVariant(VariantID vid, size_t /*tid*/) +void MULADDSUB::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/MULADDSUB.cpp b/src/basic/MULADDSUB.cpp index ef5348b66..8aca466b5 100644 --- a/src/basic/MULADDSUB.cpp +++ b/src/basic/MULADDSUB.cpp @@ -57,7 +57,7 @@ MULADDSUB::~MULADDSUB() { } -void MULADDSUB::setUp(VariantID vid, size_t /*tid*/) +void MULADDSUB::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitDataConst(m_out1, getActualProblemSize(), 0.0, vid); allocAndInitDataConst(m_out2, getActualProblemSize(), 0.0, vid); @@ -66,14 +66,14 @@ void MULADDSUB::setUp(VariantID vid, size_t /*tid*/) allocAndInitData(m_in2, getActualProblemSize(), vid); } -void MULADDSUB::updateChecksum(VariantID vid, size_t tid) +void MULADDSUB::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_out1, getActualProblemSize()); - checksum[vid][tid] += calcChecksum(m_out2, getActualProblemSize()); - checksum[vid][tid] += calcChecksum(m_out3, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_out1, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_out2, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_out3, getActualProblemSize()); } -void MULADDSUB::tearDown(VariantID vid, size_t /*tid*/) +void MULADDSUB::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_out1); diff --git a/src/basic/MULADDSUB.hpp b/src/basic/MULADDSUB.hpp index 320c1925e..5cf36f996 100644 --- a/src/basic/MULADDSUB.hpp +++ b/src/basic/MULADDSUB.hpp @@ -49,15 +49,15 @@ class MULADDSUB : public KernelBase ~MULADDSUB(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); 
void setHipTuningDefinitions(VariantID vid); diff --git a/src/basic/NESTED_INIT-Cuda.cpp b/src/basic/NESTED_INIT-Cuda.cpp index dac0f4635..a51c0d563 100644 --- a/src/basic/NESTED_INIT-Cuda.cpp +++ b/src/basic/NESTED_INIT-Cuda.cpp @@ -172,13 +172,13 @@ void NESTED_INIT::runCudaVariantImpl(VariantID vid) } } -void NESTED_INIT::runCudaVariant(VariantID vid, size_t tid) +void NESTED_INIT::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/basic/NESTED_INIT-Hip.cpp b/src/basic/NESTED_INIT-Hip.cpp index ebf5c66fd..9667a0622 100644 --- a/src/basic/NESTED_INIT-Hip.cpp +++ b/src/basic/NESTED_INIT-Hip.cpp @@ -174,13 +174,13 @@ void NESTED_INIT::runHipVariantImpl(VariantID vid) } } -void NESTED_INIT::runHipVariant(VariantID vid, size_t tid) +void NESTED_INIT::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/basic/NESTED_INIT-OMP.cpp b/src/basic/NESTED_INIT-OMP.cpp index 6588d3a85..4819bb69d 100644 --- a/src/basic/NESTED_INIT-OMP.cpp +++ b/src/basic/NESTED_INIT-OMP.cpp @@ -21,7 +21,7 @@ namespace basic #undef USE_OMP_COLLAPSE -void NESTED_INIT::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void NESTED_INIT::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/NESTED_INIT-OMPTarget.cpp b/src/basic/NESTED_INIT-OMPTarget.cpp index c025e1151..3ca91e5fc 100644 --- a/src/basic/NESTED_INIT-OMPTarget.cpp +++ b/src/basic/NESTED_INIT-OMPTarget.cpp @@ -32,7 +32,7 @@ namespace basic deallocOpenMPDeviceData(array, did); -void NESTED_INIT::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void NESTED_INIT::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); diff --git a/src/basic/NESTED_INIT-Seq.cpp b/src/basic/NESTED_INIT-Seq.cpp index 032d11fcc..57a948c64 100644 --- a/src/basic/NESTED_INIT-Seq.cpp +++ b/src/basic/NESTED_INIT-Seq.cpp @@ -18,7 +18,7 @@ namespace basic { -void NESTED_INIT::runSeqVariant(VariantID vid, size_t /*tid*/) +void NESTED_INIT::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); diff --git a/src/basic/NESTED_INIT.cpp b/src/basic/NESTED_INIT.cpp index 759f3a58c..9604e39ac 100644 --- a/src/basic/NESTED_INIT.cpp +++ b/src/basic/NESTED_INIT.cpp @@ -68,17 +68,17 @@ NESTED_INIT::~NESTED_INIT() { } -void NESTED_INIT::setUp(VariantID vid, size_t /*tid*/) +void NESTED_INIT::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitDataConst(m_array, m_array_length, 0.0, vid); } -void NESTED_INIT::updateChecksum(VariantID vid, size_t tid) +void NESTED_INIT::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_array, m_array_length); + checksum[vid][tune_idx] += calcChecksum(m_array, m_array_length); } -void NESTED_INIT::tearDown(VariantID vid, size_t /*tid*/) +void NESTED_INIT::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; RAJA::free_aligned(m_array); diff --git a/src/basic/NESTED_INIT.hpp b/src/basic/NESTED_INIT.hpp index 9e7f5d24b..a2213abcc 100644 --- a/src/basic/NESTED_INIT.hpp +++ 
b/src/basic/NESTED_INIT.hpp @@ -49,15 +49,15 @@ class NESTED_INIT : public KernelBase ~NESTED_INIT(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/basic/PI_ATOMIC-Cuda.cpp b/src/basic/PI_ATOMIC-Cuda.cpp index 71a392694..fa619d246 100644 --- a/src/basic/PI_ATOMIC-Cuda.cpp +++ b/src/basic/PI_ATOMIC-Cuda.cpp @@ -125,13 +125,13 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) } } -void PI_ATOMIC::runCudaVariant(VariantID vid, size_t tid) +void PI_ATOMIC::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/basic/PI_ATOMIC-Hip.cpp b/src/basic/PI_ATOMIC-Hip.cpp index f2b753b55..85f08df66 100644 --- a/src/basic/PI_ATOMIC-Hip.cpp +++ b/src/basic/PI_ATOMIC-Hip.cpp @@ -127,13 +127,13 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) } } -void PI_ATOMIC::runHipVariant(VariantID vid, size_t tid) +void PI_ATOMIC::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/basic/PI_ATOMIC-OMP.cpp b/src/basic/PI_ATOMIC-OMP.cpp index aca1dd297..75e6cb493 100644 --- a/src/basic/PI_ATOMIC-OMP.cpp +++ b/src/basic/PI_ATOMIC-OMP.cpp @@ -18,7 +18,7 @@ namespace basic { -void PI_ATOMIC::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void PI_ATOMIC::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/PI_ATOMIC-OMPTarget.cpp b/src/basic/PI_ATOMIC-OMPTarget.cpp index 04511fbfd..e395d46d9 100644 --- a/src/basic/PI_ATOMIC-OMPTarget.cpp +++ b/src/basic/PI_ATOMIC-OMPTarget.cpp @@ -36,7 +36,7 @@ namespace basic deallocOpenMPDeviceData(pi, did); -void PI_ATOMIC::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void PI_ATOMIC::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/PI_ATOMIC-Seq.cpp b/src/basic/PI_ATOMIC-Seq.cpp index f8e1bbf33..b92bf8c54 100644 --- a/src/basic/PI_ATOMIC-Seq.cpp +++ b/src/basic/PI_ATOMIC-Seq.cpp @@ -18,7 +18,7 @@ namespace basic { -void PI_ATOMIC::runSeqVariant(VariantID vid, size_t /*tid*/) +void PI_ATOMIC::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git 
a/src/basic/PI_ATOMIC.cpp b/src/basic/PI_ATOMIC.cpp index 63e8cee35..005a40ccd 100644 --- a/src/basic/PI_ATOMIC.cpp +++ b/src/basic/PI_ATOMIC.cpp @@ -59,19 +59,19 @@ PI_ATOMIC::~PI_ATOMIC() { } -void PI_ATOMIC::setUp(VariantID vid, size_t /*tid*/) +void PI_ATOMIC::setUp(VariantID vid, size_t /*tune_idx*/) { m_dx = 1.0 / double(getActualProblemSize()); allocAndInitDataConst(m_pi, 1, 0.0, vid); m_pi_init = 0.0; } -void PI_ATOMIC::updateChecksum(VariantID vid, size_t tid) +void PI_ATOMIC::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += Checksum_type(*m_pi); + checksum[vid][tune_idx] += Checksum_type(*m_pi); } -void PI_ATOMIC::tearDown(VariantID vid, size_t /*tid*/) +void PI_ATOMIC::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_pi); diff --git a/src/basic/PI_ATOMIC.hpp b/src/basic/PI_ATOMIC.hpp index ed3e6a38d..67c6a29c2 100644 --- a/src/basic/PI_ATOMIC.hpp +++ b/src/basic/PI_ATOMIC.hpp @@ -45,15 +45,15 @@ class PI_ATOMIC : public KernelBase ~PI_ATOMIC(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/basic/PI_REDUCE-Cuda.cpp b/src/basic/PI_REDUCE-Cuda.cpp index 9fea6d8be..28c0c470f 100644 --- a/src/basic/PI_REDUCE-Cuda.cpp +++ b/src/basic/PI_REDUCE-Cuda.cpp @@ -117,13 +117,13 @@ void PI_REDUCE::runCudaVariantImpl(VariantID vid) } } -void PI_REDUCE::runCudaVariant(VariantID vid, size_t tid) +void PI_REDUCE::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/basic/PI_REDUCE-Hip.cpp b/src/basic/PI_REDUCE-Hip.cpp index 7a1eb57ed..908517140 100644 --- a/src/basic/PI_REDUCE-Hip.cpp +++ b/src/basic/PI_REDUCE-Hip.cpp @@ -116,13 +116,13 @@ void PI_REDUCE::runHipVariantImpl(VariantID vid) } } -void PI_REDUCE::runHipVariant(VariantID vid, size_t tid) +void PI_REDUCE::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/basic/PI_REDUCE-OMP.cpp b/src/basic/PI_REDUCE-OMP.cpp index e16a8589c..cefa9dc9f 100644 --- a/src/basic/PI_REDUCE-OMP.cpp +++ b/src/basic/PI_REDUCE-OMP.cpp @@ -18,7 +18,7 @@ namespace basic { -void PI_REDUCE::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void PI_REDUCE::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && 
defined(RUN_OPENMP) diff --git a/src/basic/PI_REDUCE-OMPTarget.cpp b/src/basic/PI_REDUCE-OMPTarget.cpp index 445e5caa7..f1740d159 100644 --- a/src/basic/PI_REDUCE-OMPTarget.cpp +++ b/src/basic/PI_REDUCE-OMPTarget.cpp @@ -27,7 +27,7 @@ namespace basic const size_t threads_per_team = 256; -void PI_REDUCE::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void PI_REDUCE::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/PI_REDUCE-Seq.cpp b/src/basic/PI_REDUCE-Seq.cpp index 66cecec70..58cb06e80 100644 --- a/src/basic/PI_REDUCE-Seq.cpp +++ b/src/basic/PI_REDUCE-Seq.cpp @@ -18,7 +18,7 @@ namespace basic { -void PI_REDUCE::runSeqVariant(VariantID vid, size_t /*tid*/) +void PI_REDUCE::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/PI_REDUCE.cpp b/src/basic/PI_REDUCE.cpp index ffbd8ab7e..c093d16cb 100644 --- a/src/basic/PI_REDUCE.cpp +++ b/src/basic/PI_REDUCE.cpp @@ -57,7 +57,7 @@ PI_REDUCE::~PI_REDUCE() { } -void PI_REDUCE::setUp(VariantID vid, size_t /*tid*/) +void PI_REDUCE::setUp(VariantID vid, size_t /*tune_idx*/) { (void) vid; m_dx = 1.0 / double(getActualProblemSize()); @@ -65,12 +65,12 @@ void PI_REDUCE::setUp(VariantID vid, size_t /*tid*/) m_pi = 0.0; } -void PI_REDUCE::updateChecksum(VariantID vid, size_t tid) +void PI_REDUCE::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += Checksum_type(m_pi); + checksum[vid][tune_idx] += Checksum_type(m_pi); } -void PI_REDUCE::tearDown(VariantID vid, size_t /*tid*/) +void PI_REDUCE::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; } diff --git a/src/basic/PI_REDUCE.hpp b/src/basic/PI_REDUCE.hpp index 2805f3c03..901b9959d 100644 --- a/src/basic/PI_REDUCE.hpp +++ b/src/basic/PI_REDUCE.hpp @@ -47,15 +47,15 @@ class PI_REDUCE : public KernelBase ~PI_REDUCE(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/basic/REDUCE3_INT-Cuda.cpp b/src/basic/REDUCE3_INT-Cuda.cpp index 78d837ffe..8940eb405 100644 --- a/src/basic/REDUCE3_INT-Cuda.cpp +++ b/src/basic/REDUCE3_INT-Cuda.cpp @@ -161,13 +161,13 @@ void REDUCE3_INT::runCudaVariantImpl(VariantID vid) } } -void REDUCE3_INT::runCudaVariant(VariantID vid, size_t tid) +void REDUCE3_INT::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git 
a/src/basic/REDUCE3_INT-Hip.cpp b/src/basic/REDUCE3_INT-Hip.cpp index a30cc1e25..9b730bbcd 100644 --- a/src/basic/REDUCE3_INT-Hip.cpp +++ b/src/basic/REDUCE3_INT-Hip.cpp @@ -161,13 +161,13 @@ void REDUCE3_INT::runHipVariantImpl(VariantID vid) } } -void REDUCE3_INT::runHipVariant(VariantID vid, size_t tid) +void REDUCE3_INT::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/basic/REDUCE3_INT-OMP.cpp b/src/basic/REDUCE3_INT-OMP.cpp index fafe7ba09..01c26977d 100644 --- a/src/basic/REDUCE3_INT-OMP.cpp +++ b/src/basic/REDUCE3_INT-OMP.cpp @@ -19,7 +19,7 @@ namespace basic { -void REDUCE3_INT::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void REDUCE3_INT::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/REDUCE3_INT-OMPTarget.cpp b/src/basic/REDUCE3_INT-OMPTarget.cpp index b83ad9db0..98efe668a 100644 --- a/src/basic/REDUCE3_INT-OMPTarget.cpp +++ b/src/basic/REDUCE3_INT-OMPTarget.cpp @@ -36,7 +36,7 @@ namespace basic deallocOpenMPDeviceData(vec, did); \ -void REDUCE3_INT::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void REDUCE3_INT::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/REDUCE3_INT-Seq.cpp b/src/basic/REDUCE3_INT-Seq.cpp index 137dc6b1c..3ad721f04 100644 --- a/src/basic/REDUCE3_INT-Seq.cpp +++ b/src/basic/REDUCE3_INT-Seq.cpp @@ -19,7 +19,7 @@ namespace basic { -void REDUCE3_INT::runSeqVariant(VariantID vid, size_t /*tid*/) +void REDUCE3_INT::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/REDUCE3_INT.cpp b/src/basic/REDUCE3_INT.cpp index b99e31fe3..14e47e201 100644 --- a/src/basic/REDUCE3_INT.cpp +++ b/src/basic/REDUCE3_INT.cpp @@ -62,7 +62,7 @@ REDUCE3_INT::~REDUCE3_INT() { } -void REDUCE3_INT::setUp(VariantID vid, size_t /*tid*/) +void REDUCE3_INT::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitData(m_vec, getActualProblemSize(), vid); @@ -74,14 +74,14 @@ void REDUCE3_INT::setUp(VariantID vid, size_t /*tid*/) m_vmax_init = std::numeric_limits::min(); } -void REDUCE3_INT::updateChecksum(VariantID vid, size_t tid) +void REDUCE3_INT::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += m_vsum; - checksum[vid][tid] += m_vmin; - checksum[vid][tid] += m_vmax; + checksum[vid][tune_idx] += m_vsum; + checksum[vid][tune_idx] += m_vmin; + checksum[vid][tune_idx] += m_vmax; } -void REDUCE3_INT::tearDown(VariantID vid, size_t /*tid*/) +void REDUCE3_INT::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_vec); diff --git a/src/basic/REDUCE3_INT.hpp b/src/basic/REDUCE3_INT.hpp index f3ad7e6f2..1f13e457b 100644 --- a/src/basic/REDUCE3_INT.hpp +++ b/src/basic/REDUCE3_INT.hpp @@ -61,15 +61,15 @@ class REDUCE3_INT : public KernelBase ~REDUCE3_INT(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void 
runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/basic/TRAP_INT-Cuda.cpp b/src/basic/TRAP_INT-Cuda.cpp index a11122367..4d3d74012 100644 --- a/src/basic/TRAP_INT-Cuda.cpp +++ b/src/basic/TRAP_INT-Cuda.cpp @@ -151,13 +151,13 @@ void TRAP_INT::runCudaVariantImpl(VariantID vid) } } -void TRAP_INT::runCudaVariant(VariantID vid, size_t tid) +void TRAP_INT::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/basic/TRAP_INT-Hip.cpp b/src/basic/TRAP_INT-Hip.cpp index b1401e5e4..506028dbd 100644 --- a/src/basic/TRAP_INT-Hip.cpp +++ b/src/basic/TRAP_INT-Hip.cpp @@ -150,13 +150,13 @@ void TRAP_INT::runHipVariantImpl(VariantID vid) } } -void TRAP_INT::runHipVariant(VariantID vid, size_t tid) +void TRAP_INT::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/basic/TRAP_INT-OMP.cpp b/src/basic/TRAP_INT-OMP.cpp index 99ceef91b..b18beff6c 100644 --- a/src/basic/TRAP_INT-OMP.cpp +++ b/src/basic/TRAP_INT-OMP.cpp @@ -32,7 +32,7 @@ Real_type trap_int_func(Real_type x, } -void TRAP_INT::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void TRAP_INT::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/TRAP_INT-OMPTarget.cpp b/src/basic/TRAP_INT-OMPTarget.cpp index 21948bbd9..78b57c1db 100644 --- a/src/basic/TRAP_INT-OMPTarget.cpp +++ b/src/basic/TRAP_INT-OMPTarget.cpp @@ -46,7 +46,7 @@ Real_type trap_int_func(Real_type x, #define TRAP_INT_DATA_TEARDOWN_OMP_TARGET // nothing to do here... 
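[Editor's note, not part of the patch] Throughout the patch, updateChecksum(vid, tune_idx) accumulates into a per-variant vector indexed by the tuning, so each (variant, tuning) pair keeps its own checksum and the reports can compare tunings independently. A simplified sketch of that bookkeeping follows; the storage layout mirrors the checksum[vid][tune_idx] accesses in the hunks, while the reduction is only a placeholder for calcChecksum().

    #include <cstddef>
    #include <vector>

    using Checksum_type = long double;

    struct ChecksumTable {
      // checksum[vid][tune_idx]: one vector per variant, one slot per tuning.
      std::vector<std::vector<Checksum_type>> checksum;

      void resizeVariant(std::size_t vid, std::size_t num_tunings) {
        if (checksum.size() <= vid) checksum.resize(vid + 1);
        checksum[vid].assign(num_tunings, 0.0);
      }

      // Placeholder reduction standing in for calcChecksum().
      void update(std::size_t vid, std::size_t tune_idx,
                  const double* data, std::size_t len) {
        Checksum_type sum = 0.0;
        for (std::size_t i = 0; i < len; ++i) {
          sum += data[i];
        }
        checksum[vid][tune_idx] += sum;  // accumulate into the slot that just ran
      }
    };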
-void TRAP_INT::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void TRAP_INT::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/TRAP_INT-Seq.cpp b/src/basic/TRAP_INT-Seq.cpp index 6312dc6e2..b8af73291 100644 --- a/src/basic/TRAP_INT-Seq.cpp +++ b/src/basic/TRAP_INT-Seq.cpp @@ -32,7 +32,7 @@ Real_type trap_int_func(Real_type x, } -void TRAP_INT::runSeqVariant(VariantID vid, size_t /*tid*/) +void TRAP_INT::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/TRAP_INT.cpp b/src/basic/TRAP_INT.cpp index e01134eef..8e5612c19 100644 --- a/src/basic/TRAP_INT.cpp +++ b/src/basic/TRAP_INT.cpp @@ -57,7 +57,7 @@ TRAP_INT::~TRAP_INT() { } -void TRAP_INT::setUp(VariantID vid, size_t /*tid*/) +void TRAP_INT::setUp(VariantID vid, size_t /*tune_idx*/) { Real_type xn; initData(xn, vid); @@ -74,12 +74,12 @@ void TRAP_INT::setUp(VariantID vid, size_t /*tid*/) m_sumx = 0; } -void TRAP_INT::updateChecksum(VariantID vid, size_t tid) +void TRAP_INT::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += m_sumx; + checksum[vid][tune_idx] += m_sumx; } -void TRAP_INT::tearDown(VariantID vid, size_t /*tid*/) +void TRAP_INT::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; } diff --git a/src/basic/TRAP_INT.hpp b/src/basic/TRAP_INT.hpp index d62dfad5a..f2e714f09 100644 --- a/src/basic/TRAP_INT.hpp +++ b/src/basic/TRAP_INT.hpp @@ -58,15 +58,15 @@ class TRAP_INT : public KernelBase ~TRAP_INT(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index bc600bd69..1adc7e439 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -43,7 +43,7 @@ using namespace std; Executor::Executor(int argc, char** argv) : run_params(argc, argv), reference_vid(NumVariants), - reference_tid(KernelBase::getUnknownTuningIdx()) + reference_tune_idx(KernelBase::getUnknownTuningIdx()) { } @@ -449,7 +449,7 @@ void Executor::setupSuite() run_var.insert( vid ); if ( getVariantName(vid) == run_params.getReferenceVariant() ) { reference_vid = vid; - reference_tid = 0; + reference_tune_idx = 0; } } } @@ -459,7 +459,7 @@ void Executor::setupSuite() // if ( run_params.getReferenceVariant().empty() && !run_var.empty() ) { reference_vid = *run_var.begin(); - reference_tid = 0; + reference_tune_idx = 0; } } else { @@ -487,7 +487,7 @@ void Executor::setupSuite() run_var.insert(vid); if ( getVariantName(vid) == run_params.getReferenceVariant() ) { reference_vid = vid; - reference_tid = 0; + 
reference_tune_idx = 0; } } found_it = true; @@ -502,7 +502,7 @@ void Executor::setupSuite() // if ( run_params.getReferenceVariant().empty() && !run_var.empty() ) { reference_vid = *run_var.begin(); - reference_tid = 0; + reference_tune_idx = 0; } run_params.setInvalidVariantInput(invalid); @@ -847,13 +847,13 @@ void Executor::runKernel(KernelBase* kernel, bool print_kernel_name) getCout() << getVariantName(vid) << " variant" << endl; } - for (size_t tid = 0; tid < kernel->getNumVariantTunings(vid); ++tid) { + for (size_t tune_idx = 0; tune_idx < kernel->getNumVariantTunings(vid); ++tune_idx) { if ( run_params.showProgress() ) { getCout() << " Running " - << kernel->getVariantTuningName(vid, tid) << " tuning"; + << kernel->getVariantTuningName(vid, tune_idx) << " tuning"; } - kernel->execute(vid, tid); + kernel->execute(vid, tune_idx); if ( run_params.showProgress() ) { getCout() << " -- " << kernel->getLastTime() << " sec." << endl; } @@ -1006,7 +1006,7 @@ void Executor::writeCSVReport(ostream& file, CSVRepMode mode, std::string const& tuning_name = tuning_names[variant_ids[iv]][it]; file << sepchr <hasVariantTuningDefined(reference_vid, reference_tid) || + (!kern->hasVariantTuningDefined(reference_vid, reference_tune_idx) || !kern->hasVariantTuningDefined(vid, tuning_name)) ) { file << "Not run"; } else if ( (mode == CSVRepMode::Timing) && @@ -1148,22 +1148,22 @@ void Executor::writeFOMReport(ostream& file, vector& fom_groups) for (const string& tuning_name : tuning_names[vid]) { - size_t tid = kern->getVariantTuningIndex(vid, tuning_name); + size_t tune_idx = kern->getVariantTuningIndex(vid, tuning_name); // // If kernel variant was run, generate data for it and // print (signed) percentage difference from baseline. // - if ( kern->wasVariantTuningRun(vid, tid) ) { + if ( kern->wasVariantTuningRun(vid, tune_idx) ) { col_exec_count[ifg][col]++; bool is_base = (base_totTime == unknown_totTime); if (is_base) { - base_totTime = kern->getTotTime(vid, tid); + base_totTime = kern->getTotTime(vid, tune_idx); } pct_diff[ik][ifg][col] = - (kern->getTotTime(vid, tid) - base_totTime) / base_totTime; + (kern->getTotTime(vid, tune_idx) - base_totTime) / base_totTime; string pfstring(pass); if (pct_diff[ik][ifg][col] > run_params.getPFTolerance()) { @@ -1230,9 +1230,9 @@ void Executor::writeFOMReport(ostream& file, vector& fom_groups) for (const string& tuning_name : tuning_names[vid]) { - size_t tid = kern->getVariantTuningIndex(vid, tuning_name); + size_t tune_idx = kern->getVariantTuningIndex(vid, tuning_name); - if ( kern->wasVariantTuningRun(vid, tid) ) { + if ( kern->wasVariantTuningRun(vid, tune_idx) ) { col_stddev[ifg][col] += ( pct_diff[ik][ifg][col] - col_avg[ifg][col] ) * ( pct_diff[ik][ifg][col] - col_avg[ifg][col] ); } @@ -1393,9 +1393,9 @@ void Executor::writeChecksumReport(ostream& file) while ( ivck < variant_ids.size() && !found_ref ) { VariantID vid = variant_ids[ivck]; size_t num_tunings = kern->getNumVariantTunings(vid); - for (size_t tid = 0; tid < num_tunings; ++tid) { - if ( kern->wasVariantTuningRun(vid, tid) ) { - cksum_ref = kern->getChecksum(vid, tid); + for (size_t tune_idx = 0; tune_idx < num_tunings; ++tune_idx) { + if ( kern->wasVariantTuningRun(vid, tune_idx) ) { + cksum_ref = kern->getChecksum(vid, tune_idx); found_ref = true; break; } @@ -1412,10 +1412,10 @@ void Executor::writeChecksumReport(ostream& file) checksums[iv].resize(num_tunings, 0.0); checksums_diff[iv].resize(num_tunings, 0.0); - for (size_t tid = 0; tid < num_tunings; ++tid) { - if ( 
kern->wasVariantTuningRun(vid, tid) ) { - checksums[iv][tid] = kern->getChecksum(vid, tid); - checksums_diff[iv][tid] = cksum_ref - kern->getChecksum(vid, tid); + for (size_t tune_idx = 0; tune_idx < num_tunings; ++tune_idx) { + if ( kern->wasVariantTuningRun(vid, tune_idx) ) { + checksums[iv][tune_idx] = kern->getChecksum(vid, tune_idx); + checksums_diff[iv][tune_idx] = cksum_ref - kern->getChecksum(vid, tune_idx); } } } @@ -1438,8 +1438,8 @@ void Executor::writeChecksumReport(ostream& file) for (size_t iv = 0; iv < variant_ids.size(); ++iv) { size_t num_tunings = kernels[ik]->getNumVariantTunings(variant_ids[iv]); checksums_avg[iv].resize(num_tunings, 0.0); - for (size_t tid = 0; tid < num_tunings; ++tid) { - checksums_avg[iv][tid] = checksums_sum[iv][tid] / num_ranks; + for (size_t tune_idx = 0; tune_idx < num_tunings; ++tune_idx) { + checksums_avg[iv][tune_idx] = checksums_sum[iv][tune_idx] / num_ranks; } } @@ -1448,8 +1448,8 @@ void Executor::writeChecksumReport(ostream& file) for (size_t iv = 0; iv < variant_ids.size(); ++iv) { size_t num_tunings = kernels[ik]->getNumVariantTunings(variant_ids[iv]); checksums_abs_diff[iv].resize(num_tunings, 0.0); - for (size_t tid = 0; tid < num_tunings; ++tid) { - checksums_abs_diff[iv][tid] = std::abs(checksums_diff[iv][tid]); + for (size_t tune_idx = 0; tune_idx < num_tunings; ++tune_idx) { + checksums_abs_diff[iv][tune_idx] = std::abs(checksums_diff[iv][tune_idx]); } } @@ -1474,8 +1474,8 @@ void Executor::writeChecksumReport(ostream& file) for (size_t iv = 0; iv < variant_ids.size(); ++iv) { size_t num_tunings = kernels[ik]->getNumVariantTunings(variant_ids[iv]); checksums_abs_diff_avg[iv].resize(num_tunings, 0.0); - for (size_t tid = 0; tid < num_tunings; ++tid) { - checksums_abs_diff_avg[iv][tid] = checksums_abs_diff_sum[iv][tid] / num_ranks; + for (size_t tune_idx = 0; tune_idx < num_tunings; ++tune_idx) { + checksums_abs_diff_avg[iv][tune_idx] = checksums_abs_diff_sum[iv][tune_idx] / num_ranks; } } @@ -1483,9 +1483,9 @@ void Executor::writeChecksumReport(ostream& file) for (size_t iv = 0; iv < variant_ids.size(); ++iv) { size_t num_tunings = kernels[ik]->getNumVariantTunings(variant_ids[iv]); checksums_abs_diff_diff2avg2[iv].resize(num_tunings, 0.0); - for (size_t tid = 0; tid < num_tunings; ++tid) { - checksums_abs_diff_diff2avg2[iv][tid] = (checksums_abs_diff[iv][tid] - checksums_abs_diff_avg[iv][tid]) * - (checksums_abs_diff[iv][tid] - checksums_abs_diff_avg[iv][tid]) ; + for (size_t tune_idx = 0; tune_idx < num_tunings; ++tune_idx) { + checksums_abs_diff_diff2avg2[iv][tune_idx] = (checksums_abs_diff[iv][tune_idx] - checksums_abs_diff_avg[iv][tune_idx]) * + (checksums_abs_diff[iv][tune_idx] - checksums_abs_diff_avg[iv][tune_idx]) ; } } @@ -1495,8 +1495,8 @@ void Executor::writeChecksumReport(ostream& file) checksums_abs_diff_stddev[iv].resize(num_tunings, 0.0); MPI_Allreduce(checksums_abs_diff_diff2avg2.data(), checksums_abs_diff_stddev.data(), num_tunings, Checksum_MPI_type, MPI_SUM, MPI_COMM_WORLD); - for (size_t tid = 0; tid < num_tunings; ++tid) { - checksums_abs_diff_stddev[iv][tid] = std::sqrt(checksums_abs_diff_stddev[iv][tid] / num_ranks); + for (size_t tune_idx = 0; tune_idx < num_tunings; ++tune_idx) { + checksums_abs_diff_stddev[iv][tune_idx] = std::sqrt(checksums_abs_diff_stddev[iv][tune_idx] / num_ranks); } } @@ -1507,19 +1507,19 @@ void Executor::writeChecksumReport(ostream& file) const string& variant_name = getVariantName(vid); size_t num_tunings = kernels[ik]->getNumVariantTunings(variant_ids[iv]); - for (size_t tid = 0; 
tid < num_tunings; ++tid) { - const string& tuning_name = kern->getVariantTuningName(vid, tid); + for (size_t tune_idx = 0; tune_idx < num_tunings; ++tune_idx) { + const string& tuning_name = kern->getVariantTuningName(vid, tune_idx); - if ( kern->wasVariantTuningRun(vid, tid) ) { + if ( kern->wasVariantTuningRun(vid, tune_idx) ) { file <getTotTime(vid, tid) / run_params.getNumPasses(); + retval = kern->getTotTime(vid, tune_idx) / run_params.getNumPasses(); } break; case RunParams::CombinerOpt::Minimum : { - retval = kern->getMinTime(vid, tid); + retval = kern->getMinTime(vid, tune_idx); } break; case RunParams::CombinerOpt::Maximum : { - retval = kern->getMaxTime(vid, tid); + retval = kern->getMaxTime(vid, tune_idx); } break; default : { cout << "\n Unknown CSV combiner mode = " << combiner << endl; } @@ -1610,22 +1610,22 @@ long double Executor::getReportDataEntry(CSVRepMode mode, } case CSVRepMode::Speedup : { if ( haveReferenceVariant() ) { - if ( kern->hasVariantTuningDefined(reference_vid, reference_tid) && - kern->hasVariantTuningDefined(vid, tid) ) { + if ( kern->hasVariantTuningDefined(reference_vid, reference_tune_idx) && + kern->hasVariantTuningDefined(vid, tune_idx) ) { switch ( combiner ) { case RunParams::CombinerOpt::Average : { - retval = kern->getTotTime(reference_vid, reference_tid) / - kern->getTotTime(vid, tid); + retval = kern->getTotTime(reference_vid, reference_tune_idx) / + kern->getTotTime(vid, tune_idx); } break; case RunParams::CombinerOpt::Minimum : { - retval = kern->getMinTime(reference_vid, reference_tid) / - kern->getMinTime(vid, tid); + retval = kern->getMinTime(reference_vid, reference_tune_idx) / + kern->getMinTime(vid, tune_idx); } break; case RunParams::CombinerOpt::Maximum : { - retval = kern->getMaxTime(reference_vid, reference_tid) / - kern->getMaxTime(vid, tid); + retval = kern->getMaxTime(reference_vid, reference_tune_idx) / + kern->getMaxTime(vid, tune_idx); } break; default : { cout << "\n Unknown CSV combiner mode = " << combiner << endl; } @@ -1635,10 +1635,10 @@ long double Executor::getReportDataEntry(CSVRepMode mode, } #if 0 // RDH DEBUG (leave this here, it's useful for debugging!) 
getCout() << "Kernel(iv): " << kern->getName() << "(" << vid << ")" - << "(" << tid << ")"endl; + << "(" << tune_idx << ")"endl; getCout() << "\tref_time, tot_time, retval = " - << kern->getTotTime(reference_vid, reference_tid) << " , " - << kern->getTotTime(vid, tid) << " , " + << kern->getTotTime(reference_vid, reference_tune_idx) << " , " + << kern->getTotTime(vid, tune_idx) << " , " << retval << endl; #endif } diff --git a/src/common/Executor.hpp b/src/common/Executor.hpp index 393748288..8a3fe11e0 100644 --- a/src/common/Executor.hpp +++ b/src/common/Executor.hpp @@ -75,7 +75,7 @@ class Executor RunParams::CombinerOpt combiner, size_t prec); std::string getReportTitle(CSVRepMode mode, RunParams::CombinerOpt combiner); long double getReportDataEntry(CSVRepMode mode, RunParams::CombinerOpt combiner, - KernelBase* kern, VariantID vid, size_t tid); + KernelBase* kern, VariantID vid, size_t tune_idx); void writeChecksumReport(std::ostream& file); @@ -88,7 +88,7 @@ class Executor std::vector tuning_names[NumVariants]; VariantID reference_vid; - size_t reference_tid; + size_t reference_tune_idx; }; } // closing brace for rajaperf namespace diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index fcd40d09c..a07a6bbbb 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -145,21 +145,21 @@ void KernelBase::setVariantDefined(VariantID vid) tot_time[vid].resize(variant_tuning_names[vid].size(), 0.0); } -void KernelBase::execute(VariantID vid, size_t tid) +void KernelBase::execute(VariantID vid, size_t tune_idx) { running_variant = vid; - running_tuning = tid; + running_tuning = tune_idx; resetTimer(); resetDataInitCount(); - this->setUp(vid, tid); + this->setUp(vid, tune_idx); - this->runKernel(vid, tid); + this->runKernel(vid, tune_idx); - this->updateChecksum(vid, tid); + this->updateChecksum(vid, tune_idx); - this->tearDown(vid, tid); + this->tearDown(vid, tune_idx); running_variant = NumVariants; running_tuning = getUnknownTuningIdx(); @@ -177,7 +177,7 @@ void KernelBase::recordExecTime() tot_time[running_variant].at(running_tuning) += exec_time; } -void KernelBase::runKernel(VariantID vid, size_t tid) +void KernelBase::runKernel(VariantID vid, size_t tune_idx) { if ( !hasVariantDefined(vid) ) { return; @@ -187,7 +187,7 @@ void KernelBase::runKernel(VariantID vid, size_t tid) case Base_Seq : { - runSeqVariant(vid, tid); + runSeqVariant(vid, tune_idx); break; } @@ -195,7 +195,7 @@ void KernelBase::runKernel(VariantID vid, size_t tid) case RAJA_Seq : { #if defined(RUN_RAJA_SEQ) - runSeqVariant(vid, tid); + runSeqVariant(vid, tune_idx); #endif break; } @@ -205,7 +205,7 @@ void KernelBase::runKernel(VariantID vid, size_t tid) case RAJA_OpenMP : { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - runOpenMPVariant(vid, tid); + runOpenMPVariant(vid, tune_idx); #endif break; } @@ -214,7 +214,7 @@ void KernelBase::runKernel(VariantID vid, size_t tid) case RAJA_OpenMPTarget : { #if defined(RAJA_ENABLE_TARGET_OPENMP) - runOpenMPTargetVariant(vid, tid); + runOpenMPTargetVariant(vid, tune_idx); #endif break; } @@ -224,7 +224,7 @@ void KernelBase::runKernel(VariantID vid, size_t tid) case RAJA_CUDA : { #if defined(RAJA_ENABLE_CUDA) - runCudaVariant(vid, tid); + runCudaVariant(vid, tune_idx); #endif break; } @@ -234,7 +234,7 @@ void KernelBase::runKernel(VariantID vid, size_t tid) case RAJA_HIP : { #if defined(RAJA_ENABLE_HIP) - runHipVariant(vid, tid); + runHipVariant(vid, tune_idx); #endif break; } diff --git a/src/common/KernelBase.hpp 
b/src/common/KernelBase.hpp index b194a35f2..8d74d6e05 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -111,9 +111,9 @@ class KernelBase bool hasVariantDefined(VariantID vid) const { return !variant_tuning_names[vid].empty(); } - bool hasVariantTuningDefined(VariantID vid, size_t tid) const + bool hasVariantTuningDefined(VariantID vid, size_t tune_idx) const { - if (hasVariantDefined(vid) && tid < getNumVariantTunings(vid)) { + if (hasVariantDefined(vid) && tune_idx < getNumVariantTunings(vid)) { return true; } return false; @@ -138,8 +138,8 @@ class KernelBase } size_t getNumVariantTunings(VariantID vid) const { return getVariantTuningNames(vid).size(); } - std::string const& getVariantTuningName(VariantID vid, size_t tid) const - { return getVariantTuningNames(vid).at(tid); } + std::string const& getVariantTuningName(VariantID vid, size_t tune_idx) const + { return getVariantTuningNames(vid).at(tune_idx); } std::vector const& getVariantTuningNames(VariantID vid) const { return variant_tuning_names[vid]; } @@ -147,10 +147,10 @@ class KernelBase // Methods to get information about kernel execution for reports // containing kernel execution information // - bool wasVariantTuningRun(VariantID vid, size_t tid) const + bool wasVariantTuningRun(VariantID vid, size_t tune_idx) const { - if (tid != getUnknownTuningIdx()) { - return num_exec[vid].at(tid) > 0; + if (tune_idx != getUnknownTuningIdx()) { + return num_exec[vid].at(tune_idx) > 0; } return false; } @@ -159,12 +159,12 @@ class KernelBase double getLastTime() const { return timer.elapsed(); } // get timers accumulated over npasses - double getMinTime(VariantID vid, size_t tid) const { return min_time[vid].at(tid); } - double getMaxTime(VariantID vid, size_t tid) const { return max_time[vid].at(tid); } - double getTotTime(VariantID vid, size_t tid) { return tot_time[vid].at(tid); } - Checksum_type getChecksum(VariantID vid, size_t tid) const { return checksum[vid].at(tid); } + double getMinTime(VariantID vid, size_t tune_idx) const { return min_time[vid].at(tune_idx); } + double getMaxTime(VariantID vid, size_t tune_idx) const { return max_time[vid].at(tune_idx); } + double getTotTime(VariantID vid, size_t tune_idx) { return tot_time[vid].at(tune_idx); } + Checksum_type getChecksum(VariantID vid, size_t tune_idx) const { return checksum[vid].at(tune_idx); } - void execute(VariantID vid, size_t tid); + void execute(VariantID vid, size_t tune_idx); void synchronize() { @@ -211,24 +211,24 @@ class KernelBase virtual void print(std::ostream& os) const; - virtual void runKernel(VariantID vid, size_t tid); + virtual void runKernel(VariantID vid, size_t tune_idx); - virtual void setUp(VariantID vid, size_t tid) = 0; - virtual void updateChecksum(VariantID vid, size_t tid) = 0; - virtual void tearDown(VariantID vid, size_t tid) = 0; + virtual void setUp(VariantID vid, size_t tune_idx) = 0; + virtual void updateChecksum(VariantID vid, size_t tune_idx) = 0; + virtual void tearDown(VariantID vid, size_t tune_idx) = 0; - virtual void runSeqVariant(VariantID vid, size_t tid) = 0; + virtual void runSeqVariant(VariantID vid, size_t tune_idx) = 0; #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - virtual void runOpenMPVariant(VariantID vid, size_t tid) = 0; + virtual void runOpenMPVariant(VariantID vid, size_t tune_idx) = 0; #endif #if defined(RAJA_ENABLE_CUDA) - virtual void runCudaVariant(VariantID vid, size_t tid) = 0; + virtual void runCudaVariant(VariantID vid, size_t tune_idx) = 0; #endif #if defined(RAJA_ENABLE_HIP) 
- virtual void runHipVariant(VariantID vid, size_t tid) = 0; + virtual void runHipVariant(VariantID vid, size_t tune_idx) = 0; #endif #if defined(RAJA_ENABLE_TARGET_OPENMP) - virtual void runOpenMPTargetVariant(VariantID vid, size_t tid) = 0; + virtual void runOpenMPTargetVariant(VariantID vid, size_t tune_idx) = 0; #endif protected: diff --git a/src/lcals/DIFF_PREDICT-Cuda.cpp b/src/lcals/DIFF_PREDICT-Cuda.cpp index 7d083fd82..df3b6b7f2 100644 --- a/src/lcals/DIFF_PREDICT-Cuda.cpp +++ b/src/lcals/DIFF_PREDICT-Cuda.cpp @@ -92,13 +92,13 @@ void DIFF_PREDICT::runCudaVariantImpl(VariantID vid) } } -void DIFF_PREDICT::runCudaVariant(VariantID vid, size_t tid) +void DIFF_PREDICT::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/lcals/DIFF_PREDICT-Hip.cpp b/src/lcals/DIFF_PREDICT-Hip.cpp index d61ef2251..ce14aa340 100644 --- a/src/lcals/DIFF_PREDICT-Hip.cpp +++ b/src/lcals/DIFF_PREDICT-Hip.cpp @@ -92,13 +92,13 @@ void DIFF_PREDICT::runHipVariantImpl(VariantID vid) } } -void DIFF_PREDICT::runHipVariant(VariantID vid, size_t tid) +void DIFF_PREDICT::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/lcals/DIFF_PREDICT-OMP.cpp b/src/lcals/DIFF_PREDICT-OMP.cpp index b416bc7a6..2175d7f61 100644 --- a/src/lcals/DIFF_PREDICT-OMP.cpp +++ b/src/lcals/DIFF_PREDICT-OMP.cpp @@ -18,7 +18,7 @@ namespace lcals { -void DIFF_PREDICT::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void DIFF_PREDICT::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/lcals/DIFF_PREDICT-OMPTarget.cpp b/src/lcals/DIFF_PREDICT-OMPTarget.cpp index e923202b5..809138bec 100644 --- a/src/lcals/DIFF_PREDICT-OMPTarget.cpp +++ b/src/lcals/DIFF_PREDICT-OMPTarget.cpp @@ -39,7 +39,7 @@ namespace lcals deallocOpenMPDeviceData(cx, did); -void DIFF_PREDICT::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void DIFF_PREDICT::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/DIFF_PREDICT-Seq.cpp b/src/lcals/DIFF_PREDICT-Seq.cpp index f96bb9095..959a79324 100644 --- a/src/lcals/DIFF_PREDICT-Seq.cpp +++ b/src/lcals/DIFF_PREDICT-Seq.cpp @@ -18,7 +18,7 @@ namespace lcals { -void DIFF_PREDICT::runSeqVariant(VariantID vid, size_t /*tid*/) +void DIFF_PREDICT::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/DIFF_PREDICT.cpp b/src/lcals/DIFF_PREDICT.cpp index 77f6f52b0..5d021409f 100644 --- a/src/lcals/DIFF_PREDICT.cpp +++ b/src/lcals/DIFF_PREDICT.cpp @@ -55,7 +55,7 @@ DIFF_PREDICT::~DIFF_PREDICT() { } -void DIFF_PREDICT::setUp(VariantID vid, size_t /*tid*/) +void DIFF_PREDICT::setUp(VariantID vid, size_t /*tune_idx*/) { m_array_length = getActualProblemSize() * 14; m_offset = getActualProblemSize(); @@ -64,12 +64,12 @@ void DIFF_PREDICT::setUp(VariantID vid, size_t /*tid*/) allocAndInitData(m_cx, m_array_length, vid); } -void DIFF_PREDICT::updateChecksum(VariantID vid, size_t tid) 
+void DIFF_PREDICT::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_px, m_array_length); + checksum[vid][tune_idx] += calcChecksum(m_px, m_array_length); } -void DIFF_PREDICT::tearDown(VariantID vid, size_t /*tid*/) +void DIFF_PREDICT::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_px); diff --git a/src/lcals/DIFF_PREDICT.hpp b/src/lcals/DIFF_PREDICT.hpp index dae6b97f6..f28054f40 100644 --- a/src/lcals/DIFF_PREDICT.hpp +++ b/src/lcals/DIFF_PREDICT.hpp @@ -84,15 +84,15 @@ class DIFF_PREDICT : public KernelBase ~DIFF_PREDICT(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/lcals/EOS-Cuda.cpp b/src/lcals/EOS-Cuda.cpp index 13601b06f..deefa8c07 100644 --- a/src/lcals/EOS-Cuda.cpp +++ b/src/lcals/EOS-Cuda.cpp @@ -96,13 +96,13 @@ void EOS::runCudaVariantImpl(VariantID vid) } } -void EOS::runCudaVariant(VariantID vid, size_t tid) +void EOS::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/lcals/EOS-Hip.cpp b/src/lcals/EOS-Hip.cpp index b502125ac..58e9bbd61 100644 --- a/src/lcals/EOS-Hip.cpp +++ b/src/lcals/EOS-Hip.cpp @@ -96,13 +96,13 @@ void EOS::runHipVariantImpl(VariantID vid) } } -void EOS::runHipVariant(VariantID vid, size_t tid) +void EOS::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/lcals/EOS-OMP.cpp b/src/lcals/EOS-OMP.cpp index 5f0f871ec..389fa68db 100644 --- a/src/lcals/EOS-OMP.cpp +++ b/src/lcals/EOS-OMP.cpp @@ -18,7 +18,7 @@ namespace lcals { -void EOS::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void EOS::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/lcals/EOS-OMPTarget.cpp b/src/lcals/EOS-OMPTarget.cpp index 290759924..be52900f7 100644 --- a/src/lcals/EOS-OMPTarget.cpp +++ b/src/lcals/EOS-OMPTarget.cpp @@ -43,7 +43,7 @@ namespace lcals deallocOpenMPDeviceData(u, did); -void EOS::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void EOS::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/EOS-Seq.cpp b/src/lcals/EOS-Seq.cpp index ef0a4d3d3..1bb362b70 
100644 --- a/src/lcals/EOS-Seq.cpp +++ b/src/lcals/EOS-Seq.cpp @@ -18,7 +18,7 @@ namespace lcals { -void EOS::runSeqVariant(VariantID vid, size_t /*tid*/) +void EOS::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/EOS.cpp b/src/lcals/EOS.cpp index 2515b0954..7e2aaad73 100644 --- a/src/lcals/EOS.cpp +++ b/src/lcals/EOS.cpp @@ -63,7 +63,7 @@ EOS::~EOS() { } -void EOS::setUp(VariantID vid, size_t /*tid*/) +void EOS::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitDataConst(m_x, m_array_length, 0.0, vid); allocAndInitData(m_y, m_array_length, vid); @@ -75,12 +75,12 @@ void EOS::setUp(VariantID vid, size_t /*tid*/) initData(m_t, vid); } -void EOS::updateChecksum(VariantID vid, size_t tid) +void EOS::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_x, getActualProblemSize(), checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_x, getActualProblemSize(), checksum_scale_factor ); } -void EOS::tearDown(VariantID vid, size_t /*tid*/) +void EOS::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_x); diff --git a/src/lcals/EOS.hpp b/src/lcals/EOS.hpp index 6e653c144..efe36163a 100644 --- a/src/lcals/EOS.hpp +++ b/src/lcals/EOS.hpp @@ -53,15 +53,15 @@ class EOS : public KernelBase ~EOS(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/lcals/FIRST_DIFF-Cuda.cpp b/src/lcals/FIRST_DIFF-Cuda.cpp index 058f0b65b..4f375c2f4 100644 --- a/src/lcals/FIRST_DIFF-Cuda.cpp +++ b/src/lcals/FIRST_DIFF-Cuda.cpp @@ -90,13 +90,13 @@ void FIRST_DIFF::runCudaVariantImpl(VariantID vid) } } -void FIRST_DIFF::runCudaVariant(VariantID vid, size_t tid) +void FIRST_DIFF::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/lcals/FIRST_DIFF-Hip.cpp b/src/lcals/FIRST_DIFF-Hip.cpp index 729e4bbd8..0f6161bbc 100644 --- a/src/lcals/FIRST_DIFF-Hip.cpp +++ b/src/lcals/FIRST_DIFF-Hip.cpp @@ -90,13 +90,13 @@ void FIRST_DIFF::runHipVariantImpl(VariantID vid) } } -void FIRST_DIFF::runHipVariant(VariantID vid, size_t tid) +void FIRST_DIFF::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git 
a/src/lcals/FIRST_DIFF-OMP.cpp b/src/lcals/FIRST_DIFF-OMP.cpp index 2aaa5912e..88ffe72e8 100644 --- a/src/lcals/FIRST_DIFF-OMP.cpp +++ b/src/lcals/FIRST_DIFF-OMP.cpp @@ -18,7 +18,7 @@ namespace lcals { -void FIRST_DIFF::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void FIRST_DIFF::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/lcals/FIRST_DIFF-OMPTarget.cpp b/src/lcals/FIRST_DIFF-OMPTarget.cpp index bc9bf44ab..a40e67cd1 100644 --- a/src/lcals/FIRST_DIFF-OMPTarget.cpp +++ b/src/lcals/FIRST_DIFF-OMPTarget.cpp @@ -39,7 +39,7 @@ namespace lcals deallocOpenMPDeviceData(y, did); -void FIRST_DIFF::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void FIRST_DIFF::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/FIRST_DIFF-Seq.cpp b/src/lcals/FIRST_DIFF-Seq.cpp index 4eafcec7c..d30fc35eb 100644 --- a/src/lcals/FIRST_DIFF-Seq.cpp +++ b/src/lcals/FIRST_DIFF-Seq.cpp @@ -18,7 +18,7 @@ namespace lcals { -void FIRST_DIFF::runSeqVariant(VariantID vid, size_t /*tid*/) +void FIRST_DIFF::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/FIRST_DIFF.cpp b/src/lcals/FIRST_DIFF.cpp index ce7e41419..340f2cba9 100644 --- a/src/lcals/FIRST_DIFF.cpp +++ b/src/lcals/FIRST_DIFF.cpp @@ -59,18 +59,18 @@ FIRST_DIFF::~FIRST_DIFF() { } -void FIRST_DIFF::setUp(VariantID vid, size_t /*tid*/) +void FIRST_DIFF::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitDataConst(m_x, m_N, 0.0, vid); allocAndInitData(m_y, m_N, vid); } -void FIRST_DIFF::updateChecksum(VariantID vid, size_t tid) +void FIRST_DIFF::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_x, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_x, getActualProblemSize()); } -void FIRST_DIFF::tearDown(VariantID vid, size_t /*tid*/) +void FIRST_DIFF::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_x); diff --git a/src/lcals/FIRST_DIFF.hpp b/src/lcals/FIRST_DIFF.hpp index 8c9241758..9f31800d7 100644 --- a/src/lcals/FIRST_DIFF.hpp +++ b/src/lcals/FIRST_DIFF.hpp @@ -43,15 +43,15 @@ class FIRST_DIFF : public KernelBase ~FIRST_DIFF(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/lcals/FIRST_MIN-Cuda.cpp b/src/lcals/FIRST_MIN-Cuda.cpp index 8a348e25f..1dc35a255 100644 --- a/src/lcals/FIRST_MIN-Cuda.cpp +++ b/src/lcals/FIRST_MIN-Cuda.cpp @@ -130,13 +130,13 @@ void FIRST_MIN::runCudaVariantImpl(VariantID vid) } } -void 
FIRST_MIN::runCudaVariant(VariantID vid, size_t tid) +void FIRST_MIN::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/lcals/FIRST_MIN-Hip.cpp b/src/lcals/FIRST_MIN-Hip.cpp index b154907f0..1c09f9cb0 100644 --- a/src/lcals/FIRST_MIN-Hip.cpp +++ b/src/lcals/FIRST_MIN-Hip.cpp @@ -130,13 +130,13 @@ void FIRST_MIN::runHipVariantImpl(VariantID vid) } } -void FIRST_MIN::runHipVariant(VariantID vid, size_t tid) +void FIRST_MIN::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/lcals/FIRST_MIN-OMP.cpp b/src/lcals/FIRST_MIN-OMP.cpp index 7c697cd4a..7940dd4d2 100644 --- a/src/lcals/FIRST_MIN-OMP.cpp +++ b/src/lcals/FIRST_MIN-OMP.cpp @@ -19,7 +19,7 @@ namespace lcals FIRST_MIN_MINLOC_COMPARE; -void FIRST_MIN::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void FIRST_MIN::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/lcals/FIRST_MIN-OMPTarget.cpp b/src/lcals/FIRST_MIN-OMPTarget.cpp index 6f9334bcc..4d9c6e1c2 100644 --- a/src/lcals/FIRST_MIN-OMPTarget.cpp +++ b/src/lcals/FIRST_MIN-OMPTarget.cpp @@ -37,7 +37,7 @@ namespace lcals FIRST_MIN_MINLOC_COMPARE; -void FIRST_MIN::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void FIRST_MIN::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/FIRST_MIN-Seq.cpp b/src/lcals/FIRST_MIN-Seq.cpp index 2edaa4167..8bb0e7fb6 100644 --- a/src/lcals/FIRST_MIN-Seq.cpp +++ b/src/lcals/FIRST_MIN-Seq.cpp @@ -18,7 +18,7 @@ namespace lcals { -void FIRST_MIN::runSeqVariant(VariantID vid, size_t /*tid*/) +void FIRST_MIN::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/FIRST_MIN.cpp b/src/lcals/FIRST_MIN.cpp index c05010a9b..c5c20eb4d 100644 --- a/src/lcals/FIRST_MIN.cpp +++ b/src/lcals/FIRST_MIN.cpp @@ -63,7 +63,7 @@ FIRST_MIN::~FIRST_MIN() { } -void FIRST_MIN::setUp(VariantID vid, size_t /*tid*/) +void FIRST_MIN::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitDataConst(m_x, m_N, 0.0, vid); m_x[ m_N / 2 ] = -1.0e+10; @@ -72,12 +72,12 @@ void FIRST_MIN::setUp(VariantID vid, size_t /*tid*/) m_minloc = -1; } -void FIRST_MIN::updateChecksum(VariantID vid, size_t tid) +void FIRST_MIN::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += static_cast(m_minloc); + checksum[vid][tune_idx] += static_cast(m_minloc); } -void FIRST_MIN::tearDown(VariantID vid, size_t /*tid*/) +void FIRST_MIN::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_x); diff --git a/src/lcals/FIRST_MIN.hpp b/src/lcals/FIRST_MIN.hpp index 6ecaed67f..9cd7dd25f 100644 --- a/src/lcals/FIRST_MIN.hpp +++ b/src/lcals/FIRST_MIN.hpp @@ -72,15 +72,15 @@ class FIRST_MIN : public KernelBase ~FIRST_MIN(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void 
runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/lcals/FIRST_SUM-Cuda.cpp b/src/lcals/FIRST_SUM-Cuda.cpp index 63ef9ca03..5b968c221 100644 --- a/src/lcals/FIRST_SUM-Cuda.cpp +++ b/src/lcals/FIRST_SUM-Cuda.cpp @@ -90,13 +90,13 @@ void FIRST_SUM::runCudaVariantImpl(VariantID vid) } } -void FIRST_SUM::runCudaVariant(VariantID vid, size_t tid) +void FIRST_SUM::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/lcals/FIRST_SUM-Hip.cpp b/src/lcals/FIRST_SUM-Hip.cpp index f50024c49..01c3eeabf 100644 --- a/src/lcals/FIRST_SUM-Hip.cpp +++ b/src/lcals/FIRST_SUM-Hip.cpp @@ -90,13 +90,13 @@ void FIRST_SUM::runHipVariantImpl(VariantID vid) } } -void FIRST_SUM::runHipVariant(VariantID vid, size_t tid) +void FIRST_SUM::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/lcals/FIRST_SUM-OMP.cpp b/src/lcals/FIRST_SUM-OMP.cpp index 5042f594b..a6810b4fb 100644 --- a/src/lcals/FIRST_SUM-OMP.cpp +++ b/src/lcals/FIRST_SUM-OMP.cpp @@ -18,7 +18,7 @@ namespace lcals { -void FIRST_SUM::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void FIRST_SUM::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/lcals/FIRST_SUM-OMPTarget.cpp b/src/lcals/FIRST_SUM-OMPTarget.cpp index a1709052e..5905847e6 100644 --- a/src/lcals/FIRST_SUM-OMPTarget.cpp +++ b/src/lcals/FIRST_SUM-OMPTarget.cpp @@ -39,7 +39,7 @@ namespace lcals deallocOpenMPDeviceData(y, did); -void FIRST_SUM::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void FIRST_SUM::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; diff --git a/src/lcals/FIRST_SUM-Seq.cpp b/src/lcals/FIRST_SUM-Seq.cpp index a9035c3ac..8d40d16c8 100644 --- a/src/lcals/FIRST_SUM-Seq.cpp +++ b/src/lcals/FIRST_SUM-Seq.cpp @@ -18,7 +18,7 @@ namespace lcals { -void FIRST_SUM::runSeqVariant(VariantID vid, size_t /*tid*/) +void FIRST_SUM::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; diff --git a/src/lcals/FIRST_SUM.cpp b/src/lcals/FIRST_SUM.cpp index 16f7120cd..06d4e125c 100644 --- a/src/lcals/FIRST_SUM.cpp +++ b/src/lcals/FIRST_SUM.cpp @@ -58,18 +58,18 @@ FIRST_SUM::~FIRST_SUM() { } -void FIRST_SUM::setUp(VariantID vid, size_t /*tid*/) +void FIRST_SUM::setUp(VariantID vid, size_t 
/*tune_idx*/) { allocAndInitDataConst(m_x, m_N, 0.0, vid); allocAndInitData(m_y, m_N, vid); } -void FIRST_SUM::updateChecksum(VariantID vid, size_t tid) +void FIRST_SUM::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_x, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_x, getActualProblemSize()); } -void FIRST_SUM::tearDown(VariantID vid, size_t /*tid*/) +void FIRST_SUM::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_x); diff --git a/src/lcals/FIRST_SUM.hpp b/src/lcals/FIRST_SUM.hpp index 14221c261..4889148c7 100644 --- a/src/lcals/FIRST_SUM.hpp +++ b/src/lcals/FIRST_SUM.hpp @@ -46,15 +46,15 @@ class FIRST_SUM : public KernelBase ~FIRST_SUM(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/lcals/GEN_LIN_RECUR-Cuda.cpp b/src/lcals/GEN_LIN_RECUR-Cuda.cpp index cce4bd785..9bec47e46 100644 --- a/src/lcals/GEN_LIN_RECUR-Cuda.cpp +++ b/src/lcals/GEN_LIN_RECUR-Cuda.cpp @@ -119,13 +119,13 @@ void GEN_LIN_RECUR::runCudaVariantImpl(VariantID vid) } } -void GEN_LIN_RECUR::runCudaVariant(VariantID vid, size_t tid) +void GEN_LIN_RECUR::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/lcals/GEN_LIN_RECUR-Hip.cpp b/src/lcals/GEN_LIN_RECUR-Hip.cpp index 4dd90c346..3faaec08f 100644 --- a/src/lcals/GEN_LIN_RECUR-Hip.cpp +++ b/src/lcals/GEN_LIN_RECUR-Hip.cpp @@ -121,13 +121,13 @@ void GEN_LIN_RECUR::runHipVariantImpl(VariantID vid) } } -void GEN_LIN_RECUR::runHipVariant(VariantID vid, size_t tid) +void GEN_LIN_RECUR::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/lcals/GEN_LIN_RECUR-OMP.cpp b/src/lcals/GEN_LIN_RECUR-OMP.cpp index 10291c203..3ca9b7c09 100644 --- a/src/lcals/GEN_LIN_RECUR-OMP.cpp +++ b/src/lcals/GEN_LIN_RECUR-OMP.cpp @@ -18,7 +18,7 @@ namespace lcals { -void GEN_LIN_RECUR::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void GEN_LIN_RECUR::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/lcals/GEN_LIN_RECUR-OMPTarget.cpp b/src/lcals/GEN_LIN_RECUR-OMPTarget.cpp index 08a38d272..e4653e190 100644 --- a/src/lcals/GEN_LIN_RECUR-OMPTarget.cpp +++ 
b/src/lcals/GEN_LIN_RECUR-OMPTarget.cpp @@ -43,7 +43,7 @@ namespace lcals deallocOpenMPDeviceData(sb, did); -void GEN_LIN_RECUR::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void GEN_LIN_RECUR::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); diff --git a/src/lcals/GEN_LIN_RECUR-Seq.cpp b/src/lcals/GEN_LIN_RECUR-Seq.cpp index ba2976f64..a748932f4 100644 --- a/src/lcals/GEN_LIN_RECUR-Seq.cpp +++ b/src/lcals/GEN_LIN_RECUR-Seq.cpp @@ -18,7 +18,7 @@ namespace lcals { -void GEN_LIN_RECUR::runSeqVariant(VariantID vid, size_t /*tid*/) +void GEN_LIN_RECUR::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); diff --git a/src/lcals/GEN_LIN_RECUR.cpp b/src/lcals/GEN_LIN_RECUR.cpp index ca44a6760..3fcb9cb39 100644 --- a/src/lcals/GEN_LIN_RECUR.cpp +++ b/src/lcals/GEN_LIN_RECUR.cpp @@ -63,7 +63,7 @@ GEN_LIN_RECUR::~GEN_LIN_RECUR() { } -void GEN_LIN_RECUR::setUp(VariantID vid, size_t /*tid*/) +void GEN_LIN_RECUR::setUp(VariantID vid, size_t /*tune_idx*/) { m_kb5i = 0; @@ -73,12 +73,12 @@ void GEN_LIN_RECUR::setUp(VariantID vid, size_t /*tid*/) allocAndInitData(m_sb, m_N, vid); } -void GEN_LIN_RECUR::updateChecksum(VariantID vid, size_t tid) +void GEN_LIN_RECUR::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_b5, getActualProblemSize(), checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_b5, getActualProblemSize(), checksum_scale_factor ); } -void GEN_LIN_RECUR::tearDown(VariantID vid, size_t /*tid*/) +void GEN_LIN_RECUR::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_b5); diff --git a/src/lcals/GEN_LIN_RECUR.hpp b/src/lcals/GEN_LIN_RECUR.hpp index 7a4ece4a2..8ed1d3073 100644 --- a/src/lcals/GEN_LIN_RECUR.hpp +++ b/src/lcals/GEN_LIN_RECUR.hpp @@ -67,15 +67,15 @@ class GEN_LIN_RECUR : public KernelBase ~GEN_LIN_RECUR(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/lcals/HYDRO_1D-Cuda.cpp b/src/lcals/HYDRO_1D-Cuda.cpp index eef5936f7..c1a85361d 100644 --- a/src/lcals/HYDRO_1D-Cuda.cpp +++ b/src/lcals/HYDRO_1D-Cuda.cpp @@ -94,13 +94,13 @@ void HYDRO_1D::runCudaVariantImpl(VariantID vid) } } -void HYDRO_1D::runCudaVariant(VariantID vid, size_t tid) +void HYDRO_1D::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/lcals/HYDRO_1D-Hip.cpp b/src/lcals/HYDRO_1D-Hip.cpp index 2554ca41d..391c49a93 100644 
--- a/src/lcals/HYDRO_1D-Hip.cpp +++ b/src/lcals/HYDRO_1D-Hip.cpp @@ -94,13 +94,13 @@ void HYDRO_1D::runHipVariantImpl(VariantID vid) } } -void HYDRO_1D::runHipVariant(VariantID vid, size_t tid) +void HYDRO_1D::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/lcals/HYDRO_1D-OMP.cpp b/src/lcals/HYDRO_1D-OMP.cpp index 52dda984f..166362713 100644 --- a/src/lcals/HYDRO_1D-OMP.cpp +++ b/src/lcals/HYDRO_1D-OMP.cpp @@ -18,7 +18,7 @@ namespace lcals { -void HYDRO_1D::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void HYDRO_1D::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/lcals/HYDRO_1D-OMPTarget.cpp b/src/lcals/HYDRO_1D-OMPTarget.cpp index 343512cf0..98742124b 100644 --- a/src/lcals/HYDRO_1D-OMPTarget.cpp +++ b/src/lcals/HYDRO_1D-OMPTarget.cpp @@ -41,7 +41,7 @@ namespace lcals deallocOpenMPDeviceData(z, did); \ -void HYDRO_1D::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void HYDRO_1D::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/HYDRO_1D-Seq.cpp b/src/lcals/HYDRO_1D-Seq.cpp index a3f8d279a..59a48e1db 100644 --- a/src/lcals/HYDRO_1D-Seq.cpp +++ b/src/lcals/HYDRO_1D-Seq.cpp @@ -18,7 +18,7 @@ namespace lcals { -void HYDRO_1D::runSeqVariant(VariantID vid, size_t /*tid*/) +void HYDRO_1D::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/HYDRO_1D.cpp b/src/lcals/HYDRO_1D.cpp index 1204b1e64..88c650de7 100644 --- a/src/lcals/HYDRO_1D.cpp +++ b/src/lcals/HYDRO_1D.cpp @@ -62,7 +62,7 @@ HYDRO_1D::~HYDRO_1D() { } -void HYDRO_1D::setUp(VariantID vid, size_t /*tid*/) +void HYDRO_1D::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitDataConst(m_x, m_array_length, 0.0, vid); allocAndInitData(m_y, m_array_length, vid); @@ -73,12 +73,12 @@ void HYDRO_1D::setUp(VariantID vid, size_t /*tid*/) initData(m_t, vid); } -void HYDRO_1D::updateChecksum(VariantID vid, size_t tid) +void HYDRO_1D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_x, getActualProblemSize(), checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_x, getActualProblemSize(), checksum_scale_factor ); } -void HYDRO_1D::tearDown(VariantID vid, size_t /*tid*/) +void HYDRO_1D::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_x); diff --git a/src/lcals/HYDRO_1D.hpp b/src/lcals/HYDRO_1D.hpp index 5c7be8e7d..db5d8f8cd 100644 --- a/src/lcals/HYDRO_1D.hpp +++ b/src/lcals/HYDRO_1D.hpp @@ -48,15 +48,15 @@ class HYDRO_1D : public KernelBase ~HYDRO_1D(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t 
tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/lcals/HYDRO_2D-Cuda.cpp b/src/lcals/HYDRO_2D-Cuda.cpp index 8d68ea7de..30acca046 100644 --- a/src/lcals/HYDRO_2D-Cuda.cpp +++ b/src/lcals/HYDRO_2D-Cuda.cpp @@ -221,13 +221,13 @@ void HYDRO_2D::runCudaVariantImpl(VariantID vid) } } -void HYDRO_2D::runCudaVariant(VariantID vid, size_t tid) +void HYDRO_2D::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/lcals/HYDRO_2D-Hip.cpp b/src/lcals/HYDRO_2D-Hip.cpp index 4b2a1a5d9..1e86920dc 100644 --- a/src/lcals/HYDRO_2D-Hip.cpp +++ b/src/lcals/HYDRO_2D-Hip.cpp @@ -223,13 +223,13 @@ void HYDRO_2D::runHipVariantImpl(VariantID vid) } } -void HYDRO_2D::runHipVariant(VariantID vid, size_t tid) +void HYDRO_2D::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/lcals/HYDRO_2D-OMP.cpp b/src/lcals/HYDRO_2D-OMP.cpp index 8810ecf6b..1557d0015 100644 --- a/src/lcals/HYDRO_2D-OMP.cpp +++ b/src/lcals/HYDRO_2D-OMP.cpp @@ -18,7 +18,7 @@ namespace lcals { -void HYDRO_2D::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void HYDRO_2D::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/lcals/HYDRO_2D-OMPTarget.cpp b/src/lcals/HYDRO_2D-OMPTarget.cpp index d34003f24..a0e46a55d 100644 --- a/src/lcals/HYDRO_2D-OMPTarget.cpp +++ b/src/lcals/HYDRO_2D-OMPTarget.cpp @@ -54,7 +54,7 @@ namespace lcals -void HYDRO_2D::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void HYDRO_2D::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type kbeg = 1; diff --git a/src/lcals/HYDRO_2D-Seq.cpp b/src/lcals/HYDRO_2D-Seq.cpp index a005d7fb2..5f669eea3 100644 --- a/src/lcals/HYDRO_2D-Seq.cpp +++ b/src/lcals/HYDRO_2D-Seq.cpp @@ -18,7 +18,7 @@ namespace lcals { -void HYDRO_2D::runSeqVariant(VariantID vid, size_t /*tid*/) +void HYDRO_2D::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type kbeg = 1; diff --git a/src/lcals/HYDRO_2D.cpp b/src/lcals/HYDRO_2D.cpp index 1298016b4..46c98729c 100644 --- a/src/lcals/HYDRO_2D.cpp +++ b/src/lcals/HYDRO_2D.cpp @@ -77,7 +77,7 @@ HYDRO_2D::~HYDRO_2D() { } -void HYDRO_2D::setUp(VariantID vid, size_t /*tid*/) +void HYDRO_2D::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitDataConst(m_zrout, m_array_length, 0.0, vid); allocAndInitDataConst(m_zzout, m_array_length, 0.0, vid); @@ -92,13 +92,13 @@ void HYDRO_2D::setUp(VariantID vid, size_t /*tid*/) allocAndInitData(m_zz, m_array_length, vid); } -void HYDRO_2D::updateChecksum(VariantID vid, size_t tid) +void HYDRO_2D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_zzout, m_array_length, checksum_scale_factor ); - checksum[vid][tid] 
+= calcChecksum(m_zrout, m_array_length, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_zzout, m_array_length, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_zrout, m_array_length, checksum_scale_factor ); } -void HYDRO_2D::tearDown(VariantID vid, size_t /*tid*/) +void HYDRO_2D::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_zrout); diff --git a/src/lcals/HYDRO_2D.hpp b/src/lcals/HYDRO_2D.hpp index f7544fd67..46892698c 100644 --- a/src/lcals/HYDRO_2D.hpp +++ b/src/lcals/HYDRO_2D.hpp @@ -144,15 +144,15 @@ class HYDRO_2D : public KernelBase ~HYDRO_2D(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/lcals/INT_PREDICT-Cuda.cpp b/src/lcals/INT_PREDICT-Cuda.cpp index 528596c19..7e6e76158 100644 --- a/src/lcals/INT_PREDICT-Cuda.cpp +++ b/src/lcals/INT_PREDICT-Cuda.cpp @@ -95,13 +95,13 @@ void INT_PREDICT::runCudaVariantImpl(VariantID vid) } } -void INT_PREDICT::runCudaVariant(VariantID vid, size_t tid) +void INT_PREDICT::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/lcals/INT_PREDICT-Hip.cpp b/src/lcals/INT_PREDICT-Hip.cpp index 2b69d0b58..af5a7f3f6 100644 --- a/src/lcals/INT_PREDICT-Hip.cpp +++ b/src/lcals/INT_PREDICT-Hip.cpp @@ -95,13 +95,13 @@ void INT_PREDICT::runHipVariantImpl(VariantID vid) } } -void INT_PREDICT::runHipVariant(VariantID vid, size_t tid) +void INT_PREDICT::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/lcals/INT_PREDICT-OMP.cpp b/src/lcals/INT_PREDICT-OMP.cpp index 10207ad90..cc06147ff 100644 --- a/src/lcals/INT_PREDICT-OMP.cpp +++ b/src/lcals/INT_PREDICT-OMP.cpp @@ -18,7 +18,7 @@ namespace lcals { -void INT_PREDICT::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void INT_PREDICT::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/lcals/INT_PREDICT-OMPTarget.cpp b/src/lcals/INT_PREDICT-OMPTarget.cpp index dfa525aa4..95e91d772 100644 --- a/src/lcals/INT_PREDICT-OMPTarget.cpp +++ b/src/lcals/INT_PREDICT-OMPTarget.cpp @@ -37,7 +37,7 @@ namespace lcals deallocOpenMPDeviceData(px, did); -void INT_PREDICT::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) 
+void INT_PREDICT::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/INT_PREDICT-Seq.cpp b/src/lcals/INT_PREDICT-Seq.cpp index 6ff804fed..e5b648fd6 100644 --- a/src/lcals/INT_PREDICT-Seq.cpp +++ b/src/lcals/INT_PREDICT-Seq.cpp @@ -18,7 +18,7 @@ namespace lcals { -void INT_PREDICT::runSeqVariant(VariantID vid, size_t /*tid*/) +void INT_PREDICT::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/INT_PREDICT.cpp b/src/lcals/INT_PREDICT.cpp index 344c9a085..d15013b49 100644 --- a/src/lcals/INT_PREDICT.cpp +++ b/src/lcals/INT_PREDICT.cpp @@ -55,7 +55,7 @@ INT_PREDICT::~INT_PREDICT() { } -void INT_PREDICT::setUp(VariantID vid, size_t /*tid*/) +void INT_PREDICT::setUp(VariantID vid, size_t /*tune_idx*/) { m_array_length = getActualProblemSize() * 13; m_offset = getActualProblemSize(); @@ -73,16 +73,16 @@ void INT_PREDICT::setUp(VariantID vid, size_t /*tid*/) initData(m_c0); } -void INT_PREDICT::updateChecksum(VariantID vid, size_t tid) +void INT_PREDICT::updateChecksum(VariantID vid, size_t tune_idx) { for (Index_type i = 0; i < getActualProblemSize(); ++i) { m_px[i] -= m_px_initval; } - checksum[vid][tid] += calcChecksum(m_px, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_px, getActualProblemSize()); } -void INT_PREDICT::tearDown(VariantID vid, size_t /*tid*/) +void INT_PREDICT::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_px); diff --git a/src/lcals/INT_PREDICT.hpp b/src/lcals/INT_PREDICT.hpp index e5f0c7240..0adfaac47 100644 --- a/src/lcals/INT_PREDICT.hpp +++ b/src/lcals/INT_PREDICT.hpp @@ -63,15 +63,15 @@ class INT_PREDICT : public KernelBase ~INT_PREDICT(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/lcals/PLANCKIAN-Cuda.cpp b/src/lcals/PLANCKIAN-Cuda.cpp index d24655dc5..574e37d1a 100644 --- a/src/lcals/PLANCKIAN-Cuda.cpp +++ b/src/lcals/PLANCKIAN-Cuda.cpp @@ -99,13 +99,13 @@ void PLANCKIAN::runCudaVariantImpl(VariantID vid) } } -void PLANCKIAN::runCudaVariant(VariantID vid, size_t tid) +void PLANCKIAN::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/lcals/PLANCKIAN-Hip.cpp b/src/lcals/PLANCKIAN-Hip.cpp index 55fec1378..cb784480e 100644 --- a/src/lcals/PLANCKIAN-Hip.cpp +++ b/src/lcals/PLANCKIAN-Hip.cpp @@ -99,13 
+99,13 @@ void PLANCKIAN::runHipVariantImpl(VariantID vid) } } -void PLANCKIAN::runHipVariant(VariantID vid, size_t tid) +void PLANCKIAN::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/lcals/PLANCKIAN-OMP.cpp b/src/lcals/PLANCKIAN-OMP.cpp index 0144f8bd6..98ca0b135 100644 --- a/src/lcals/PLANCKIAN-OMP.cpp +++ b/src/lcals/PLANCKIAN-OMP.cpp @@ -19,7 +19,7 @@ namespace lcals { -void PLANCKIAN::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void PLANCKIAN::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/lcals/PLANCKIAN-OMPTarget.cpp b/src/lcals/PLANCKIAN-OMPTarget.cpp index 5b8f6af03..cafbbc708 100644 --- a/src/lcals/PLANCKIAN-OMPTarget.cpp +++ b/src/lcals/PLANCKIAN-OMPTarget.cpp @@ -46,7 +46,7 @@ namespace lcals deallocOpenMPDeviceData(w, did); -void PLANCKIAN::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void PLANCKIAN::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/PLANCKIAN-Seq.cpp b/src/lcals/PLANCKIAN-Seq.cpp index 8fd55105b..61db64ef6 100644 --- a/src/lcals/PLANCKIAN-Seq.cpp +++ b/src/lcals/PLANCKIAN-Seq.cpp @@ -19,7 +19,7 @@ namespace lcals { -void PLANCKIAN::runSeqVariant(VariantID vid, size_t /*tid*/) +void PLANCKIAN::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/PLANCKIAN.cpp b/src/lcals/PLANCKIAN.cpp index efff9d6f8..439eeaa52 100644 --- a/src/lcals/PLANCKIAN.cpp +++ b/src/lcals/PLANCKIAN.cpp @@ -55,7 +55,7 @@ PLANCKIAN::~PLANCKIAN() { } -void PLANCKIAN::setUp(VariantID vid, size_t /*tid*/) +void PLANCKIAN::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitData(m_x, getActualProblemSize(), vid); allocAndInitData(m_y, getActualProblemSize(), vid); @@ -64,12 +64,12 @@ void PLANCKIAN::setUp(VariantID vid, size_t /*tid*/) allocAndInitDataConst(m_w, getActualProblemSize(), 0.0, vid); } -void PLANCKIAN::updateChecksum(VariantID vid, size_t tid) +void PLANCKIAN::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_w, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_w, getActualProblemSize()); } -void PLANCKIAN::tearDown(VariantID vid, size_t /*tid*/) +void PLANCKIAN::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_x); diff --git a/src/lcals/PLANCKIAN.hpp b/src/lcals/PLANCKIAN.hpp index 02e9d5f26..632d11b86 100644 --- a/src/lcals/PLANCKIAN.hpp +++ b/src/lcals/PLANCKIAN.hpp @@ -48,15 +48,15 @@ class PLANCKIAN : public KernelBase ~PLANCKIAN(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID 
vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/lcals/TRIDIAG_ELIM-Cuda.cpp b/src/lcals/TRIDIAG_ELIM-Cuda.cpp index 55e06a2f3..3d72d7d9a 100644 --- a/src/lcals/TRIDIAG_ELIM-Cuda.cpp +++ b/src/lcals/TRIDIAG_ELIM-Cuda.cpp @@ -95,13 +95,13 @@ void TRIDIAG_ELIM::runCudaVariantImpl(VariantID vid) } } -void TRIDIAG_ELIM::runCudaVariant(VariantID vid, size_t tid) +void TRIDIAG_ELIM::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/lcals/TRIDIAG_ELIM-Hip.cpp b/src/lcals/TRIDIAG_ELIM-Hip.cpp index 9b21aa1b6..867c1fefa 100644 --- a/src/lcals/TRIDIAG_ELIM-Hip.cpp +++ b/src/lcals/TRIDIAG_ELIM-Hip.cpp @@ -94,13 +94,13 @@ void TRIDIAG_ELIM::runHipVariantImpl(VariantID vid) } } -void TRIDIAG_ELIM::runHipVariant(VariantID vid, size_t tid) +void TRIDIAG_ELIM::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/lcals/TRIDIAG_ELIM-OMP.cpp b/src/lcals/TRIDIAG_ELIM-OMP.cpp index 9c9e09aaf..0d2671ddc 100644 --- a/src/lcals/TRIDIAG_ELIM-OMP.cpp +++ b/src/lcals/TRIDIAG_ELIM-OMP.cpp @@ -18,7 +18,7 @@ namespace lcals { -void TRIDIAG_ELIM::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void TRIDIAG_ELIM::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp b/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp index 73f90d50b..0d4190b26 100644 --- a/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp +++ b/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp @@ -43,7 +43,7 @@ namespace lcals deallocOpenMPDeviceData(z, did); -void TRIDIAG_ELIM::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void TRIDIAG_ELIM::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; diff --git a/src/lcals/TRIDIAG_ELIM-Seq.cpp b/src/lcals/TRIDIAG_ELIM-Seq.cpp index 206e6dbcf..dc32f6356 100644 --- a/src/lcals/TRIDIAG_ELIM-Seq.cpp +++ b/src/lcals/TRIDIAG_ELIM-Seq.cpp @@ -18,7 +18,7 @@ namespace lcals { -void TRIDIAG_ELIM::runSeqVariant(VariantID vid, size_t /*tid*/) +void TRIDIAG_ELIM::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; diff --git a/src/lcals/TRIDIAG_ELIM.cpp b/src/lcals/TRIDIAG_ELIM.cpp index a3085d6fe..8288351d1 100644 --- a/src/lcals/TRIDIAG_ELIM.cpp +++ b/src/lcals/TRIDIAG_ELIM.cpp @@ -57,7 +57,7 @@ TRIDIAG_ELIM::~TRIDIAG_ELIM() { } -void TRIDIAG_ELIM::setUp(VariantID vid, size_t /*tid*/) +void TRIDIAG_ELIM::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitDataConst(m_xout, m_N, 0.0, vid); allocAndInitData(m_xin, m_N, vid); @@ -65,12 +65,12 @@ void TRIDIAG_ELIM::setUp(VariantID vid, size_t /*tid*/) allocAndInitData(m_z, m_N, vid); } -void TRIDIAG_ELIM::updateChecksum(VariantID vid, size_t tid) +void TRIDIAG_ELIM::updateChecksum(VariantID vid, size_t 
tune_idx) { - checksum[vid][tid] += calcChecksum(m_xout, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_xout, getActualProblemSize()); } -void TRIDIAG_ELIM::tearDown(VariantID vid, size_t /*tid*/) +void TRIDIAG_ELIM::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_xout); diff --git a/src/lcals/TRIDIAG_ELIM.hpp b/src/lcals/TRIDIAG_ELIM.hpp index 92c455560..fe44df691 100644 --- a/src/lcals/TRIDIAG_ELIM.hpp +++ b/src/lcals/TRIDIAG_ELIM.hpp @@ -48,15 +48,15 @@ class TRIDIAG_ELIM : public KernelBase ~TRIDIAG_ELIM(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/polybench/POLYBENCH_2MM-Cuda.cpp b/src/polybench/POLYBENCH_2MM-Cuda.cpp index a04d7ca98..ae33e8c12 100644 --- a/src/polybench/POLYBENCH_2MM-Cuda.cpp +++ b/src/polybench/POLYBENCH_2MM-Cuda.cpp @@ -275,13 +275,13 @@ void POLYBENCH_2MM::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_2MM::runCudaVariant(VariantID vid, size_t tid) +void POLYBENCH_2MM::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/polybench/POLYBENCH_2MM-Hip.cpp b/src/polybench/POLYBENCH_2MM-Hip.cpp index 8bbcb6492..2bf6f1912 100644 --- a/src/polybench/POLYBENCH_2MM-Hip.cpp +++ b/src/polybench/POLYBENCH_2MM-Hip.cpp @@ -278,13 +278,13 @@ void POLYBENCH_2MM::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_2MM::runHipVariant(VariantID vid, size_t tid) +void POLYBENCH_2MM::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/polybench/POLYBENCH_2MM-OMP.cpp b/src/polybench/POLYBENCH_2MM-OMP.cpp index 78f240c64..2d4066a8a 100644 --- a/src/polybench/POLYBENCH_2MM-OMP.cpp +++ b/src/polybench/POLYBENCH_2MM-OMP.cpp @@ -26,7 +26,7 @@ namespace polybench { -void POLYBENCH_2MM::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_2MM::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_2MM-OMPTarget.cpp b/src/polybench/POLYBENCH_2MM-OMPTarget.cpp index 9cdf96e61..e932fb737 100644 --- a/src/polybench/POLYBENCH_2MM-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_2MM-OMPTarget.cpp @@ -41,7 +41,7 @@ namespace polybench deallocOpenMPDeviceData(D, did); -void 
POLYBENCH_2MM::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_2MM::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_2MM-Seq.cpp b/src/polybench/POLYBENCH_2MM-Seq.cpp index 5b33fc3ae..90e167b82 100644 --- a/src/polybench/POLYBENCH_2MM-Seq.cpp +++ b/src/polybench/POLYBENCH_2MM-Seq.cpp @@ -18,7 +18,7 @@ namespace rajaperf namespace polybench { -void POLYBENCH_2MM::runSeqVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_2MM::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps= getRunReps(); diff --git a/src/polybench/POLYBENCH_2MM.cpp b/src/polybench/POLYBENCH_2MM.cpp index 2094660ec..3d794dce7 100644 --- a/src/polybench/POLYBENCH_2MM.cpp +++ b/src/polybench/POLYBENCH_2MM.cpp @@ -84,7 +84,7 @@ POLYBENCH_2MM::~POLYBENCH_2MM() { } -void POLYBENCH_2MM::setUp(VariantID vid, size_t /*tid*/) +void POLYBENCH_2MM::setUp(VariantID vid, size_t /*tune_idx*/) { (void) vid; allocAndInitData(m_tmp, m_ni * m_nj, vid); @@ -94,12 +94,12 @@ void POLYBENCH_2MM::setUp(VariantID vid, size_t /*tid*/) allocAndInitDataConst(m_D, m_ni * m_nl, 0.0, vid); } -void POLYBENCH_2MM::updateChecksum(VariantID vid, size_t tid) +void POLYBENCH_2MM::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_D, m_ni * m_nl, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_D, m_ni * m_nl, checksum_scale_factor ); } -void POLYBENCH_2MM::tearDown(VariantID vid, size_t /*tid*/) +void POLYBENCH_2MM::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_tmp); diff --git a/src/polybench/POLYBENCH_2MM.hpp b/src/polybench/POLYBENCH_2MM.hpp index bd08c642d..27993c3f7 100644 --- a/src/polybench/POLYBENCH_2MM.hpp +++ b/src/polybench/POLYBENCH_2MM.hpp @@ -118,15 +118,15 @@ class POLYBENCH_2MM : public KernelBase ~POLYBENCH_2MM(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/polybench/POLYBENCH_3MM-Cuda.cpp b/src/polybench/POLYBENCH_3MM-Cuda.cpp index 6847c2952..e81105c91 100644 --- a/src/polybench/POLYBENCH_3MM-Cuda.cpp +++ b/src/polybench/POLYBENCH_3MM-Cuda.cpp @@ -353,13 +353,13 @@ void POLYBENCH_3MM::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_3MM::runCudaVariant(VariantID vid, size_t tid) +void POLYBENCH_3MM::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/polybench/POLYBENCH_3MM-Hip.cpp 
b/src/polybench/POLYBENCH_3MM-Hip.cpp index 66501ce5c..9318fa363 100644 --- a/src/polybench/POLYBENCH_3MM-Hip.cpp +++ b/src/polybench/POLYBENCH_3MM-Hip.cpp @@ -358,13 +358,13 @@ void POLYBENCH_3MM::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_3MM::runHipVariant(VariantID vid, size_t tid) +void POLYBENCH_3MM::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/polybench/POLYBENCH_3MM-OMP.cpp b/src/polybench/POLYBENCH_3MM-OMP.cpp index 64a20c3e1..edc21e2cd 100644 --- a/src/polybench/POLYBENCH_3MM-OMP.cpp +++ b/src/polybench/POLYBENCH_3MM-OMP.cpp @@ -27,7 +27,7 @@ namespace polybench { -void POLYBENCH_3MM::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_3MM::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_3MM-OMPTarget.cpp b/src/polybench/POLYBENCH_3MM-OMPTarget.cpp index 3f093b288..6326ef629 100644 --- a/src/polybench/POLYBENCH_3MM-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_3MM-OMPTarget.cpp @@ -44,7 +44,7 @@ namespace polybench deallocOpenMPDeviceData(F, did); \ deallocOpenMPDeviceData(G, did); -void POLYBENCH_3MM::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_3MM::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_3MM-Seq.cpp b/src/polybench/POLYBENCH_3MM-Seq.cpp index a5ccf90a6..0857ac6b4 100644 --- a/src/polybench/POLYBENCH_3MM-Seq.cpp +++ b/src/polybench/POLYBENCH_3MM-Seq.cpp @@ -20,7 +20,7 @@ namespace polybench { -void POLYBENCH_3MM::runSeqVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_3MM::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_3MM.cpp b/src/polybench/POLYBENCH_3MM.cpp index 7024287ab..08d2179fb 100644 --- a/src/polybench/POLYBENCH_3MM.cpp +++ b/src/polybench/POLYBENCH_3MM.cpp @@ -92,7 +92,7 @@ POLYBENCH_3MM::~POLYBENCH_3MM() { } -void POLYBENCH_3MM::setUp(VariantID vid, size_t /*tid*/) +void POLYBENCH_3MM::setUp(VariantID vid, size_t /*tune_idx*/) { (void) vid; allocAndInitData(m_A, m_ni * m_nk, vid); @@ -104,12 +104,12 @@ void POLYBENCH_3MM::setUp(VariantID vid, size_t /*tid*/) allocAndInitDataConst(m_G, m_ni * m_nl, 0.0, vid); } -void POLYBENCH_3MM::updateChecksum(VariantID vid, size_t tid) +void POLYBENCH_3MM::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_G, m_ni * m_nl, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_G, m_ni * m_nl, checksum_scale_factor ); } -void POLYBENCH_3MM::tearDown(VariantID vid, size_t /*tid*/) +void POLYBENCH_3MM::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_A); diff --git a/src/polybench/POLYBENCH_3MM.hpp b/src/polybench/POLYBENCH_3MM.hpp index 67993f699..df4734df0 100644 --- a/src/polybench/POLYBENCH_3MM.hpp +++ b/src/polybench/POLYBENCH_3MM.hpp @@ -144,15 +144,15 @@ class POLYBENCH_3MM : public KernelBase ~POLYBENCH_3MM(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID 
vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/polybench/POLYBENCH_ADI-Cuda.cpp b/src/polybench/POLYBENCH_ADI-Cuda.cpp index e552c774f..d2c913cf8 100644 --- a/src/polybench/POLYBENCH_ADI-Cuda.cpp +++ b/src/polybench/POLYBENCH_ADI-Cuda.cpp @@ -248,13 +248,13 @@ void POLYBENCH_ADI::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_ADI::runCudaVariant(VariantID vid, size_t tid) +void POLYBENCH_ADI::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/polybench/POLYBENCH_ADI-Hip.cpp b/src/polybench/POLYBENCH_ADI-Hip.cpp index 46bfb9d47..cbc8ea5cb 100644 --- a/src/polybench/POLYBENCH_ADI-Hip.cpp +++ b/src/polybench/POLYBENCH_ADI-Hip.cpp @@ -257,13 +257,13 @@ void POLYBENCH_ADI::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_ADI::runHipVariant(VariantID vid, size_t tid) +void POLYBENCH_ADI::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/polybench/POLYBENCH_ADI-OMP.cpp b/src/polybench/POLYBENCH_ADI-OMP.cpp index 3ad69d224..71e9218d6 100644 --- a/src/polybench/POLYBENCH_ADI-OMP.cpp +++ b/src/polybench/POLYBENCH_ADI-OMP.cpp @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_ADI::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_ADI::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_ADI-OMPTarget.cpp b/src/polybench/POLYBENCH_ADI-OMPTarget.cpp index 7e2fd7295..e2dd834fc 100644 --- a/src/polybench/POLYBENCH_ADI-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_ADI-OMPTarget.cpp @@ -43,7 +43,7 @@ namespace polybench deallocOpenMPDeviceData(Q, did); -void POLYBENCH_ADI::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_ADI::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_ADI-Seq.cpp b/src/polybench/POLYBENCH_ADI-Seq.cpp index a8ef14edb..69ab50df1 100644 --- a/src/polybench/POLYBENCH_ADI-Seq.cpp +++ b/src/polybench/POLYBENCH_ADI-Seq.cpp @@ -18,7 +18,7 @@ namespace rajaperf namespace polybench { -void POLYBENCH_ADI::runSeqVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_ADI::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_ADI.cpp b/src/polybench/POLYBENCH_ADI.cpp index 80471b5ef..3c3fd6a05 100644 --- a/src/polybench/POLYBENCH_ADI.cpp +++ b/src/polybench/POLYBENCH_ADI.cpp @@ -69,7 +69,7 @@ 
POLYBENCH_ADI::~POLYBENCH_ADI() { } -void POLYBENCH_ADI::setUp(VariantID vid, size_t /*tid*/) +void POLYBENCH_ADI::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitDataConst(m_U, m_n * m_n, 0.0, vid); allocAndInitData(m_V, m_n * m_n, vid); @@ -77,12 +77,12 @@ void POLYBENCH_ADI::setUp(VariantID vid, size_t /*tid*/) allocAndInitData(m_Q, m_n * m_n, vid); } -void POLYBENCH_ADI::updateChecksum(VariantID vid, size_t tid) +void POLYBENCH_ADI::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_U, m_n * m_n, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_U, m_n * m_n, checksum_scale_factor ); } -void POLYBENCH_ADI::tearDown(VariantID vid, size_t /*tid*/) +void POLYBENCH_ADI::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_U); diff --git a/src/polybench/POLYBENCH_ADI.hpp b/src/polybench/POLYBENCH_ADI.hpp index b8ff88e1f..aafc643f9 100644 --- a/src/polybench/POLYBENCH_ADI.hpp +++ b/src/polybench/POLYBENCH_ADI.hpp @@ -186,15 +186,15 @@ class POLYBENCH_ADI : public KernelBase ~POLYBENCH_ADI(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/polybench/POLYBENCH_ATAX-Cuda.cpp b/src/polybench/POLYBENCH_ATAX-Cuda.cpp index 8672bc407..02a623129 100644 --- a/src/polybench/POLYBENCH_ATAX-Cuda.cpp +++ b/src/polybench/POLYBENCH_ATAX-Cuda.cpp @@ -230,13 +230,13 @@ void POLYBENCH_ATAX::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_ATAX::runCudaVariant(VariantID vid, size_t tid) +void POLYBENCH_ATAX::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/polybench/POLYBENCH_ATAX-Hip.cpp b/src/polybench/POLYBENCH_ATAX-Hip.cpp index 8fd41edf9..780ff4616 100644 --- a/src/polybench/POLYBENCH_ATAX-Hip.cpp +++ b/src/polybench/POLYBENCH_ATAX-Hip.cpp @@ -237,13 +237,13 @@ void POLYBENCH_ATAX::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_ATAX::runHipVariant(VariantID vid, size_t tid) +void POLYBENCH_ATAX::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/polybench/POLYBENCH_ATAX-OMP.cpp b/src/polybench/POLYBENCH_ATAX-OMP.cpp index c24a82172..5c9f84400 100644 --- a/src/polybench/POLYBENCH_ATAX-OMP.cpp +++ b/src/polybench/POLYBENCH_ATAX-OMP.cpp @@ -19,7 
+19,7 @@ namespace polybench { -void POLYBENCH_ATAX::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_ATAX::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp b/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp index e04957c9d..154cb6a46 100644 --- a/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp @@ -44,7 +44,7 @@ namespace polybench deallocOpenMPDeviceData(A, did); -void POLYBENCH_ATAX::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_ATAX::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_ATAX-Seq.cpp b/src/polybench/POLYBENCH_ATAX-Seq.cpp index fe898b796..5a52cac03 100644 --- a/src/polybench/POLYBENCH_ATAX-Seq.cpp +++ b/src/polybench/POLYBENCH_ATAX-Seq.cpp @@ -18,7 +18,7 @@ namespace rajaperf namespace polybench { -void POLYBENCH_ATAX::runSeqVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_ATAX::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps= getRunReps(); diff --git a/src/polybench/POLYBENCH_ATAX.cpp b/src/polybench/POLYBENCH_ATAX.cpp index cc54986ad..8a25c1cd7 100644 --- a/src/polybench/POLYBENCH_ATAX.cpp +++ b/src/polybench/POLYBENCH_ATAX.cpp @@ -71,7 +71,7 @@ POLYBENCH_ATAX::~POLYBENCH_ATAX() { } -void POLYBENCH_ATAX::setUp(VariantID vid, size_t /*tid*/) +void POLYBENCH_ATAX::setUp(VariantID vid, size_t /*tune_idx*/) { (void) vid; allocAndInitData(m_tmp, m_N, vid); @@ -80,12 +80,12 @@ void POLYBENCH_ATAX::setUp(VariantID vid, size_t /*tid*/) allocAndInitDataConst(m_y, m_N, 0.0, vid); } -void POLYBENCH_ATAX::updateChecksum(VariantID vid, size_t tid) +void POLYBENCH_ATAX::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_y, m_N, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_y, m_N, checksum_scale_factor ); } -void POLYBENCH_ATAX::tearDown(VariantID vid, size_t /*tid*/) +void POLYBENCH_ATAX::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_tmp); diff --git a/src/polybench/POLYBENCH_ATAX.hpp b/src/polybench/POLYBENCH_ATAX.hpp index ee7de33e1..eeb641b28 100644 --- a/src/polybench/POLYBENCH_ATAX.hpp +++ b/src/polybench/POLYBENCH_ATAX.hpp @@ -106,15 +106,15 @@ class POLYBENCH_ATAX : public KernelBase ~POLYBENCH_ATAX(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp b/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp index 55f50316d..5657674da 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp 
+++ b/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp @@ -320,13 +320,13 @@ void POLYBENCH_FDTD_2D::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_FDTD_2D::runCudaVariant(VariantID vid, size_t tid) +void POLYBENCH_FDTD_2D::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp b/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp index ace5fecb6..d6012a66f 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp @@ -332,13 +332,13 @@ void POLYBENCH_FDTD_2D::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_FDTD_2D::runHipVariant(VariantID vid, size_t tid) +void POLYBENCH_FDTD_2D::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/polybench/POLYBENCH_FDTD_2D-OMP.cpp b/src/polybench/POLYBENCH_FDTD_2D-OMP.cpp index 1358bab54..57eeddf33 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-OMP.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-OMP.cpp @@ -18,7 +18,7 @@ namespace polybench { -void POLYBENCH_FDTD_2D::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_FDTD_2D::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp b/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp index 0416c534b..775451df7 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp @@ -43,7 +43,7 @@ namespace polybench deallocOpenMPDeviceData(fict, did); -void POLYBENCH_FDTD_2D::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_FDTD_2D::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_FDTD_2D-Seq.cpp b/src/polybench/POLYBENCH_FDTD_2D-Seq.cpp index 90d64f340..d1759d8a5 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Seq.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Seq.cpp @@ -18,7 +18,7 @@ namespace polybench { -void POLYBENCH_FDTD_2D::runSeqVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_FDTD_2D::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_FDTD_2D.cpp b/src/polybench/POLYBENCH_FDTD_2D.cpp index c23a2f300..3ace1b816 100644 --- a/src/polybench/POLYBENCH_FDTD_2D.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D.cpp @@ -90,7 +90,7 @@ POLYBENCH_FDTD_2D::~POLYBENCH_FDTD_2D() { } -void POLYBENCH_FDTD_2D::setUp(VariantID vid, size_t /*tid*/) +void POLYBENCH_FDTD_2D::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitDataConst(m_hz, m_nx * m_ny, 0.0, vid); allocAndInitData(m_ex, m_nx * m_ny, vid); @@ -98,12 +98,12 @@ void POLYBENCH_FDTD_2D::setUp(VariantID vid, size_t /*tid*/) allocAndInitData(m_fict, m_tsteps, vid); } -void POLYBENCH_FDTD_2D::updateChecksum(VariantID vid, size_t tid) +void POLYBENCH_FDTD_2D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_hz, m_nx * m_ny, checksum_scale_factor); + checksum[vid][tune_idx] += calcChecksum(m_hz, m_nx * m_ny, checksum_scale_factor); } 
-void POLYBENCH_FDTD_2D::tearDown(VariantID vid, size_t /*tid*/) +void POLYBENCH_FDTD_2D::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_fict); diff --git a/src/polybench/POLYBENCH_FDTD_2D.hpp b/src/polybench/POLYBENCH_FDTD_2D.hpp index 69963d8d8..c9133c4a6 100644 --- a/src/polybench/POLYBENCH_FDTD_2D.hpp +++ b/src/polybench/POLYBENCH_FDTD_2D.hpp @@ -104,15 +104,15 @@ class POLYBENCH_FDTD_2D : public KernelBase ~POLYBENCH_FDTD_2D(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp index 0ba8c708c..be2e3ccda 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp @@ -179,13 +179,13 @@ void POLYBENCH_FLOYD_WARSHALL::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_FLOYD_WARSHALL::runCudaVariant(VariantID vid, size_t tid) +void POLYBENCH_FLOYD_WARSHALL::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp index 78fbd2098..5d9999a72 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp @@ -183,13 +183,13 @@ void POLYBENCH_FLOYD_WARSHALL::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_FLOYD_WARSHALL::runHipVariant(VariantID vid, size_t tid) +void POLYBENCH_FLOYD_WARSHALL::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp index 9e4bfa268..730a55911 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp @@ -24,7 +24,7 @@ namespace polybench { -void POLYBENCH_FLOYD_WARSHALL::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_FLOYD_WARSHALL::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp index 532de113a..ff70686a5 100644 --- 
a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp @@ -35,7 +35,7 @@ namespace polybench deallocOpenMPDeviceData(pout, did); -void POLYBENCH_FLOYD_WARSHALL::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_FLOYD_WARSHALL::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp index bee7ff9fe..0c298b9b7 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp @@ -18,7 +18,7 @@ namespace polybench { -void POLYBENCH_FLOYD_WARSHALL::runSeqVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_FLOYD_WARSHALL::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps= getRunReps(); diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp index aae0fbfa9..c10cc9427 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp @@ -66,19 +66,19 @@ POLYBENCH_FLOYD_WARSHALL::~POLYBENCH_FLOYD_WARSHALL() { } -void POLYBENCH_FLOYD_WARSHALL::setUp(VariantID vid, size_t /*tid*/) +void POLYBENCH_FLOYD_WARSHALL::setUp(VariantID vid, size_t /*tune_idx*/) { (void) vid; allocAndInitDataRandSign(m_pin, m_N*m_N, vid); allocAndInitDataConst(m_pout, m_N*m_N, 0.0, vid); } -void POLYBENCH_FLOYD_WARSHALL::updateChecksum(VariantID vid, size_t tid) +void POLYBENCH_FLOYD_WARSHALL::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_pout, m_N*m_N, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_pout, m_N*m_N, checksum_scale_factor ); } -void POLYBENCH_FLOYD_WARSHALL::tearDown(VariantID vid, size_t /*tid*/) +void POLYBENCH_FLOYD_WARSHALL::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_pin); diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp index 89db75951..261d7818b 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp @@ -67,15 +67,15 @@ class POLYBENCH_FLOYD_WARSHALL : public KernelBase ~POLYBENCH_FLOYD_WARSHALL(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/polybench/POLYBENCH_GEMM-Cuda.cpp b/src/polybench/POLYBENCH_GEMM-Cuda.cpp index f5276210e..e5818f3e3 100644 --- a/src/polybench/POLYBENCH_GEMM-Cuda.cpp +++ b/src/polybench/POLYBENCH_GEMM-Cuda.cpp @@ -205,13 +205,13 @@ void POLYBENCH_GEMM::runCudaVariantImpl(VariantID vid) } } -void 
POLYBENCH_GEMM::runCudaVariant(VariantID vid, size_t tid) +void POLYBENCH_GEMM::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/polybench/POLYBENCH_GEMM-Hip.cpp b/src/polybench/POLYBENCH_GEMM-Hip.cpp index 2f31c997c..4d99c5b3a 100644 --- a/src/polybench/POLYBENCH_GEMM-Hip.cpp +++ b/src/polybench/POLYBENCH_GEMM-Hip.cpp @@ -206,13 +206,13 @@ void POLYBENCH_GEMM::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_GEMM::runHipVariant(VariantID vid, size_t tid) +void POLYBENCH_GEMM::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/polybench/POLYBENCH_GEMM-OMP.cpp b/src/polybench/POLYBENCH_GEMM-OMP.cpp index 46e91c742..171e60082 100644 --- a/src/polybench/POLYBENCH_GEMM-OMP.cpp +++ b/src/polybench/POLYBENCH_GEMM-OMP.cpp @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_GEMM::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_GEMM::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp b/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp index c77cb3d41..37ef4fbb4 100644 --- a/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp @@ -37,7 +37,7 @@ namespace polybench deallocOpenMPDeviceData(C, did); -void POLYBENCH_GEMM::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_GEMM::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_GEMM-Seq.cpp b/src/polybench/POLYBENCH_GEMM-Seq.cpp index c4fbe8270..f45741471 100644 --- a/src/polybench/POLYBENCH_GEMM-Seq.cpp +++ b/src/polybench/POLYBENCH_GEMM-Seq.cpp @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_GEMM::runSeqVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_GEMM::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps= getRunReps(); diff --git a/src/polybench/POLYBENCH_GEMM.cpp b/src/polybench/POLYBENCH_GEMM.cpp index 21da7486e..8d52c7002 100644 --- a/src/polybench/POLYBENCH_GEMM.cpp +++ b/src/polybench/POLYBENCH_GEMM.cpp @@ -76,7 +76,7 @@ POLYBENCH_GEMM::~POLYBENCH_GEMM() { } -void POLYBENCH_GEMM::setUp(VariantID vid, size_t /*tid*/) +void POLYBENCH_GEMM::setUp(VariantID vid, size_t /*tune_idx*/) { (void) vid; allocAndInitData(m_A, m_ni * m_nk, vid); @@ -84,12 +84,12 @@ void POLYBENCH_GEMM::setUp(VariantID vid, size_t /*tid*/) allocAndInitDataConst(m_C, m_ni * m_nj, 0.0, vid); } -void POLYBENCH_GEMM::updateChecksum(VariantID vid, size_t tid) +void POLYBENCH_GEMM::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_C, m_ni * m_nj, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_C, m_ni * m_nj, checksum_scale_factor ); } -void POLYBENCH_GEMM::tearDown(VariantID vid, size_t /*tid*/) +void POLYBENCH_GEMM::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_A); diff --git a/src/polybench/POLYBENCH_GEMM.hpp b/src/polybench/POLYBENCH_GEMM.hpp index 44378c8ab..92c400583 100644 --- 
a/src/polybench/POLYBENCH_GEMM.hpp +++ b/src/polybench/POLYBENCH_GEMM.hpp @@ -90,15 +90,15 @@ class POLYBENCH_GEMM : public KernelBase ~POLYBENCH_GEMM(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/polybench/POLYBENCH_GEMVER-Cuda.cpp b/src/polybench/POLYBENCH_GEMVER-Cuda.cpp index f31559601..aaf7f765f 100644 --- a/src/polybench/POLYBENCH_GEMVER-Cuda.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Cuda.cpp @@ -342,13 +342,13 @@ void POLYBENCH_GEMVER::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_GEMVER::runCudaVariant(VariantID vid, size_t tid) +void POLYBENCH_GEMVER::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/polybench/POLYBENCH_GEMVER-Hip.cpp b/src/polybench/POLYBENCH_GEMVER-Hip.cpp index 67a4e8f8d..efaeebd0f 100644 --- a/src/polybench/POLYBENCH_GEMVER-Hip.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Hip.cpp @@ -350,13 +350,13 @@ void POLYBENCH_GEMVER::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_GEMVER::runHipVariant(VariantID vid, size_t tid) +void POLYBENCH_GEMVER::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/polybench/POLYBENCH_GEMVER-OMP.cpp b/src/polybench/POLYBENCH_GEMVER-OMP.cpp index ba09e8ee7..d351b26e4 100644 --- a/src/polybench/POLYBENCH_GEMVER-OMP.cpp +++ b/src/polybench/POLYBENCH_GEMVER-OMP.cpp @@ -20,7 +20,7 @@ namespace polybench { -void POLYBENCH_GEMVER::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_GEMVER::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp b/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp index 0afb7f137..78c94db76 100644 --- a/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp @@ -54,7 +54,7 @@ namespace polybench -void POLYBENCH_GEMVER::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_GEMVER::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_GEMVER-Seq.cpp b/src/polybench/POLYBENCH_GEMVER-Seq.cpp index 3b33a3e71..48bd8dba8 100644 --- a/src/polybench/POLYBENCH_GEMVER-Seq.cpp +++ 
b/src/polybench/POLYBENCH_GEMVER-Seq.cpp @@ -20,7 +20,7 @@ namespace polybench { -void POLYBENCH_GEMVER::runSeqVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_GEMVER::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_GEMVER.cpp b/src/polybench/POLYBENCH_GEMVER.cpp index 020362eb9..917801777 100644 --- a/src/polybench/POLYBENCH_GEMVER.cpp +++ b/src/polybench/POLYBENCH_GEMVER.cpp @@ -85,7 +85,7 @@ POLYBENCH_GEMVER::~POLYBENCH_GEMVER() { } -void POLYBENCH_GEMVER::setUp(VariantID vid, size_t /*tid*/) +void POLYBENCH_GEMVER::setUp(VariantID vid, size_t /*tune_idx*/) { (void) vid; @@ -100,12 +100,12 @@ void POLYBENCH_GEMVER::setUp(VariantID vid, size_t /*tid*/) allocAndInitData(m_z, m_n, vid); } -void POLYBENCH_GEMVER::updateChecksum(VariantID vid, size_t tid) +void POLYBENCH_GEMVER::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_w, m_n, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_w, m_n, checksum_scale_factor ); } -void POLYBENCH_GEMVER::tearDown(VariantID vid, size_t /*tid*/) +void POLYBENCH_GEMVER::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_A); diff --git a/src/polybench/POLYBENCH_GEMVER.hpp b/src/polybench/POLYBENCH_GEMVER.hpp index 8008e0033..97a6c6d2c 100644 --- a/src/polybench/POLYBENCH_GEMVER.hpp +++ b/src/polybench/POLYBENCH_GEMVER.hpp @@ -143,15 +143,15 @@ class POLYBENCH_GEMVER : public KernelBase ~POLYBENCH_GEMVER(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp b/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp index 890540da4..c29a5fa3a 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp @@ -137,13 +137,13 @@ void POLYBENCH_GESUMMV::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_GESUMMV::runCudaVariant(VariantID vid, size_t tid) +void POLYBENCH_GESUMMV::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/polybench/POLYBENCH_GESUMMV-Hip.cpp b/src/polybench/POLYBENCH_GESUMMV-Hip.cpp index 68df2cb76..642b20ee4 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Hip.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Hip.cpp @@ -139,13 +139,13 @@ void POLYBENCH_GESUMMV::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_GESUMMV::runHipVariant(VariantID vid, size_t tid) +void POLYBENCH_GESUMMV::runHipVariant(VariantID 
vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/polybench/POLYBENCH_GESUMMV-OMP.cpp b/src/polybench/POLYBENCH_GESUMMV-OMP.cpp index b249ddda3..2e111fb4a 100644 --- a/src/polybench/POLYBENCH_GESUMMV-OMP.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-OMP.cpp @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_GESUMMV::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_GESUMMV::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp b/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp index 292271ef5..b5d459d25 100644 --- a/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp @@ -44,7 +44,7 @@ namespace polybench deallocOpenMPDeviceData(B, did); -void POLYBENCH_GESUMMV::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_GESUMMV::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_GESUMMV-Seq.cpp b/src/polybench/POLYBENCH_GESUMMV-Seq.cpp index 6995d3d4d..bccb3fa90 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Seq.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Seq.cpp @@ -18,7 +18,7 @@ namespace rajaperf namespace polybench { -void POLYBENCH_GESUMMV::runSeqVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_GESUMMV::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps= getRunReps(); diff --git a/src/polybench/POLYBENCH_GESUMMV.cpp b/src/polybench/POLYBENCH_GESUMMV.cpp index 5467f4cbc..9fc65f0bd 100644 --- a/src/polybench/POLYBENCH_GESUMMV.cpp +++ b/src/polybench/POLYBENCH_GESUMMV.cpp @@ -65,7 +65,7 @@ POLYBENCH_GESUMMV::~POLYBENCH_GESUMMV() { } -void POLYBENCH_GESUMMV::setUp(VariantID vid, size_t /*tid*/) +void POLYBENCH_GESUMMV::setUp(VariantID vid, size_t /*tune_idx*/) { (void) vid; allocAndInitData(m_x, m_N, vid); @@ -74,12 +74,12 @@ void POLYBENCH_GESUMMV::setUp(VariantID vid, size_t /*tid*/) allocAndInitData(m_B, m_N * m_N, vid); } -void POLYBENCH_GESUMMV::updateChecksum(VariantID vid, size_t tid) +void POLYBENCH_GESUMMV::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_y, m_N); + checksum[vid][tune_idx] += calcChecksum(m_y, m_N); } -void POLYBENCH_GESUMMV::tearDown(VariantID vid, size_t /*tid*/) +void POLYBENCH_GESUMMV::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_x); diff --git a/src/polybench/POLYBENCH_GESUMMV.hpp b/src/polybench/POLYBENCH_GESUMMV.hpp index 612ea2ab5..5139393ed 100644 --- a/src/polybench/POLYBENCH_GESUMMV.hpp +++ b/src/polybench/POLYBENCH_GESUMMV.hpp @@ -89,15 +89,15 @@ class POLYBENCH_GESUMMV : public KernelBase ~POLYBENCH_GESUMMV(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void 
runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp b/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp index e635ebacb..6670f7289 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp @@ -221,13 +221,13 @@ void POLYBENCH_HEAT_3D::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_HEAT_3D::runCudaVariant(VariantID vid, size_t tid) +void POLYBENCH_HEAT_3D::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp b/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp index 64ac373be..e6cdc193e 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp @@ -227,13 +227,13 @@ void POLYBENCH_HEAT_3D::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_HEAT_3D::runHipVariant(VariantID vid, size_t tid) +void POLYBENCH_HEAT_3D::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp b/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp index c713828c9..0ea5b192f 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_HEAT_3D::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_HEAT_3D::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp b/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp index f034c5563..434b9cdeb 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp @@ -36,7 +36,7 @@ namespace polybench deallocOpenMPDeviceData(B, did); -void POLYBENCH_HEAT_3D::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_HEAT_3D::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp b/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp index a2ca945a1..a9bdaa256 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_HEAT_3D::runSeqVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_HEAT_3D::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps= getRunReps(); diff --git a/src/polybench/POLYBENCH_HEAT_3D.cpp b/src/polybench/POLYBENCH_HEAT_3D.cpp index bb5e9897e..8bcc69f20 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D.cpp @@ -76,7 +76,7 @@ POLYBENCH_HEAT_3D::~POLYBENCH_HEAT_3D() { } -void POLYBENCH_HEAT_3D::setUp(VariantID vid, size_t /*tid*/) +void POLYBENCH_HEAT_3D::setUp(VariantID vid, size_t 
/*tune_idx*/) { (void) vid; allocAndInitData(m_Ainit, m_N*m_N*m_N, vid); @@ -85,13 +85,13 @@ void POLYBENCH_HEAT_3D::setUp(VariantID vid, size_t /*tid*/) allocAndInitDataConst(m_B, m_N*m_N*m_N, 0.0, vid); } -void POLYBENCH_HEAT_3D::updateChecksum(VariantID vid, size_t tid) +void POLYBENCH_HEAT_3D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_A, m_N*m_N*m_N, checksum_scale_factor ); - checksum[vid][tid] += calcChecksum(m_B, m_N*m_N*m_N, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_A, m_N*m_N*m_N, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_B, m_N*m_N*m_N, checksum_scale_factor ); } -void POLYBENCH_HEAT_3D::tearDown(VariantID vid, size_t /*tid*/) +void POLYBENCH_HEAT_3D::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_A); diff --git a/src/polybench/POLYBENCH_HEAT_3D.hpp b/src/polybench/POLYBENCH_HEAT_3D.hpp index 118c2e166..04d8e7b38 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.hpp +++ b/src/polybench/POLYBENCH_HEAT_3D.hpp @@ -115,15 +115,15 @@ class POLYBENCH_HEAT_3D : public KernelBase ~POLYBENCH_HEAT_3D(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp index a590eefb0..d5643e929 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp @@ -120,13 +120,13 @@ void POLYBENCH_JACOBI_1D::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_JACOBI_1D::runCudaVariant(VariantID vid, size_t tid) +void POLYBENCH_JACOBI_1D::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp index ff3398840..346801ca6 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp @@ -122,13 +122,13 @@ void POLYBENCH_JACOBI_1D::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_JACOBI_1D::runHipVariant(VariantID vid, size_t tid) +void POLYBENCH_JACOBI_1D::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp b/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp 
index d41631827..066466a27 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_JACOBI_1D::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_JACOBI_1D::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp b/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp index b01984553..ada3047ad 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp @@ -41,7 +41,7 @@ namespace polybench deallocOpenMPDeviceData(B, did); -void POLYBENCH_JACOBI_1D::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_JACOBI_1D::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp index cf661b908..65f2c8036 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_JACOBI_1D::runSeqVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_JACOBI_1D::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps= getRunReps(); diff --git a/src/polybench/POLYBENCH_JACOBI_1D.cpp b/src/polybench/POLYBENCH_JACOBI_1D.cpp index 74740e62c..fd3456866 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.cpp @@ -73,7 +73,7 @@ POLYBENCH_JACOBI_1D::~POLYBENCH_JACOBI_1D() { } -void POLYBENCH_JACOBI_1D::setUp(VariantID vid, size_t /*tid*/) +void POLYBENCH_JACOBI_1D::setUp(VariantID vid, size_t /*tune_idx*/) { (void) vid; allocAndInitData(m_Ainit, m_N, vid); @@ -82,13 +82,13 @@ void POLYBENCH_JACOBI_1D::setUp(VariantID vid, size_t /*tid*/) allocAndInitDataConst(m_B, m_N, 0.0, vid); } -void POLYBENCH_JACOBI_1D::updateChecksum(VariantID vid, size_t tid) +void POLYBENCH_JACOBI_1D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_A, m_N, checksum_scale_factor ); - checksum[vid][tid] += calcChecksum(m_B, m_N, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_A, m_N, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_B, m_N, checksum_scale_factor ); } -void POLYBENCH_JACOBI_1D::tearDown(VariantID vid, size_t /*tid*/) +void POLYBENCH_JACOBI_1D::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_A); diff --git a/src/polybench/POLYBENCH_JACOBI_1D.hpp b/src/polybench/POLYBENCH_JACOBI_1D.hpp index 0c2283963..b26a8248f 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.hpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.hpp @@ -61,15 +61,15 @@ class POLYBENCH_JACOBI_1D : public KernelBase ~POLYBENCH_JACOBI_1D(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void 
runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp index 7c9c26524..a4ed964cb 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp @@ -211,13 +211,13 @@ void POLYBENCH_JACOBI_2D::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_JACOBI_2D::runCudaVariant(VariantID vid, size_t tid) +void POLYBENCH_JACOBI_2D::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp index bb3f83a93..4755e3a59 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp @@ -217,13 +217,13 @@ void POLYBENCH_JACOBI_2D::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_JACOBI_2D::runHipVariant(VariantID vid, size_t tid) +void POLYBENCH_JACOBI_2D::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp b/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp index 795ff595a..193ed398c 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_JACOBI_2D::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_JACOBI_2D::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp b/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp index fdf5861ba..50af0d9f7 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp @@ -36,7 +36,7 @@ namespace polybench deallocOpenMPDeviceData(B, did); -void POLYBENCH_JACOBI_2D::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_JACOBI_2D::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp index 4ca50a458..addb6d3ad 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_JACOBI_2D::runSeqVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_JACOBI_2D::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps= getRunReps(); diff --git a/src/polybench/POLYBENCH_JACOBI_2D.cpp b/src/polybench/POLYBENCH_JACOBI_2D.cpp index db3da2a21..fe8972e9b 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.cpp @@ -75,7 +75,7 @@ POLYBENCH_JACOBI_2D::~POLYBENCH_JACOBI_2D() { } -void POLYBENCH_JACOBI_2D::setUp(VariantID vid, size_t /*tid*/) +void POLYBENCH_JACOBI_2D::setUp(VariantID vid, size_t /*tune_idx*/) { (void) vid; 
allocAndInitData(m_Ainit, m_N*m_N, vid); @@ -84,13 +84,13 @@ void POLYBENCH_JACOBI_2D::setUp(VariantID vid, size_t /*tid*/) allocAndInitDataConst(m_B, m_N*m_N, 0.0, vid); } -void POLYBENCH_JACOBI_2D::updateChecksum(VariantID vid, size_t tid) +void POLYBENCH_JACOBI_2D::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_A, m_N*m_N, checksum_scale_factor ); - checksum[vid][tid] += calcChecksum(m_B, m_N*m_N, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_A, m_N*m_N, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_B, m_N*m_N, checksum_scale_factor ); } -void POLYBENCH_JACOBI_2D::tearDown(VariantID vid, size_t /*tid*/) +void POLYBENCH_JACOBI_2D::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_A); diff --git a/src/polybench/POLYBENCH_JACOBI_2D.hpp b/src/polybench/POLYBENCH_JACOBI_2D.hpp index 7fd045c9c..3c8611e81 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.hpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.hpp @@ -81,15 +81,15 @@ class POLYBENCH_JACOBI_2D : public KernelBase ~POLYBENCH_JACOBI_2D(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/polybench/POLYBENCH_MVT-Cuda.cpp b/src/polybench/POLYBENCH_MVT-Cuda.cpp index c3b817a30..11195118d 100644 --- a/src/polybench/POLYBENCH_MVT-Cuda.cpp +++ b/src/polybench/POLYBENCH_MVT-Cuda.cpp @@ -178,13 +178,13 @@ void POLYBENCH_MVT::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_MVT::runCudaVariant(VariantID vid, size_t tid) +void POLYBENCH_MVT::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/polybench/POLYBENCH_MVT-Hip.cpp b/src/polybench/POLYBENCH_MVT-Hip.cpp index 13ffe7ed3..40defe841 100644 --- a/src/polybench/POLYBENCH_MVT-Hip.cpp +++ b/src/polybench/POLYBENCH_MVT-Hip.cpp @@ -176,13 +176,13 @@ void POLYBENCH_MVT::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_MVT::runHipVariant(VariantID vid, size_t tid) +void POLYBENCH_MVT::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/polybench/POLYBENCH_MVT-OMP.cpp b/src/polybench/POLYBENCH_MVT-OMP.cpp index 4e59c9d0e..7d1b0454d 100644 --- a/src/polybench/POLYBENCH_MVT-OMP.cpp +++ b/src/polybench/POLYBENCH_MVT-OMP.cpp @@ -19,7 
+19,7 @@ namespace polybench { -void POLYBENCH_MVT::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_MVT::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_MVT-OMPTarget.cpp b/src/polybench/POLYBENCH_MVT-OMPTarget.cpp index b748f2054..4487b0d52 100644 --- a/src/polybench/POLYBENCH_MVT-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_MVT-OMPTarget.cpp @@ -47,7 +47,7 @@ namespace polybench deallocOpenMPDeviceData(A, did); -void POLYBENCH_MVT::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_MVT::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_MVT-Seq.cpp b/src/polybench/POLYBENCH_MVT-Seq.cpp index 5e21371da..3283dfa37 100644 --- a/src/polybench/POLYBENCH_MVT-Seq.cpp +++ b/src/polybench/POLYBENCH_MVT-Seq.cpp @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_MVT::runSeqVariant(VariantID vid, size_t /*tid*/) +void POLYBENCH_MVT::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps= getRunReps(); diff --git a/src/polybench/POLYBENCH_MVT.cpp b/src/polybench/POLYBENCH_MVT.cpp index e6ce2c07a..60a3aca47 100644 --- a/src/polybench/POLYBENCH_MVT.cpp +++ b/src/polybench/POLYBENCH_MVT.cpp @@ -68,7 +68,7 @@ POLYBENCH_MVT::~POLYBENCH_MVT() { } -void POLYBENCH_MVT::setUp(VariantID vid, size_t /*tid*/) +void POLYBENCH_MVT::setUp(VariantID vid, size_t /*tune_idx*/) { (void) vid; allocAndInitData(m_y1, m_N, vid); @@ -78,13 +78,13 @@ void POLYBENCH_MVT::setUp(VariantID vid, size_t /*tid*/) allocAndInitDataConst(m_x2, m_N, 0.0, vid); } -void POLYBENCH_MVT::updateChecksum(VariantID vid, size_t tid) +void POLYBENCH_MVT::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_x1, m_N, checksum_scale_factor ); - checksum[vid][tid] += calcChecksum(m_x2, m_N, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_x1, m_N, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_x2, m_N, checksum_scale_factor ); } -void POLYBENCH_MVT::tearDown(VariantID vid, size_t /*tid*/) +void POLYBENCH_MVT::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_x1); diff --git a/src/polybench/POLYBENCH_MVT.hpp b/src/polybench/POLYBENCH_MVT.hpp index 63146c6b1..8a397f8dd 100644 --- a/src/polybench/POLYBENCH_MVT.hpp +++ b/src/polybench/POLYBENCH_MVT.hpp @@ -103,15 +103,15 @@ class POLYBENCH_MVT : public KernelBase ~POLYBENCH_MVT(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/stream/ADD-Cuda.cpp b/src/stream/ADD-Cuda.cpp index 
b560ea26b..d32f4c904 100644 --- a/src/stream/ADD-Cuda.cpp +++ b/src/stream/ADD-Cuda.cpp @@ -111,13 +111,13 @@ void ADD::runCudaVariantImpl(VariantID vid) } } -void ADD::runCudaVariant(VariantID vid, size_t tid) +void ADD::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/stream/ADD-Hip.cpp b/src/stream/ADD-Hip.cpp index 17bdc12f4..28cf6d9c9 100644 --- a/src/stream/ADD-Hip.cpp +++ b/src/stream/ADD-Hip.cpp @@ -113,13 +113,13 @@ void ADD::runHipVariantImpl(VariantID vid) } } -void ADD::runHipVariant(VariantID vid, size_t tid) +void ADD::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/stream/ADD-OMP.cpp b/src/stream/ADD-OMP.cpp index 134ce8071..3a0bae912 100644 --- a/src/stream/ADD-OMP.cpp +++ b/src/stream/ADD-OMP.cpp @@ -18,7 +18,7 @@ namespace stream { -void ADD::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void ADD::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/stream/ADD-OMPTarget.cpp b/src/stream/ADD-OMPTarget.cpp index 3d2b79661..a97a2185a 100644 --- a/src/stream/ADD-OMPTarget.cpp +++ b/src/stream/ADD-OMPTarget.cpp @@ -40,7 +40,7 @@ namespace stream deallocOpenMPDeviceData(b, did); \ deallocOpenMPDeviceData(c, did); -void ADD::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void ADD::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/stream/ADD-Seq.cpp b/src/stream/ADD-Seq.cpp index d7d6af573..5d7daf46b 100644 --- a/src/stream/ADD-Seq.cpp +++ b/src/stream/ADD-Seq.cpp @@ -18,7 +18,7 @@ namespace stream { -void ADD::runSeqVariant(VariantID vid, size_t /*tid*/) +void ADD::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/stream/ADD.cpp b/src/stream/ADD.cpp index 6263b6205..6f194964e 100644 --- a/src/stream/ADD.cpp +++ b/src/stream/ADD.cpp @@ -58,19 +58,19 @@ ADD::~ADD() { } -void ADD::setUp(VariantID vid, size_t /*tid*/) +void ADD::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitData(m_a, getActualProblemSize(), vid); allocAndInitData(m_b, getActualProblemSize(), vid); allocAndInitDataConst(m_c, getActualProblemSize(), 0.0, vid); } -void ADD::updateChecksum(VariantID vid, size_t tid) +void ADD::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_c, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_c, getActualProblemSize()); } -void ADD::tearDown(VariantID vid, size_t /*tid*/) +void ADD::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_a); diff --git a/src/stream/ADD.hpp b/src/stream/ADD.hpp index e6d995a82..3f64ff855 100644 --- a/src/stream/ADD.hpp +++ b/src/stream/ADD.hpp @@ -43,15 +43,15 @@ class ADD : public KernelBase ~ADD(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void 
runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/stream/COPY-Cuda.cpp b/src/stream/COPY-Cuda.cpp index 953b29412..b4763b6aa 100644 --- a/src/stream/COPY-Cuda.cpp +++ b/src/stream/COPY-Cuda.cpp @@ -109,13 +109,13 @@ void COPY::runCudaVariantImpl(VariantID vid) } } -void COPY::runCudaVariant(VariantID vid, size_t tid) +void COPY::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/stream/COPY-Hip.cpp b/src/stream/COPY-Hip.cpp index 8769be2e7..4ea444f63 100644 --- a/src/stream/COPY-Hip.cpp +++ b/src/stream/COPY-Hip.cpp @@ -111,13 +111,13 @@ void COPY::runHipVariantImpl(VariantID vid) } } -void COPY::runHipVariant(VariantID vid, size_t tid) +void COPY::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/stream/COPY-OMP.cpp b/src/stream/COPY-OMP.cpp index 7fb77f5d3..ba9ea5b50 100644 --- a/src/stream/COPY-OMP.cpp +++ b/src/stream/COPY-OMP.cpp @@ -18,7 +18,7 @@ namespace stream { -void COPY::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void COPY::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/stream/COPY-OMPTarget.cpp b/src/stream/COPY-OMPTarget.cpp index 0568f15d5..b06760a58 100644 --- a/src/stream/COPY-OMPTarget.cpp +++ b/src/stream/COPY-OMPTarget.cpp @@ -39,7 +39,7 @@ namespace stream deallocOpenMPDeviceData(c, did); -void COPY::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void COPY::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/stream/COPY-Seq.cpp b/src/stream/COPY-Seq.cpp index 0c54f925e..371921c03 100644 --- a/src/stream/COPY-Seq.cpp +++ b/src/stream/COPY-Seq.cpp @@ -18,7 +18,7 @@ namespace stream { -void COPY::runSeqVariant(VariantID vid, size_t /*tid*/) +void COPY::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/stream/COPY.cpp b/src/stream/COPY.cpp index 3b1765fc0..bef39fabe 100644 --- a/src/stream/COPY.cpp +++ b/src/stream/COPY.cpp @@ -58,18 +58,18 @@ COPY::~COPY() { } -void COPY::setUp(VariantID vid, size_t /*tid*/) +void COPY::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitData(m_a, getActualProblemSize(), vid); allocAndInitDataConst(m_c, getActualProblemSize(), 0.0, vid); } -void COPY::updateChecksum(VariantID vid, 
size_t tid) +void COPY::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_c, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_c, getActualProblemSize()); } -void COPY::tearDown(VariantID vid, size_t /*tid*/) +void COPY::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_a); diff --git a/src/stream/COPY.hpp b/src/stream/COPY.hpp index 25dde0e22..3e9f3569f 100644 --- a/src/stream/COPY.hpp +++ b/src/stream/COPY.hpp @@ -42,15 +42,15 @@ class COPY : public KernelBase ~COPY(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/stream/DOT-Cuda.cpp b/src/stream/DOT-Cuda.cpp index 0bd93b6a0..431668bff 100644 --- a/src/stream/DOT-Cuda.cpp +++ b/src/stream/DOT-Cuda.cpp @@ -130,13 +130,13 @@ void DOT::runCudaVariantImpl(VariantID vid) } } -void DOT::runCudaVariant(VariantID vid, size_t tid) +void DOT::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/stream/DOT-Hip.cpp b/src/stream/DOT-Hip.cpp index 1538168b3..f587099a5 100644 --- a/src/stream/DOT-Hip.cpp +++ b/src/stream/DOT-Hip.cpp @@ -132,13 +132,13 @@ void DOT::runHipVariantImpl(VariantID vid) } } -void DOT::runHipVariant(VariantID vid, size_t tid) +void DOT::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/stream/DOT-OMP.cpp b/src/stream/DOT-OMP.cpp index 43dc64b50..dee61554f 100644 --- a/src/stream/DOT-OMP.cpp +++ b/src/stream/DOT-OMP.cpp @@ -18,7 +18,7 @@ namespace stream { -void DOT::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void DOT::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/stream/DOT-OMPTarget.cpp b/src/stream/DOT-OMPTarget.cpp index 523c08542..f579e1901 100644 --- a/src/stream/DOT-OMPTarget.cpp +++ b/src/stream/DOT-OMPTarget.cpp @@ -37,7 +37,7 @@ namespace stream deallocOpenMPDeviceData(a, did); \ deallocOpenMPDeviceData(b, did); -void DOT::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void DOT::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/stream/DOT-Seq.cpp b/src/stream/DOT-Seq.cpp index 
b181ebb8f..7c84bcbe9 100644 --- a/src/stream/DOT-Seq.cpp +++ b/src/stream/DOT-Seq.cpp @@ -18,7 +18,7 @@ namespace stream { -void DOT::runSeqVariant(VariantID vid, size_t /*tid*/) +void DOT::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/stream/DOT.cpp b/src/stream/DOT.cpp index c9ba2a4fd..4534463a5 100644 --- a/src/stream/DOT.cpp +++ b/src/stream/DOT.cpp @@ -58,7 +58,7 @@ DOT::~DOT() { } -void DOT::setUp(VariantID vid, size_t /*tid*/) +void DOT::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitData(m_a, getActualProblemSize(), vid); allocAndInitData(m_b, getActualProblemSize(), vid); @@ -67,12 +67,12 @@ void DOT::setUp(VariantID vid, size_t /*tid*/) m_dot_init = 0.0; } -void DOT::updateChecksum(VariantID vid, size_t tid) +void DOT::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += m_dot; + checksum[vid][tune_idx] += m_dot; } -void DOT::tearDown(VariantID vid, size_t /*tid*/) +void DOT::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_a); diff --git a/src/stream/DOT.hpp b/src/stream/DOT.hpp index f011019f8..383d1e07e 100644 --- a/src/stream/DOT.hpp +++ b/src/stream/DOT.hpp @@ -42,15 +42,15 @@ class DOT : public KernelBase ~DOT(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/stream/MUL-Cuda.cpp b/src/stream/MUL-Cuda.cpp index 7b2c85adc..2009c5b99 100644 --- a/src/stream/MUL-Cuda.cpp +++ b/src/stream/MUL-Cuda.cpp @@ -109,13 +109,13 @@ void MUL::runCudaVariantImpl(VariantID vid) } } -void MUL::runCudaVariant(VariantID vid, size_t tid) +void MUL::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 1; diff --git a/src/stream/MUL-Hip.cpp b/src/stream/MUL-Hip.cpp index f98cbb8ff..9d0dd6e1f 100644 --- a/src/stream/MUL-Hip.cpp +++ b/src/stream/MUL-Hip.cpp @@ -111,13 +111,13 @@ void MUL::runHipVariantImpl(VariantID vid) } } -void MUL::runHipVariant(VariantID vid, size_t tid) +void MUL::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/stream/MUL-OMP.cpp b/src/stream/MUL-OMP.cpp index 7c205f25e..a82a1bf64 100644 --- a/src/stream/MUL-OMP.cpp +++ b/src/stream/MUL-OMP.cpp @@ -18,7 +18,7 @@ namespace stream { -void 
MUL::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void MUL::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/stream/MUL-OMPTarget.cpp b/src/stream/MUL-OMPTarget.cpp index 2b6d49da3..55f4d7f8b 100644 --- a/src/stream/MUL-OMPTarget.cpp +++ b/src/stream/MUL-OMPTarget.cpp @@ -38,7 +38,7 @@ namespace stream deallocOpenMPDeviceData(b, did); \ deallocOpenMPDeviceData(c, did); -void MUL::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void MUL::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/stream/MUL-Seq.cpp b/src/stream/MUL-Seq.cpp index ce0256f8e..8e7569a76 100644 --- a/src/stream/MUL-Seq.cpp +++ b/src/stream/MUL-Seq.cpp @@ -18,7 +18,7 @@ namespace stream { -void MUL::runSeqVariant(VariantID vid, size_t /*tid*/) +void MUL::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/stream/MUL.cpp b/src/stream/MUL.cpp index 1c2546183..eb3d917be 100644 --- a/src/stream/MUL.cpp +++ b/src/stream/MUL.cpp @@ -58,19 +58,19 @@ MUL::~MUL() { } -void MUL::setUp(VariantID vid, size_t /*tid*/) +void MUL::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitDataConst(m_b, getActualProblemSize(), 0.0, vid); allocAndInitData(m_c, getActualProblemSize(), vid); initData(m_alpha, vid); } -void MUL::updateChecksum(VariantID vid, size_t tid) +void MUL::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_b, getActualProblemSize()); + checksum[vid][tune_idx] += calcChecksum(m_b, getActualProblemSize()); } -void MUL::tearDown(VariantID vid, size_t /*tid*/) +void MUL::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_b); diff --git a/src/stream/MUL.hpp b/src/stream/MUL.hpp index f95966640..5cb9075a6 100644 --- a/src/stream/MUL.hpp +++ b/src/stream/MUL.hpp @@ -43,15 +43,15 @@ class MUL : public KernelBase ~MUL(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/stream/TRIAD-Cuda.cpp b/src/stream/TRIAD-Cuda.cpp index ded410ce6..686749df6 100644 --- a/src/stream/TRIAD-Cuda.cpp +++ b/src/stream/TRIAD-Cuda.cpp @@ -111,13 +111,13 @@ void TRIAD::runCudaVariantImpl(VariantID vid) } } -void TRIAD::runCudaVariant(VariantID vid, size_t tid) +void TRIAD::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runCudaVariantImpl(vid); } t += 
1; diff --git a/src/stream/TRIAD-Hip.cpp b/src/stream/TRIAD-Hip.cpp index 1126f0825..69a9880e6 100644 --- a/src/stream/TRIAD-Hip.cpp +++ b/src/stream/TRIAD-Hip.cpp @@ -113,13 +113,13 @@ void TRIAD::runHipVariantImpl(VariantID vid) } } -void TRIAD::runHipVariant(VariantID vid, size_t tid) +void TRIAD::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tid == t) { + if (tune_idx == t) { runHipVariantImpl(vid); } t += 1; diff --git a/src/stream/TRIAD-OMP.cpp b/src/stream/TRIAD-OMP.cpp index a4fe58409..ca86faf6b 100644 --- a/src/stream/TRIAD-OMP.cpp +++ b/src/stream/TRIAD-OMP.cpp @@ -18,7 +18,7 @@ namespace stream { -void TRIAD::runOpenMPVariant(VariantID vid, size_t /*tid*/) +void TRIAD::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/stream/TRIAD-OMPTarget.cpp b/src/stream/TRIAD-OMPTarget.cpp index 6eb852790..d38adede3 100644 --- a/src/stream/TRIAD-OMPTarget.cpp +++ b/src/stream/TRIAD-OMPTarget.cpp @@ -40,7 +40,7 @@ namespace stream deallocOpenMPDeviceData(b, did); \ deallocOpenMPDeviceData(c, did); -void TRIAD::runOpenMPTargetVariant(VariantID vid, size_t /*tid*/) +void TRIAD::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/stream/TRIAD-Seq.cpp b/src/stream/TRIAD-Seq.cpp index 67083d4a4..e2d759959 100644 --- a/src/stream/TRIAD-Seq.cpp +++ b/src/stream/TRIAD-Seq.cpp @@ -18,7 +18,7 @@ namespace stream { -void TRIAD::runSeqVariant(VariantID vid, size_t /*tid*/) +void TRIAD::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/stream/TRIAD.cpp b/src/stream/TRIAD.cpp index dd8744440..c0b143872 100644 --- a/src/stream/TRIAD.cpp +++ b/src/stream/TRIAD.cpp @@ -62,7 +62,7 @@ TRIAD::~TRIAD() { } -void TRIAD::setUp(VariantID vid, size_t /*tid*/) +void TRIAD::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitDataConst(m_a, getActualProblemSize(), 0.0, vid); allocAndInitData(m_b, getActualProblemSize(), vid); @@ -70,12 +70,12 @@ void TRIAD::setUp(VariantID vid, size_t /*tid*/) initData(m_alpha, vid); } -void TRIAD::updateChecksum(VariantID vid, size_t tid) +void TRIAD::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tid] += calcChecksum(m_a, getActualProblemSize(), checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_a, getActualProblemSize(), checksum_scale_factor ); } -void TRIAD::tearDown(VariantID vid, size_t /*tid*/) +void TRIAD::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_a); diff --git a/src/stream/TRIAD.hpp b/src/stream/TRIAD.hpp index 7df0d864d..6a067f708 100644 --- a/src/stream/TRIAD.hpp +++ b/src/stream/TRIAD.hpp @@ -44,15 +44,15 @@ class TRIAD : public KernelBase ~TRIAD(); - void setUp(VariantID vid, size_t tid); - void updateChecksum(VariantID vid, size_t tid); - void tearDown(VariantID vid, size_t tid); - - void runSeqVariant(VariantID vid, size_t tid); - void runOpenMPVariant(VariantID vid, size_t tid); - void runCudaVariant(VariantID vid, size_t tid); - void runHipVariant(VariantID vid, size_t tid); - void runOpenMPTargetVariant(VariantID vid, size_t tid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t 
tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); From 03a00337211092140e566e8f95bf8e173f2d0dd2 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 17 Mar 2022 16:54:34 -0700 Subject: [PATCH 277/392] Fix warnings in MAT_MAT_SHARED --- src/basic/MAT_MAT_SHARED-Cuda.cpp | 4 ++-- src/basic/MAT_MAT_SHARED-Hip.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/basic/MAT_MAT_SHARED-Cuda.cpp b/src/basic/MAT_MAT_SHARED-Cuda.cpp index 84fdb9a76..b20ab61f0 100644 --- a/src/basic/MAT_MAT_SHARED-Cuda.cpp +++ b/src/basic/MAT_MAT_SHARED-Cuda.cpp @@ -33,7 +33,7 @@ namespace basic { deallocCudaDeviceData(B); \ deallocCudaDeviceData(C); -template < size_t tile_size > +template < Index_type tile_size > __launch_bounds__(tile_size*tile_size) __global__ void mat_mat_shared(Index_type N, Real_ptr C, Real_ptr A, Real_ptr B) { @@ -64,7 +64,7 @@ __global__ void mat_mat_shared(Index_type N, Real_ptr C, Real_ptr A, template < size_t block_size > void MAT_MAT_SHARED::runCudaVariantImpl(VariantID vid) { - constexpr size_t tile_size = gpu_block_size::sqrt(block_size); + constexpr Index_type tile_size = gpu_block_size::sqrt(block_size); static_assert(tile_size*tile_size == block_size, "Invalid block_size"); const Index_type run_reps = getRunReps(); diff --git a/src/basic/MAT_MAT_SHARED-Hip.cpp b/src/basic/MAT_MAT_SHARED-Hip.cpp index efddcab60..2054efde0 100644 --- a/src/basic/MAT_MAT_SHARED-Hip.cpp +++ b/src/basic/MAT_MAT_SHARED-Hip.cpp @@ -33,7 +33,7 @@ namespace basic { deallocHipDeviceData(B); \ deallocHipDeviceData(C); -template < size_t tile_size > +template < Index_type tile_size > __launch_bounds__(tile_size*tile_size) __global__ void mat_mat_shared(Index_type N, Real_ptr C, Real_ptr A, Real_ptr B) { @@ -64,7 +64,7 @@ __global__ void mat_mat_shared(Index_type N, Real_ptr C, Real_ptr A, template < size_t block_size > void MAT_MAT_SHARED::runHipVariantImpl(VariantID vid) { - constexpr size_t tile_size = gpu_block_size::sqrt(block_size); + constexpr Index_type tile_size = gpu_block_size::sqrt(block_size); static_assert(tile_size*tile_size == block_size, "Invalid block_size"); const Index_type run_reps = getRunReps(); From ce782b1eaf3d648a89ba7a4b9d58bc4c36520622 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Thu, 17 Mar 2022 16:55:26 -0700 Subject: [PATCH 278/392] remove unsed variable --- scripts/lc-builds/blueos_nvcc_gcc.sh | 2 +- src/apps/DIFFUSION3DPA-Cuda.cpp | 4 ++-- src/apps/DIFFUSION3DPA-Hip.cpp | 4 ++-- src/apps/MASS3DPA-Cuda.cpp | 4 ++-- src/apps/MASS3DPA-Hip.cpp | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/scripts/lc-builds/blueos_nvcc_gcc.sh b/scripts/lc-builds/blueos_nvcc_gcc.sh index f29df3506..46f64ee17 100755 --- a/scripts/lc-builds/blueos_nvcc_gcc.sh +++ b/scripts/lc-builds/blueos_nvcc_gcc.sh @@ -41,7 +41,7 @@ module load cmake/3.14.5 cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_GCC_VER}/bin/g++ \ - -DBLT_CXX_STD=c++11 \ + -DBLT_CXX_STD=c++14 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DENABLE_CUDA=On \ diff --git a/src/apps/DIFFUSION3DPA-Cuda.cpp b/src/apps/DIFFUSION3DPA-Cuda.cpp index 35722eae0..09d275c9c 100644 --- a/src/apps/DIFFUSION3DPA-Cuda.cpp +++ 
b/src/apps/DIFFUSION3DPA-Cuda.cpp @@ -37,7 +37,7 @@ namespace apps { deallocCudaDeviceData(X); \ deallocCudaDeviceData(Y); -__global__ void Diffusion3DPA(Index_type NE, const Real_ptr Basis, +__global__ void Diffusion3DPA(const Real_ptr Basis, const Real_ptr dBasis, const Real_ptr D, const Real_ptr X, Real_ptr Y, bool symmetric) { @@ -134,7 +134,7 @@ void DIFFUSION3DPA::runCudaVariant(VariantID vid) { dim3 nthreads_per_block(DPA_Q1D, DPA_Q1D, DPA_Q1D); - Diffusion3DPA<<>>(NE, Basis, dBasis, D, X, Y, + Diffusion3DPA<<>>(Basis, dBasis, D, X, Y, symmetric); cudaErrchk(cudaGetLastError()); diff --git a/src/apps/DIFFUSION3DPA-Hip.cpp b/src/apps/DIFFUSION3DPA-Hip.cpp index 44c18a542..ecb094b04 100644 --- a/src/apps/DIFFUSION3DPA-Hip.cpp +++ b/src/apps/DIFFUSION3DPA-Hip.cpp @@ -37,7 +37,7 @@ namespace apps { deallocHipDeviceData(X); \ deallocHipDeviceData(Y); -__global__ void Diffusion3DPA(Index_type NE, const Real_ptr Basis, +__global__ void Diffusion3DPA(const Real_ptr Basis, const Real_ptr dBasis, const Real_ptr D, const Real_ptr X, Real_ptr Y, bool symmetric) { @@ -136,7 +136,7 @@ void DIFFUSION3DPA::runHipVariant(VariantID vid) { for (RepIndex_type irep = 0; irep < run_reps; ++irep) { hipLaunchKernelGGL((Diffusion3DPA), dim3(grid_size), dim3(block_size), 0, - 0, NE, Basis, dBasis, D, X, Y, symmetric); + 0, Basis, dBasis, D, X, Y, symmetric); hipErrchk(hipGetLastError()); } diff --git a/src/apps/MASS3DPA-Cuda.cpp b/src/apps/MASS3DPA-Cuda.cpp index 288886be2..748f9c9e3 100644 --- a/src/apps/MASS3DPA-Cuda.cpp +++ b/src/apps/MASS3DPA-Cuda.cpp @@ -37,7 +37,7 @@ namespace apps { deallocCudaDeviceData(X); \ deallocCudaDeviceData(Y); -__global__ void Mass3DPA(Index_type NE, const Real_ptr B, const Real_ptr Bt, +__global__ void Mass3DPA(const Real_ptr B, const Real_ptr Bt, const Real_ptr D, const Real_ptr X, Real_ptr Y) { const int e = blockIdx.x; @@ -116,7 +116,7 @@ void MASS3DPA::runCudaVariant(VariantID vid) { dim3 nthreads_per_block(MPA_Q1D, MPA_Q1D, 1); - Mass3DPA<<>>(NE, B, Bt, D, X, Y); + Mass3DPA<<>>(B, Bt, D, X, Y); cudaErrchk( cudaGetLastError() ); } diff --git a/src/apps/MASS3DPA-Hip.cpp b/src/apps/MASS3DPA-Hip.cpp index 3a545da0c..2a3e95c12 100644 --- a/src/apps/MASS3DPA-Hip.cpp +++ b/src/apps/MASS3DPA-Hip.cpp @@ -37,7 +37,7 @@ namespace apps { deallocHipDeviceData(X); \ deallocHipDeviceData(Y); -__global__ void Mass3DPA(Index_type NE, const Real_ptr B, const Real_ptr Bt, +__global__ void Mass3DPA(const Real_ptr B, const Real_ptr Bt, const Real_ptr D, const Real_ptr X, Real_ptr Y) { const int e = hipBlockIdx_x; @@ -118,7 +118,7 @@ void MASS3DPA::runHipVariant(VariantID vid) { for (RepIndex_type irep = 0; irep < run_reps; ++irep) { hipLaunchKernelGGL((Mass3DPA), dim3(grid_size), dim3(block_size), 0, 0, - NE, B, Bt, D, X, Y); + B, Bt, D, X, Y); hipErrchk( hipGetLastError() ); From 6a70a6525d7f2083761528d8dc5c2a65af962142 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 17 Mar 2022 17:04:20 -0700 Subject: [PATCH 279/392] Fix unused var varning in MASS3DPA --- src/apps/MASS3DPA-Cuda.cpp | 2 +- src/apps/MASS3DPA-Hip.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/apps/MASS3DPA-Cuda.cpp b/src/apps/MASS3DPA-Cuda.cpp index 69069d780..fa4aa57e5 100644 --- a/src/apps/MASS3DPA-Cuda.cpp +++ b/src/apps/MASS3DPA-Cuda.cpp @@ -102,7 +102,7 @@ __global__ void Mass3DPA(Index_type NE, const Real_ptr B, const Real_ptr Bt, } } -void MASS3DPA::runCudaVariant(VariantID vid, size_t tune_idx) { +void MASS3DPA::runCudaVariant(VariantID vid, size_t /*tune_idx*/) { const 
Index_type run_reps = getRunReps(); MASS3DPA_DATA_SETUP; diff --git a/src/apps/MASS3DPA-Hip.cpp b/src/apps/MASS3DPA-Hip.cpp index 09ce997ec..ed2e04c65 100644 --- a/src/apps/MASS3DPA-Hip.cpp +++ b/src/apps/MASS3DPA-Hip.cpp @@ -102,7 +102,7 @@ __global__ void Mass3DPA(Index_type NE, const Real_ptr B, const Real_ptr Bt, } } -void MASS3DPA::runHipVariant(VariantID vid, size_t tune_idx) { +void MASS3DPA::runHipVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); MASS3DPA_DATA_SETUP; From c3127935dd7f155cf354ad639a93be66a36483c0 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Fri, 18 Mar 2022 10:42:24 -0700 Subject: [PATCH 280/392] update to single policies with team kernels --- src/apps/DIFFUSION3DPA-Cuda.cpp | 12 ++++----- src/apps/DIFFUSION3DPA-Hip.cpp | 12 ++++----- src/apps/DIFFUSION3DPA-OMP.cpp | 37 +++++-------------------- src/apps/DIFFUSION3DPA-Seq.cpp | 45 +++++++------------------------ src/apps/DIFFUSION3DPA.hpp | 16 ----------- src/apps/MASS3DPA-Cuda.cpp | 17 +++--------- src/apps/MASS3DPA-Hip.cpp | 17 +++--------- src/apps/MASS3DPA-OMP.cpp | 26 ++++-------------- src/apps/MASS3DPA-Seq.cpp | 32 ++++++---------------- src/apps/MASS3DPA.hpp | 14 ---------- src/basic/MAT_MAT_SHARED-Cuda.cpp | 22 ++++----------- src/basic/MAT_MAT_SHARED-Hip.cpp | 21 ++++----------- src/basic/MAT_MAT_SHARED-OMP.cpp | 32 +++++----------------- src/basic/MAT_MAT_SHARED-Seq.cpp | 41 +++++++--------------------- src/basic/MAT_MAT_SHARED.hpp | 16 ----------- 15 files changed, 73 insertions(+), 287 deletions(-) diff --git a/src/apps/DIFFUSION3DPA-Cuda.cpp b/src/apps/DIFFUSION3DPA-Cuda.cpp index 09d275c9c..00fa54e22 100644 --- a/src/apps/DIFFUSION3DPA-Cuda.cpp +++ b/src/apps/DIFFUSION3DPA-Cuda.cpp @@ -151,26 +151,24 @@ void DIFFUSION3DPA::runCudaVariant(VariantID vid) { DIFFUSION3DPA_DATA_SETUP_CUDA; using launch_policy = - RAJA::expt::LaunchPolicy>; + RAJA::expt::LaunchPolicy>; using outer_x = - RAJA::expt::LoopPolicy; + RAJA::expt::LoopPolicy; using inner_x = - RAJA::expt::LoopPolicy; + RAJA::expt::LoopPolicy; using inner_y = - RAJA::expt::LoopPolicy; + RAJA::expt::LoopPolicy; using inner_z = - RAJA::expt::LoopPolicy; + RAJA::expt::LoopPolicy; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { RAJA::expt::launch( - RAJA::expt::DEVICE, RAJA::expt::Grid(RAJA::expt::Teams(NE), RAJA::expt::Threads(DPA_Q1D, DPA_Q1D, DPA_Q1D)), [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { diff --git a/src/apps/DIFFUSION3DPA-Hip.cpp b/src/apps/DIFFUSION3DPA-Hip.cpp index ecb094b04..228f91c3e 100644 --- a/src/apps/DIFFUSION3DPA-Hip.cpp +++ b/src/apps/DIFFUSION3DPA-Hip.cpp @@ -152,26 +152,24 @@ void DIFFUSION3DPA::runHipVariant(VariantID vid) { DIFFUSION3DPA_DATA_SETUP_HIP; using launch_policy = - RAJA::expt::LaunchPolicy>; + RAJA::expt::LaunchPolicy>; using outer_x = - RAJA::expt::LoopPolicy; + RAJA::expt::LoopPolicy; using inner_x = - RAJA::expt::LoopPolicy; + RAJA::expt::LoopPolicy; using inner_y = - RAJA::expt::LoopPolicy; + RAJA::expt::LoopPolicy; using inner_z = - RAJA::expt::LoopPolicy; + RAJA::expt::LoopPolicy; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { RAJA::expt::launch( - RAJA::expt::DEVICE, RAJA::expt::Grid(RAJA::expt::Teams(NE), RAJA::expt::Threads(DPA_Q1D, DPA_Q1D, DPA_Q1D)), [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { diff --git a/src/apps/DIFFUSION3DPA-OMP.cpp b/src/apps/DIFFUSION3DPA-OMP.cpp index bd8ebe368..8c62ce4e8 100644 --- a/src/apps/DIFFUSION3DPA-OMP.cpp +++ b/src/apps/DIFFUSION3DPA-OMP.cpp @@ -115,47 +115,22 
@@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { case RAJA_OpenMP: { // Currently Teams requires two policies if compiled with a device - using launch_policy = RAJA::expt::LaunchPolicy; + using launch_policy = RAJA::expt::LaunchPolicy; - using outer_x = RAJA::expt::LoopPolicy; + using outer_x = RAJA::expt::LoopPolicy; - using inner_x = RAJA::expt::LoopPolicy; + using inner_x = RAJA::expt::LoopPolicy; - using inner_y = RAJA::expt::LoopPolicy; + using inner_y = RAJA::expt::LoopPolicy; - using inner_z = RAJA::expt::LoopPolicy; + using inner_z = RAJA::expt::LoopPolicy; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { // Grid is empty as the host does not need a compute grid to be specified RAJA::expt::launch( - RAJA::expt::HOST, RAJA::expt::Grid(), + RAJA::expt::Grid(), [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NE), diff --git a/src/apps/DIFFUSION3DPA-Seq.cpp b/src/apps/DIFFUSION3DPA-Seq.cpp index b49fa7e2b..4f3310867 100644 --- a/src/apps/DIFFUSION3DPA-Seq.cpp +++ b/src/apps/DIFFUSION3DPA-Seq.cpp @@ -113,47 +113,22 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { case RAJA_Seq: { // Currently Teams requires two policies if compiled with a device - using launch_policy = RAJA::expt::LaunchPolicy; - - using outer_x = RAJA::expt::LoopPolicy; - - using inner_x = RAJA::expt::LoopPolicy; - - using inner_y = RAJA::expt::LoopPolicy; - - using inner_z = RAJA::expt::LoopPolicy; + using launch_policy = RAJA::expt::LaunchPolicy; + + using outer_x = RAJA::expt::LoopPolicy; + + using inner_x = RAJA::expt::LoopPolicy; + + using inner_y = RAJA::expt::LoopPolicy; + + using inner_z = RAJA::expt::LoopPolicy; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { // Grid is empty as the host does not need a compute grid to be specified RAJA::expt::launch( - RAJA::expt::HOST, RAJA::expt::Grid(), + RAJA::expt::Grid(), [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NE), diff --git a/src/apps/DIFFUSION3DPA.hpp b/src/apps/DIFFUSION3DPA.hpp index bf73769da..09c356241 100644 --- a/src/apps/DIFFUSION3DPA.hpp +++ b/src/apps/DIFFUSION3DPA.hpp @@ -457,22 +457,6 @@ static RAJA_HOST_DEVICE inline double sign(const int q, const int d) } \ dpaY_(dx,dy,dz,e) += (u + v + w); -#if defined(RAJA_ENABLE_CUDA) - using d3d_device_launch = RAJA::expt::cuda_launch_t; - using d3d_gpu_block_x_policy = RAJA::cuda_block_x_direct; - using d3d_gpu_thread_x_policy = RAJA::cuda_thread_x_loop; - using d3d_gpu_thread_y_policy = RAJA::cuda_thread_y_loop; - using d3d_gpu_thread_z_policy = RAJA::cuda_thread_z_loop; -#endif - -#if defined(RAJA_ENABLE_HIP) - using d3d_device_launch = RAJA::expt::hip_launch_t; - using d3d_gpu_block_x_policy = RAJA::hip_block_x_direct; - using d3d_gpu_thread_x_policy = RAJA::hip_thread_x_loop; - using d3d_gpu_thread_y_policy = RAJA::hip_thread_y_loop; - using d3d_gpu_thread_z_policy = RAJA::hip_thread_z_loop; -#endif - namespace rajaperf { class RunParams; diff --git a/src/apps/MASS3DPA-Cuda.cpp b/src/apps/MASS3DPA-Cuda.cpp index 748f9c9e3..95c4f682a 100644 --- a/src/apps/MASS3DPA-Cuda.cpp +++ b/src/apps/MASS3DPA-Cuda.cpp @@ -131,27 +131,18 @@ void MASS3DPA::runCudaVariant(VariantID vid) { MASS3DPA_DATA_SETUP_CUDA; - using launch_policy = RAJA::expt::LaunchPolicy - >; + using launch_policy = RAJA::expt::LaunchPolicy>; - using outer_x = RAJA::expt::LoopPolicy; + using outer_x = RAJA::expt::LoopPolicy; - using inner_x = RAJA::expt::LoopPolicy; + using inner_x = 
RAJA::expt::LoopPolicy; - using inner_y = RAJA::expt::LoopPolicy; + using inner_y = RAJA::expt::LoopPolicy; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { RAJA::expt::launch( - RAJA::expt::DEVICE, RAJA::expt::Grid(RAJA::expt::Teams(NE), RAJA::expt::Threads(MPA_Q1D, MPA_Q1D, 1)), [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { diff --git a/src/apps/MASS3DPA-Hip.cpp b/src/apps/MASS3DPA-Hip.cpp index 2a3e95c12..0a3bb4373 100644 --- a/src/apps/MASS3DPA-Hip.cpp +++ b/src/apps/MASS3DPA-Hip.cpp @@ -134,27 +134,18 @@ void MASS3DPA::runHipVariant(VariantID vid) { MASS3DPA_DATA_SETUP_HIP; - using launch_policy = RAJA::expt::LaunchPolicy - >; + using launch_policy = RAJA::expt::LaunchPolicy>; - using outer_x = RAJA::expt::LoopPolicy; + using outer_x = RAJA::expt::LoopPolicy; - using inner_x = RAJA::expt::LoopPolicy; + using inner_x = RAJA::expt::LoopPolicy; - using inner_y = RAJA::expt::LoopPolicy; + using inner_y = RAJA::expt::LoopPolicy; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { RAJA::expt::launch( - RAJA::expt::DEVICE, RAJA::expt::Grid(RAJA::expt::Teams(NE), RAJA::expt::Threads(MPA_Q1D, MPA_Q1D, 1)), [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { diff --git a/src/apps/MASS3DPA-OMP.cpp b/src/apps/MASS3DPA-OMP.cpp index ca6136272..00a2085a4 100644 --- a/src/apps/MASS3DPA-OMP.cpp +++ b/src/apps/MASS3DPA-OMP.cpp @@ -99,36 +99,20 @@ void MASS3DPA::runOpenMPVariant(VariantID vid) { case RAJA_OpenMP: { //Currently Teams requires two policies if compiled with a device - using launch_policy = RAJA::expt::LaunchPolicy; + using launch_policy = RAJA::expt::LaunchPolicy; - using outer_x = RAJA::expt::LoopPolicy; + using outer_x = RAJA::expt::LoopPolicy; - using inner_x = RAJA::expt::LoopPolicy; + using inner_x = RAJA::expt::LoopPolicy; - using inner_y = RAJA::expt::LoopPolicy; + using inner_y = RAJA::expt::LoopPolicy; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { //Grid is empty as the host does not need a compute grid to be specified RAJA::expt::launch( - RAJA::expt::HOST, RAJA::expt::Grid(), + RAJA::expt::Grid(), [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NE), diff --git a/src/apps/MASS3DPA-Seq.cpp b/src/apps/MASS3DPA-Seq.cpp index 47177bb5f..b4e3238d3 100644 --- a/src/apps/MASS3DPA-Seq.cpp +++ b/src/apps/MASS3DPA-Seq.cpp @@ -97,35 +97,19 @@ void MASS3DPA::runSeqVariant(VariantID vid) { case RAJA_Seq: { //Currently Teams requires two policies if compiled with a device - using launch_policy = RAJA::expt::LaunchPolicy; - - using outer_x = RAJA::expt::LoopPolicy; - - using inner_x = RAJA::expt::LoopPolicy; - - using inner_y = RAJA::expt::LoopPolicy; + using launch_policy = RAJA::expt::LaunchPolicy; + + using outer_x = RAJA::expt::LoopPolicy; + + using inner_x = RAJA::expt::LoopPolicy; + + using inner_y = RAJA::expt::LoopPolicy; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { RAJA::expt::launch( - RAJA::expt::HOST, RAJA::expt::Grid(), + RAJA::expt::Grid(), [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NE), diff --git a/src/apps/MASS3DPA.hpp b/src/apps/MASS3DPA.hpp index be0df8a3e..a374eb9b7 100644 --- a/src/apps/MASS3DPA.hpp +++ b/src/apps/MASS3DPA.hpp @@ -339,20 +339,6 @@ for (int qz = 0; qz < MPA_Q1D; ++qz) { \ } -#if defined(RAJA_ENABLE_CUDA) - using m3d_device_launch = RAJA::expt::cuda_launch_t; - using m3d_gpu_block_x_policy = RAJA::cuda_block_x_direct; - using m3d_gpu_thread_x_policy = 
RAJA::cuda_thread_x_loop; - using m3d_gpu_thread_y_policy = RAJA::cuda_thread_y_loop; -#endif - -#if defined(RAJA_ENABLE_HIP) - using m3d_device_launch = RAJA::expt::hip_launch_t; - using m3d_gpu_block_x_policy = RAJA::hip_block_x_direct; - using m3d_gpu_thread_x_policy = RAJA::hip_thread_x_loop; - using m3d_gpu_thread_y_policy = RAJA::hip_thread_y_loop; -#endif - namespace rajaperf { class RunParams; diff --git a/src/basic/MAT_MAT_SHARED-Cuda.cpp b/src/basic/MAT_MAT_SHARED-Cuda.cpp index d104b0a72..a9a755f75 100644 --- a/src/basic/MAT_MAT_SHARED-Cuda.cpp +++ b/src/basic/MAT_MAT_SHARED-Cuda.cpp @@ -194,32 +194,20 @@ void MAT_MAT_SHARED::runCudaVariant(VariantID vid) { MAT_MAT_SHARED_DATA_SETUP_CUDA; - using launch_policy = RAJA::expt::LaunchPolicy - >; + using launch_policy = RAJA::expt::LaunchPolicy>; - using teams_x = RAJA::expt::LoopPolicy; + using teams_x = RAJA::expt::LoopPolicy; - using teams_y = RAJA::expt::LoopPolicy; + using teams_y = RAJA::expt::LoopPolicy; - using threads_x = RAJA::expt::LoopPolicy; - - using threads_y = RAJA::expt::LoopPolicy; + using threads_x = RAJA::expt::LoopPolicy; + using threads_y = RAJA::expt::LoopPolicy; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { RAJA::expt::launch( - RAJA::expt::DEVICE, RAJA::expt::Grid(RAJA::expt::Teams(Nx, Ny), RAJA::expt::Threads(TL_SZ, TL_SZ)), [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { diff --git a/src/basic/MAT_MAT_SHARED-Hip.cpp b/src/basic/MAT_MAT_SHARED-Hip.cpp index dc5667597..45620f8f0 100644 --- a/src/basic/MAT_MAT_SHARED-Hip.cpp +++ b/src/basic/MAT_MAT_SHARED-Hip.cpp @@ -199,31 +199,20 @@ void MAT_MAT_SHARED::runHipVariant(VariantID vid) { MAT_MAT_SHARED_DATA_SETUP_HIP; - using launch_policy = RAJA::expt::LaunchPolicy - >; + using launch_policy = RAJA::expt::LaunchPolicy>; - using teams_x = RAJA::expt::LoopPolicy; + using teams_x = RAJA::expt::LoopPolicy; - using teams_y = RAJA::expt::LoopPolicy; + using teams_y = RAJA::expt::LoopPolicy; - using threads_x = RAJA::expt::LoopPolicy; + using threads_x = RAJA::expt::LoopPolicy; - using threads_y = RAJA::expt::LoopPolicy; + using threads_y = RAJA::expt::LoopPolicy; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { RAJA::expt::launch( - RAJA::expt::DEVICE, RAJA::expt::Grid(RAJA::expt::Teams(Nx, Ny), RAJA::expt::Threads(TL_SZ, TL_SZ)), [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { diff --git a/src/basic/MAT_MAT_SHARED-OMP.cpp b/src/basic/MAT_MAT_SHARED-OMP.cpp index e745a5a5e..09a5501c9 100644 --- a/src/basic/MAT_MAT_SHARED-OMP.cpp +++ b/src/basic/MAT_MAT_SHARED-OMP.cpp @@ -159,41 +159,21 @@ void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid) { case RAJA_OpenMP: { //Currently Teams requires two policies if compiled with a device - using launch_policy = RAJA::expt::LaunchPolicy; + using launch_policy = RAJA::expt::LaunchPolicy; - using outer_x = RAJA::expt::LoopPolicy; + using outer_x = RAJA::expt::LoopPolicy; - using outer_y = RAJA::expt::LoopPolicy; + using outer_y = RAJA::expt::LoopPolicy; - using inner_x = RAJA::expt::LoopPolicy; + using inner_x = RAJA::expt::LoopPolicy; - using inner_y = RAJA::expt::LoopPolicy; + using inner_y = RAJA::expt::LoopPolicy; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { //Grid is empty as the host does not need a compute grid to be specified - RAJA::expt::launch(RAJA::expt::HOST, RAJA::expt::Grid(), + RAJA::expt::launch(RAJA::expt::Grid(), [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Ny), diff --git 
a/src/basic/MAT_MAT_SHARED-Seq.cpp b/src/basic/MAT_MAT_SHARED-Seq.cpp index b15b4f018..89cafd6dd 100644 --- a/src/basic/MAT_MAT_SHARED-Seq.cpp +++ b/src/basic/MAT_MAT_SHARED-Seq.cpp @@ -155,42 +155,21 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid) { case RAJA_Seq: { - //Currently Teams requires two policies if compiled with a device - using launch_policy = RAJA::expt::LaunchPolicy; - - using outer_x = RAJA::expt::LoopPolicy; - - using outer_y = RAJA::expt::LoopPolicy; - - using inner_x = RAJA::expt::LoopPolicy; - - using inner_y = RAJA::expt::LoopPolicy; + using launch_policy = RAJA::expt::LaunchPolicy; + + using outer_x = RAJA::expt::LoopPolicy; + + using outer_y = RAJA::expt::LoopPolicy; + + using inner_x = RAJA::expt::LoopPolicy; + + using inner_y = RAJA::expt::LoopPolicy; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { //Grid is empty as the host does not need a compute grid to be specified - RAJA::expt::launch(RAJA::expt::HOST, RAJA::expt::Grid(), + RAJA::expt::launch(RAJA::expt::Grid(), [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Ny), diff --git a/src/basic/MAT_MAT_SHARED.hpp b/src/basic/MAT_MAT_SHARED.hpp index 95b799eb8..71cbbf7b1 100644 --- a/src/basic/MAT_MAT_SHARED.hpp +++ b/src/basic/MAT_MAT_SHARED.hpp @@ -118,22 +118,6 @@ constexpr rajaperf::Index_type TL_SZ = 16; if (Row < N && Col < N) \ C[Col + N * Row] = Cs[ty][tx]; -#if defined(RAJA_ENABLE_CUDA) - using mms_device_launch = RAJA::expt::cuda_launch_t; - using mms_gpu_block_x_policy = RAJA::cuda_block_x_direct; - using mms_gpu_block_y_policy = RAJA::cuda_block_y_direct; - using mms_gpu_thread_x_policy = RAJA::cuda_thread_x_direct; - using mms_gpu_thread_y_policy = RAJA::cuda_thread_y_direct; -#endif - -#if defined(RAJA_ENABLE_HIP) - using mms_device_launch = RAJA::expt::hip_launch_t; - using mms_gpu_block_x_policy = RAJA::hip_block_x_direct; - using mms_gpu_block_y_policy = RAJA::hip_block_y_direct; - using mms_gpu_thread_x_policy = RAJA::hip_thread_x_direct; - using mms_gpu_thread_y_policy = RAJA::hip_thread_y_direct; -#endif - namespace rajaperf { class RunParams; From 9b888700d964b967cc2aba46c5ca09c06c811cb1 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Fri, 18 Mar 2022 11:43:23 -0700 Subject: [PATCH 281/392] add async variable for clarity --- src/apps/DIFFUSION3DPA-Cuda.cpp | 4 +++- src/apps/DIFFUSION3DPA-Hip.cpp | 4 +++- src/apps/MASS3DPA-Cuda.cpp | 4 +++- src/apps/MASS3DPA-Hip.cpp | 4 +++- src/basic/MAT_MAT_SHARED-Cuda.cpp | 4 +++- src/basic/MAT_MAT_SHARED-Hip.cpp | 4 +++- 6 files changed, 18 insertions(+), 6 deletions(-) diff --git a/src/apps/DIFFUSION3DPA-Cuda.cpp b/src/apps/DIFFUSION3DPA-Cuda.cpp index 00fa54e22..1d0b99f03 100644 --- a/src/apps/DIFFUSION3DPA-Cuda.cpp +++ b/src/apps/DIFFUSION3DPA-Cuda.cpp @@ -150,8 +150,10 @@ void DIFFUSION3DPA::runCudaVariant(VariantID vid) { DIFFUSION3DPA_DATA_SETUP_CUDA; + constexpr bool async = true; + using launch_policy = - RAJA::expt::LaunchPolicy>; + RAJA::expt::LaunchPolicy>; using outer_x = RAJA::expt::LoopPolicy; diff --git a/src/apps/DIFFUSION3DPA-Hip.cpp b/src/apps/DIFFUSION3DPA-Hip.cpp index 228f91c3e..1d38b0b91 100644 --- a/src/apps/DIFFUSION3DPA-Hip.cpp +++ b/src/apps/DIFFUSION3DPA-Hip.cpp @@ -151,8 +151,10 @@ void DIFFUSION3DPA::runHipVariant(VariantID vid) { DIFFUSION3DPA_DATA_SETUP_HIP; + constexpr bool async = true; + using launch_policy = - RAJA::expt::LaunchPolicy>; + RAJA::expt::LaunchPolicy>; using outer_x = RAJA::expt::LoopPolicy; diff --git a/src/apps/MASS3DPA-Cuda.cpp 
b/src/apps/MASS3DPA-Cuda.cpp index 95c4f682a..8c33d7c17 100644 --- a/src/apps/MASS3DPA-Cuda.cpp +++ b/src/apps/MASS3DPA-Cuda.cpp @@ -131,7 +131,9 @@ void MASS3DPA::runCudaVariant(VariantID vid) { MASS3DPA_DATA_SETUP_CUDA; - using launch_policy = RAJA::expt::LaunchPolicy>; + constexpr bool async = true; + + using launch_policy = RAJA::expt::LaunchPolicy>; using outer_x = RAJA::expt::LoopPolicy; diff --git a/src/apps/MASS3DPA-Hip.cpp b/src/apps/MASS3DPA-Hip.cpp index 0a3bb4373..919a7e7b6 100644 --- a/src/apps/MASS3DPA-Hip.cpp +++ b/src/apps/MASS3DPA-Hip.cpp @@ -134,7 +134,9 @@ void MASS3DPA::runHipVariant(VariantID vid) { MASS3DPA_DATA_SETUP_HIP; - using launch_policy = RAJA::expt::LaunchPolicy>; + constexpr bool async = true; + + using launch_policy = RAJA::expt::LaunchPolicy>; using outer_x = RAJA::expt::LoopPolicy; diff --git a/src/basic/MAT_MAT_SHARED-Cuda.cpp b/src/basic/MAT_MAT_SHARED-Cuda.cpp index a9a755f75..75757edd3 100644 --- a/src/basic/MAT_MAT_SHARED-Cuda.cpp +++ b/src/basic/MAT_MAT_SHARED-Cuda.cpp @@ -194,7 +194,9 @@ void MAT_MAT_SHARED::runCudaVariant(VariantID vid) { MAT_MAT_SHARED_DATA_SETUP_CUDA; - using launch_policy = RAJA::expt::LaunchPolicy>; + constexpr bool async = true; + + using launch_policy = RAJA::expt::LaunchPolicy>; using teams_x = RAJA::expt::LoopPolicy; diff --git a/src/basic/MAT_MAT_SHARED-Hip.cpp b/src/basic/MAT_MAT_SHARED-Hip.cpp index 45620f8f0..f2e546672 100644 --- a/src/basic/MAT_MAT_SHARED-Hip.cpp +++ b/src/basic/MAT_MAT_SHARED-Hip.cpp @@ -199,7 +199,9 @@ void MAT_MAT_SHARED::runHipVariant(VariantID vid) { MAT_MAT_SHARED_DATA_SETUP_HIP; - using launch_policy = RAJA::expt::LaunchPolicy>; + constexpr bool async = true; + + using launch_policy = RAJA::expt::LaunchPolicy>; using teams_x = RAJA::expt::LoopPolicy; From f5d8e569f8e53a29483c53d8bc3bb41ece5279b9 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 29 Mar 2022 11:25:15 -0700 Subject: [PATCH 282/392] Use macro to de-duplicate boilerplate Implement gpu block size helper methods via a macro. 
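The shared helper itself lives in src/common/GPUUtils.hpp (the 26 added lines in the diffstat below are not reproduced in this excerpt). As a rough sketch of the intent only — inferred from the hand-written boilerplate removed in the hunks that follow, not the exact definition, and with the decltype(block_size)::value forwarding being an assumption about how the compile-time block size reaches the Impl template — the macro stamps out the run*Variant / set*TuningDefinitions pair that each GPU kernel previously duplicated:

#define RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(KERNEL, VARIANT)   \
  void KERNEL::run##VARIANT##Variant(VariantID vid, size_t tune_idx)         \
  {                                                                          \
    size_t t = 0;                                                            \
    seq_for(gpu_block_sizes_type{}, [&](auto block_size) {                   \
      if (run_params.numValidGPUBlockSize() == 0u ||                         \
          run_params.validGPUBlockSize(block_size)) {                        \
        if (tune_idx == t) {                                                 \
          /* block_size is an integral-constant object; its ::value is the */\
          /* compile-time thread-block size selected for this tuning */      \
          run##VARIANT##VariantImpl<decltype(block_size)::value>(vid);       \
        }                                                                    \
        t += 1;                                                              \
      }                                                                      \
    });                                                                      \
  }                                                                          \
                                                                             \
  void KERNEL::set##VARIANT##TuningDefinitions(VariantID vid)                \
  {                                                                          \
    seq_for(gpu_block_sizes_type{}, [&](auto block_size) {                   \
      if (run_params.numValidGPUBlockSize() == 0u ||                         \
          run_params.validGPUBlockSize(block_size)) {                        \
        addVariantTuningName(vid, "block_"+std::to_string(block_size));      \
      }                                                                      \
    });                                                                      \
  }

Each kernel source file then replaces its two hand-written methods with a single invocation, e.g. RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(DEL_DOT_VEC_2D, Cuda), as shown in the hunks below.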
--- src/apps/DEL_DOT_VEC_2D-Cuda.cpp | 24 +---------------- src/apps/DEL_DOT_VEC_2D-Hip.cpp | 24 +---------------- src/apps/ENERGY-Cuda.cpp | 24 +---------------- src/apps/ENERGY-Hip.cpp | 24 +---------------- src/apps/FIR-Cuda.cpp | 24 +---------------- src/apps/FIR-Hip.cpp | 24 +---------------- src/apps/HALOEXCHANGE-Cuda.cpp | 24 +---------------- src/apps/HALOEXCHANGE-Hip.cpp | 24 +---------------- src/apps/HALOEXCHANGE_FUSED-Cuda.cpp | 24 +---------------- src/apps/HALOEXCHANGE_FUSED-Hip.cpp | 24 +---------------- src/apps/LTIMES-Cuda.cpp | 24 +---------------- src/apps/LTIMES-Hip.cpp | 24 +---------------- src/apps/LTIMES_NOVIEW-Cuda.cpp | 24 +---------------- src/apps/LTIMES_NOVIEW-Hip.cpp | 24 +---------------- src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp | 24 +---------------- src/apps/NODAL_ACCUMULATION_3D-Hip.cpp | 24 +---------------- src/apps/PRESSURE-Cuda.cpp | 24 +---------------- src/apps/PRESSURE-Hip.cpp | 24 +---------------- src/apps/VOL3D-Cuda.cpp | 24 +---------------- src/apps/VOL3D-Hip.cpp | 24 +---------------- src/basic/DAXPY-Cuda.cpp | 24 +---------------- src/basic/DAXPY-Hip.cpp | 24 +---------------- src/basic/DAXPY_ATOMIC-Cuda.cpp | 24 +---------------- src/basic/DAXPY_ATOMIC-Hip.cpp | 24 +---------------- src/basic/IF_QUAD-Cuda.cpp | 24 +---------------- src/basic/IF_QUAD-Hip.cpp | 24 +---------------- src/basic/INIT3-Cuda.cpp | 24 +---------------- src/basic/INIT3-Hip.cpp | 24 +---------------- src/basic/INIT_VIEW1D-Cuda.cpp | 24 +---------------- src/basic/INIT_VIEW1D-Hip.cpp | 24 +---------------- src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp | 24 +---------------- src/basic/INIT_VIEW1D_OFFSET-Hip.cpp | 24 +---------------- src/basic/MAT_MAT_SHARED-Cuda.cpp | 24 +---------------- src/basic/MAT_MAT_SHARED-Hip.cpp | 24 +---------------- src/basic/MULADDSUB-Cuda.cpp | 24 +---------------- src/basic/MULADDSUB-Hip.cpp | 24 +---------------- src/basic/NESTED_INIT-Cuda.cpp | 24 +---------------- src/basic/NESTED_INIT-Hip.cpp | 24 +---------------- src/basic/PI_ATOMIC-Cuda.cpp | 24 +---------------- src/basic/PI_ATOMIC-Hip.cpp | 24 +---------------- src/basic/PI_REDUCE-Cuda.cpp | 24 +---------------- src/basic/PI_REDUCE-Hip.cpp | 24 +---------------- src/basic/REDUCE3_INT-Cuda.cpp | 24 +---------------- src/basic/REDUCE3_INT-Hip.cpp | 24 +---------------- src/basic/TRAP_INT-Cuda.cpp | 24 +---------------- src/basic/TRAP_INT-Hip.cpp | 24 +---------------- src/common/GPUUtils.hpp | 26 +++++++++++++++++++ src/lcals/DIFF_PREDICT-Cuda.cpp | 24 +---------------- src/lcals/DIFF_PREDICT-Hip.cpp | 24 +---------------- src/lcals/EOS-Cuda.cpp | 24 +---------------- src/lcals/EOS-Hip.cpp | 24 +---------------- src/lcals/FIRST_DIFF-Cuda.cpp | 24 +---------------- src/lcals/FIRST_DIFF-Hip.cpp | 24 +---------------- src/lcals/FIRST_MIN-Cuda.cpp | 24 +---------------- src/lcals/FIRST_MIN-Hip.cpp | 24 +---------------- src/lcals/FIRST_SUM-Cuda.cpp | 24 +---------------- src/lcals/FIRST_SUM-Hip.cpp | 24 +---------------- src/lcals/GEN_LIN_RECUR-Cuda.cpp | 24 +---------------- src/lcals/GEN_LIN_RECUR-Hip.cpp | 24 +---------------- src/lcals/HYDRO_1D-Cuda.cpp | 24 +---------------- src/lcals/HYDRO_1D-Hip.cpp | 24 +---------------- src/lcals/HYDRO_2D-Cuda.cpp | 24 +---------------- src/lcals/HYDRO_2D-Hip.cpp | 24 +---------------- src/lcals/INT_PREDICT-Cuda.cpp | 24 +---------------- src/lcals/INT_PREDICT-Hip.cpp | 24 +---------------- src/lcals/PLANCKIAN-Cuda.cpp | 24 +---------------- src/lcals/PLANCKIAN-Hip.cpp | 24 +---------------- src/lcals/TRIDIAG_ELIM-Cuda.cpp | 24 +---------------- 
src/lcals/TRIDIAG_ELIM-Hip.cpp | 24 +---------------- src/polybench/POLYBENCH_2MM-Cuda.cpp | 24 +---------------- src/polybench/POLYBENCH_2MM-Hip.cpp | 24 +---------------- src/polybench/POLYBENCH_3MM-Cuda.cpp | 24 +---------------- src/polybench/POLYBENCH_3MM-Hip.cpp | 24 +---------------- src/polybench/POLYBENCH_ADI-Cuda.cpp | 24 +---------------- src/polybench/POLYBENCH_ADI-Hip.cpp | 24 +---------------- src/polybench/POLYBENCH_ATAX-Cuda.cpp | 24 +---------------- src/polybench/POLYBENCH_ATAX-Hip.cpp | 24 +---------------- src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp | 24 +---------------- src/polybench/POLYBENCH_FDTD_2D-Hip.cpp | 24 +---------------- .../POLYBENCH_FLOYD_WARSHALL-Cuda.cpp | 24 +---------------- .../POLYBENCH_FLOYD_WARSHALL-Hip.cpp | 24 +---------------- src/polybench/POLYBENCH_GEMM-Cuda.cpp | 24 +---------------- src/polybench/POLYBENCH_GEMM-Hip.cpp | 24 +---------------- src/polybench/POLYBENCH_GEMVER-Cuda.cpp | 24 +---------------- src/polybench/POLYBENCH_GEMVER-Hip.cpp | 24 +---------------- src/polybench/POLYBENCH_GESUMMV-Cuda.cpp | 24 +---------------- src/polybench/POLYBENCH_GESUMMV-Hip.cpp | 24 +---------------- src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp | 24 +---------------- src/polybench/POLYBENCH_HEAT_3D-Hip.cpp | 24 +---------------- src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp | 24 +---------------- src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp | 24 +---------------- src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp | 24 +---------------- src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp | 24 +---------------- src/polybench/POLYBENCH_MVT-Cuda.cpp | 24 +---------------- src/polybench/POLYBENCH_MVT-Hip.cpp | 24 +---------------- src/stream/ADD-Cuda.cpp | 24 +---------------- src/stream/ADD-Hip.cpp | 24 +---------------- src/stream/COPY-Cuda.cpp | 24 +---------------- src/stream/COPY-Hip.cpp | 24 +---------------- src/stream/DOT-Cuda.cpp | 24 +---------------- src/stream/DOT-Hip.cpp | 24 +---------------- src/stream/MUL-Cuda.cpp | 24 +---------------- src/stream/MUL-Hip.cpp | 24 +---------------- src/stream/TRIAD-Cuda.cpp | 24 +---------------- src/stream/TRIAD-Hip.cpp | 24 +---------------- 105 files changed, 130 insertions(+), 2392 deletions(-) diff --git a/src/apps/DEL_DOT_VEC_2D-Cuda.cpp b/src/apps/DEL_DOT_VEC_2D-Cuda.cpp index cf059127a..c19d0770e 100644 --- a/src/apps/DEL_DOT_VEC_2D-Cuda.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Cuda.cpp @@ -162,29 +162,7 @@ void DEL_DOT_VEC_2D::runCudaVariantImpl(VariantID vid) } } -void DEL_DOT_VEC_2D::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void DEL_DOT_VEC_2D::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(DEL_DOT_VEC_2D, Cuda) } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/DEL_DOT_VEC_2D-Hip.cpp b/src/apps/DEL_DOT_VEC_2D-Hip.cpp index 24bdb0345..782e4099c 100644 --- a/src/apps/DEL_DOT_VEC_2D-Hip.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Hip.cpp @@ -164,29 +164,7 @@ void DEL_DOT_VEC_2D::runHipVariantImpl(VariantID vid) } } -void DEL_DOT_VEC_2D::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 
0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void DEL_DOT_VEC_2D::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(DEL_DOT_VEC_2D, Hip) } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/ENERGY-Cuda.cpp b/src/apps/ENERGY-Cuda.cpp index 2d334e4f1..a99a928e3 100644 --- a/src/apps/ENERGY-Cuda.cpp +++ b/src/apps/ENERGY-Cuda.cpp @@ -268,29 +268,7 @@ void ENERGY::runCudaVariantImpl(VariantID vid) } } -void ENERGY::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void ENERGY::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(ENERGY, Cuda) } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/ENERGY-Hip.cpp b/src/apps/ENERGY-Hip.cpp index 1c3fcd2d3..e7e882cff 100644 --- a/src/apps/ENERGY-Hip.cpp +++ b/src/apps/ENERGY-Hip.cpp @@ -262,29 +262,7 @@ void ENERGY::runHipVariantImpl(VariantID vid) } } -void ENERGY::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void ENERGY::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(ENERGY, Hip) } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/FIR-Cuda.cpp b/src/apps/FIR-Cuda.cpp index bba80e60a..4dea7c82e 100644 --- a/src/apps/FIR-Cuda.cpp +++ b/src/apps/FIR-Cuda.cpp @@ -147,29 +147,7 @@ void FIR::runCudaVariantImpl(VariantID vid) } } -void FIR::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void FIR::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(FIR, Cuda) } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/FIR-Hip.cpp b/src/apps/FIR-Hip.cpp index 32d65bdf1..42e3503e0 
100644 --- a/src/apps/FIR-Hip.cpp +++ b/src/apps/FIR-Hip.cpp @@ -147,29 +147,7 @@ void FIR::runHipVariantImpl(VariantID vid) } } -void FIR::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void FIR::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(FIR, Hip) } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE-Cuda.cpp b/src/apps/HALOEXCHANGE-Cuda.cpp index a297d1885..cab4f911d 100644 --- a/src/apps/HALOEXCHANGE-Cuda.cpp +++ b/src/apps/HALOEXCHANGE-Cuda.cpp @@ -169,29 +169,7 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) } } -void HALOEXCHANGE::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void HALOEXCHANGE::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(HALOEXCHANGE, Cuda) } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE-Hip.cpp b/src/apps/HALOEXCHANGE-Hip.cpp index a13f36619..4070edc72 100644 --- a/src/apps/HALOEXCHANGE-Hip.cpp +++ b/src/apps/HALOEXCHANGE-Hip.cpp @@ -171,29 +171,7 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid) } } -void HALOEXCHANGE::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void HALOEXCHANGE::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(HALOEXCHANGE, Hip) } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp index 0aad4f3d0..52d1fca5c 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp @@ -270,29 +270,7 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) } } -void HALOEXCHANGE_FUSED::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void HALOEXCHANGE_FUSED::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto 
block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(HALOEXCHANGE_FUSED, Cuda) } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp index d9809d37d..7d64d86f7 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp @@ -273,29 +273,7 @@ void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) } } -void HALOEXCHANGE_FUSED::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void HALOEXCHANGE_FUSED::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(HALOEXCHANGE_FUSED, Hip) } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/LTIMES-Cuda.cpp b/src/apps/LTIMES-Cuda.cpp index e54d76a13..4e38f769b 100644 --- a/src/apps/LTIMES-Cuda.cpp +++ b/src/apps/LTIMES-Cuda.cpp @@ -188,29 +188,7 @@ void LTIMES::runCudaVariantImpl(VariantID vid) } } -void LTIMES::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void LTIMES::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(LTIMES, Cuda) } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/LTIMES-Hip.cpp b/src/apps/LTIMES-Hip.cpp index 6a8c72917..a78394d25 100644 --- a/src/apps/LTIMES-Hip.cpp +++ b/src/apps/LTIMES-Hip.cpp @@ -190,29 +190,7 @@ void LTIMES::runHipVariantImpl(VariantID vid) } } -void LTIMES::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void LTIMES::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(LTIMES, Hip) } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/LTIMES_NOVIEW-Cuda.cpp b/src/apps/LTIMES_NOVIEW-Cuda.cpp index 769ede624..b363f0049 100644 --- a/src/apps/LTIMES_NOVIEW-Cuda.cpp +++ b/src/apps/LTIMES_NOVIEW-Cuda.cpp @@ -186,29 +186,7 @@ void LTIMES_NOVIEW::runCudaVariantImpl(VariantID vid) } } -void 
LTIMES_NOVIEW::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void LTIMES_NOVIEW::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(LTIMES_NOVIEW, Cuda) } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/LTIMES_NOVIEW-Hip.cpp b/src/apps/LTIMES_NOVIEW-Hip.cpp index ec9256ae0..47a8c8956 100644 --- a/src/apps/LTIMES_NOVIEW-Hip.cpp +++ b/src/apps/LTIMES_NOVIEW-Hip.cpp @@ -189,29 +189,7 @@ void LTIMES_NOVIEW::runHipVariantImpl(VariantID vid) } } -void LTIMES_NOVIEW::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void LTIMES_NOVIEW::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(LTIMES_NOVIEW, Hip) } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp b/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp index b4a2fc771..e8aadcb2b 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp @@ -113,29 +113,7 @@ void NODAL_ACCUMULATION_3D::runCudaVariantImpl(VariantID vid) } } -void NODAL_ACCUMULATION_3D::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void NODAL_ACCUMULATION_3D::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(NODAL_ACCUMULATION_3D, Cuda) } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp b/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp index c6d8a14a9..09cea6211 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp @@ -113,29 +113,7 @@ void NODAL_ACCUMULATION_3D::runHipVariantImpl(VariantID vid) } } -void NODAL_ACCUMULATION_3D::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void NODAL_ACCUMULATION_3D::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - 
if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(NODAL_ACCUMULATION_3D, Hip) } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/PRESSURE-Cuda.cpp b/src/apps/PRESSURE-Cuda.cpp index 7fc9fc33a..14ad2ae34 100644 --- a/src/apps/PRESSURE-Cuda.cpp +++ b/src/apps/PRESSURE-Cuda.cpp @@ -136,29 +136,7 @@ void PRESSURE::runCudaVariantImpl(VariantID vid) } } -void PRESSURE::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void PRESSURE::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(PRESSURE, Cuda) } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/PRESSURE-Hip.cpp b/src/apps/PRESSURE-Hip.cpp index 8eb9abbf1..03c9e04fb 100644 --- a/src/apps/PRESSURE-Hip.cpp +++ b/src/apps/PRESSURE-Hip.cpp @@ -129,29 +129,7 @@ void PRESSURE::runHipVariantImpl(VariantID vid) } } -void PRESSURE::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void PRESSURE::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(PRESSURE, Hip) } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/VOL3D-Cuda.cpp b/src/apps/VOL3D-Cuda.cpp index 8e0ddf01d..3f65c1b8a 100644 --- a/src/apps/VOL3D-Cuda.cpp +++ b/src/apps/VOL3D-Cuda.cpp @@ -123,29 +123,7 @@ void VOL3D::runCudaVariantImpl(VariantID vid) } } -void VOL3D::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void VOL3D::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(VOL3D, Cuda) } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/VOL3D-Hip.cpp b/src/apps/VOL3D-Hip.cpp index 551a472db..70f121e09 100644 --- a/src/apps/VOL3D-Hip.cpp +++ b/src/apps/VOL3D-Hip.cpp @@ -123,29 +123,7 @@ void VOL3D::runHipVariantImpl(VariantID vid) } } -void VOL3D::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if 
(run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void VOL3D::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(VOL3D, Hip) } // end namespace apps } // end namespace rajaperf diff --git a/src/basic/DAXPY-Cuda.cpp b/src/basic/DAXPY-Cuda.cpp index ae0f72712..a87421c4f 100644 --- a/src/basic/DAXPY-Cuda.cpp +++ b/src/basic/DAXPY-Cuda.cpp @@ -110,29 +110,7 @@ void DAXPY::runCudaVariantImpl(VariantID vid) } } -void DAXPY::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void DAXPY::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(DAXPY, Cuda) } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/DAXPY-Hip.cpp b/src/basic/DAXPY-Hip.cpp index 71f80fba3..25810c19e 100644 --- a/src/basic/DAXPY-Hip.cpp +++ b/src/basic/DAXPY-Hip.cpp @@ -113,29 +113,7 @@ void DAXPY::runHipVariantImpl(VariantID vid) } } -void DAXPY::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void DAXPY::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(DAXPY, Hip) } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/DAXPY_ATOMIC-Cuda.cpp b/src/basic/DAXPY_ATOMIC-Cuda.cpp index 7cbf371c2..1e8210bd2 100644 --- a/src/basic/DAXPY_ATOMIC-Cuda.cpp +++ b/src/basic/DAXPY_ATOMIC-Cuda.cpp @@ -110,29 +110,7 @@ void DAXPY_ATOMIC::runCudaVariantImpl(VariantID vid) } } -void DAXPY_ATOMIC::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void DAXPY_ATOMIC::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(DAXPY_ATOMIC, Cuda) } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/DAXPY_ATOMIC-Hip.cpp b/src/basic/DAXPY_ATOMIC-Hip.cpp index 
ff914387b..a1e7a6465 100644 --- a/src/basic/DAXPY_ATOMIC-Hip.cpp +++ b/src/basic/DAXPY_ATOMIC-Hip.cpp @@ -112,29 +112,7 @@ void DAXPY_ATOMIC::runHipVariantImpl(VariantID vid) } } -void DAXPY_ATOMIC::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void DAXPY_ATOMIC::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(DAXPY_ATOMIC, Hip) } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/IF_QUAD-Cuda.cpp b/src/basic/IF_QUAD-Cuda.cpp index d2724913d..66146371c 100644 --- a/src/basic/IF_QUAD-Cuda.cpp +++ b/src/basic/IF_QUAD-Cuda.cpp @@ -117,29 +117,7 @@ void IF_QUAD::runCudaVariantImpl(VariantID vid) } } -void IF_QUAD::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void IF_QUAD::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(IF_QUAD, Cuda) } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/IF_QUAD-Hip.cpp b/src/basic/IF_QUAD-Hip.cpp index 1d0015a17..6ded209a9 100644 --- a/src/basic/IF_QUAD-Hip.cpp +++ b/src/basic/IF_QUAD-Hip.cpp @@ -120,29 +120,7 @@ void IF_QUAD::runHipVariantImpl(VariantID vid) } } -void IF_QUAD::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void IF_QUAD::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(IF_QUAD, Hip) } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INIT3-Cuda.cpp b/src/basic/INIT3-Cuda.cpp index 3faa015fb..212a1e3a2 100644 --- a/src/basic/INIT3-Cuda.cpp +++ b/src/basic/INIT3-Cuda.cpp @@ -119,29 +119,7 @@ void INIT3::runCudaVariantImpl(VariantID vid) } } -void INIT3::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void INIT3::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - 
run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(INIT3, Cuda) } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INIT3-Hip.cpp b/src/basic/INIT3-Hip.cpp index 2a575e9ec..af3276a7d 100644 --- a/src/basic/INIT3-Hip.cpp +++ b/src/basic/INIT3-Hip.cpp @@ -121,29 +121,7 @@ void INIT3::runHipVariantImpl(VariantID vid) } } -void INIT3::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void INIT3::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(INIT3, Hip) } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INIT_VIEW1D-Cuda.cpp b/src/basic/INIT_VIEW1D-Cuda.cpp index 6e8665ea2..be7a0bf97 100644 --- a/src/basic/INIT_VIEW1D-Cuda.cpp +++ b/src/basic/INIT_VIEW1D-Cuda.cpp @@ -110,29 +110,7 @@ void INIT_VIEW1D::runCudaVariantImpl(VariantID vid) } } -void INIT_VIEW1D::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void INIT_VIEW1D::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(INIT_VIEW1D, Cuda) } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INIT_VIEW1D-Hip.cpp b/src/basic/INIT_VIEW1D-Hip.cpp index ea46befeb..6f9d41924 100644 --- a/src/basic/INIT_VIEW1D-Hip.cpp +++ b/src/basic/INIT_VIEW1D-Hip.cpp @@ -113,29 +113,7 @@ void INIT_VIEW1D::runHipVariantImpl(VariantID vid) } } -void INIT_VIEW1D::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void INIT_VIEW1D::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(INIT_VIEW1D, Hip) } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp b/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp index c64c1f391..2f7f6d34a 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp @@ -113,29 +113,7 @@ void INIT_VIEW1D_OFFSET::runCudaVariantImpl(VariantID vid) } } -void INIT_VIEW1D_OFFSET::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - 
seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void INIT_VIEW1D_OFFSET::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(INIT_VIEW1D_OFFSET, Cuda) } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp index 299c293e6..ae98f56ab 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp @@ -114,29 +114,7 @@ void INIT_VIEW1D_OFFSET::runHipVariantImpl(VariantID vid) } } -void INIT_VIEW1D_OFFSET::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void INIT_VIEW1D_OFFSET::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(INIT_VIEW1D_OFFSET, Hip) } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/MAT_MAT_SHARED-Cuda.cpp b/src/basic/MAT_MAT_SHARED-Cuda.cpp index b20ab61f0..c28a51ec1 100644 --- a/src/basic/MAT_MAT_SHARED-Cuda.cpp +++ b/src/basic/MAT_MAT_SHARED-Cuda.cpp @@ -305,29 +305,7 @@ void MAT_MAT_SHARED::runCudaVariantImpl(VariantID vid) } } -void MAT_MAT_SHARED::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void MAT_MAT_SHARED::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MAT_MAT_SHARED, Cuda) } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/MAT_MAT_SHARED-Hip.cpp b/src/basic/MAT_MAT_SHARED-Hip.cpp index 2054efde0..d829914b5 100644 --- a/src/basic/MAT_MAT_SHARED-Hip.cpp +++ b/src/basic/MAT_MAT_SHARED-Hip.cpp @@ -308,29 +308,7 @@ void MAT_MAT_SHARED::runHipVariantImpl(VariantID vid) } } -void MAT_MAT_SHARED::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void MAT_MAT_SHARED::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, 
"block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MAT_MAT_SHARED, Hip) } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/MULADDSUB-Cuda.cpp b/src/basic/MULADDSUB-Cuda.cpp index a61585806..3d8254c07 100644 --- a/src/basic/MULADDSUB-Cuda.cpp +++ b/src/basic/MULADDSUB-Cuda.cpp @@ -119,29 +119,7 @@ void MULADDSUB::runCudaVariantImpl(VariantID vid) } } -void MULADDSUB::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void MULADDSUB::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MULADDSUB, Cuda) } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/MULADDSUB-Hip.cpp b/src/basic/MULADDSUB-Hip.cpp index 4cc0bbfa7..cb9076b38 100644 --- a/src/basic/MULADDSUB-Hip.cpp +++ b/src/basic/MULADDSUB-Hip.cpp @@ -121,29 +121,7 @@ void MULADDSUB::runHipVariantImpl(VariantID vid) } } -void MULADDSUB::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void MULADDSUB::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MULADDSUB, Hip) } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/NESTED_INIT-Cuda.cpp b/src/basic/NESTED_INIT-Cuda.cpp index a51c0d563..7528c5cec 100644 --- a/src/basic/NESTED_INIT-Cuda.cpp +++ b/src/basic/NESTED_INIT-Cuda.cpp @@ -172,29 +172,7 @@ void NESTED_INIT::runCudaVariantImpl(VariantID vid) } } -void NESTED_INIT::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void NESTED_INIT::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(NESTED_INIT, Cuda) } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/NESTED_INIT-Hip.cpp b/src/basic/NESTED_INIT-Hip.cpp index 9667a0622..49c050f6f 100644 --- a/src/basic/NESTED_INIT-Hip.cpp +++ b/src/basic/NESTED_INIT-Hip.cpp @@ -174,29 +174,7 @@ void NESTED_INIT::runHipVariantImpl(VariantID vid) } } -void NESTED_INIT::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() 
== 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void NESTED_INIT::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(NESTED_INIT, Hip) } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/PI_ATOMIC-Cuda.cpp b/src/basic/PI_ATOMIC-Cuda.cpp index fa619d246..6f28f8c2a 100644 --- a/src/basic/PI_ATOMIC-Cuda.cpp +++ b/src/basic/PI_ATOMIC-Cuda.cpp @@ -125,29 +125,7 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) } } -void PI_ATOMIC::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void PI_ATOMIC::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(PI_ATOMIC, Cuda) } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/PI_ATOMIC-Hip.cpp b/src/basic/PI_ATOMIC-Hip.cpp index 85f08df66..605696676 100644 --- a/src/basic/PI_ATOMIC-Hip.cpp +++ b/src/basic/PI_ATOMIC-Hip.cpp @@ -127,29 +127,7 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) } } -void PI_ATOMIC::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void PI_ATOMIC::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(PI_ATOMIC, Hip) } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/PI_REDUCE-Cuda.cpp b/src/basic/PI_REDUCE-Cuda.cpp index 28c0c470f..80c8fd3b4 100644 --- a/src/basic/PI_REDUCE-Cuda.cpp +++ b/src/basic/PI_REDUCE-Cuda.cpp @@ -117,29 +117,7 @@ void PI_REDUCE::runCudaVariantImpl(VariantID vid) } } -void PI_REDUCE::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void PI_REDUCE::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(PI_REDUCE, Cuda) } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/PI_REDUCE-Hip.cpp b/src/basic/PI_REDUCE-Hip.cpp index 
908517140..bb34ed37e 100644 --- a/src/basic/PI_REDUCE-Hip.cpp +++ b/src/basic/PI_REDUCE-Hip.cpp @@ -116,29 +116,7 @@ void PI_REDUCE::runHipVariantImpl(VariantID vid) } } -void PI_REDUCE::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void PI_REDUCE::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(PI_REDUCE, Hip) } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE3_INT-Cuda.cpp b/src/basic/REDUCE3_INT-Cuda.cpp index 8940eb405..0e7c645e7 100644 --- a/src/basic/REDUCE3_INT-Cuda.cpp +++ b/src/basic/REDUCE3_INT-Cuda.cpp @@ -161,29 +161,7 @@ void REDUCE3_INT::runCudaVariantImpl(VariantID vid) } } -void REDUCE3_INT::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void REDUCE3_INT::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(REDUCE3_INT, Cuda) } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE3_INT-Hip.cpp b/src/basic/REDUCE3_INT-Hip.cpp index 9b730bbcd..8e92cb123 100644 --- a/src/basic/REDUCE3_INT-Hip.cpp +++ b/src/basic/REDUCE3_INT-Hip.cpp @@ -161,29 +161,7 @@ void REDUCE3_INT::runHipVariantImpl(VariantID vid) } } -void REDUCE3_INT::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void REDUCE3_INT::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(REDUCE3_INT, Hip) } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/TRAP_INT-Cuda.cpp b/src/basic/TRAP_INT-Cuda.cpp index 4d3d74012..d2845cbfd 100644 --- a/src/basic/TRAP_INT-Cuda.cpp +++ b/src/basic/TRAP_INT-Cuda.cpp @@ -151,29 +151,7 @@ void TRAP_INT::runCudaVariantImpl(VariantID vid) } } -void TRAP_INT::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void TRAP_INT::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if 
(run_params.numValidGPUBlockSize() == 0u ||
-        run_params.validGPUBlockSize(block_size)) {
-      addVariantTuningName(vid, "block_"+std::to_string(block_size));
-    }
-  });
-}
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(TRAP_INT, Cuda)
 } // end namespace basic
 } // end namespace rajaperf
diff --git a/src/basic/TRAP_INT-Hip.cpp b/src/basic/TRAP_INT-Hip.cpp
index 506028dbd..63101962f 100644
--- a/src/basic/TRAP_INT-Hip.cpp
+++ b/src/basic/TRAP_INT-Hip.cpp
@@ -150,29 +150,7 @@ void TRAP_INT::runHipVariantImpl(VariantID vid)
 }
 }
-void TRAP_INT::runHipVariant(VariantID vid, size_t tune_idx)
-{
-  size_t t = 0;
-  seq_for(gpu_block_sizes_type{}, [&](auto block_size) {
-    if (run_params.numValidGPUBlockSize() == 0u ||
-        run_params.validGPUBlockSize(block_size)) {
-      if (tune_idx == t) {
-        runHipVariantImpl(vid);
-      }
-      t += 1;
-    }
-  });
-}
-
-void TRAP_INT::setHipTuningDefinitions(VariantID vid)
-{
-  seq_for(gpu_block_sizes_type{}, [&](auto block_size) {
-    if (run_params.numValidGPUBlockSize() == 0u ||
-        run_params.validGPUBlockSize(block_size)) {
-      addVariantTuningName(vid, "block_"+std::to_string(block_size));
-    }
-  });
-}
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(TRAP_INT, Hip)
 } // end namespace basic
 } // end namespace rajaperf
diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp
index 41db9e3fb..97acbba0e 100644
--- a/src/common/GPUUtils.hpp
+++ b/src/common/GPUUtils.hpp
@@ -165,4 +165,30 @@ inline void seq_for(camp::int_seq const&, Func&& func)
 } // closing brace for rajaperf namespace
+//
+#define RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(kernel, variant) \
+  void kernel::run##variant##Variant(VariantID vid, size_t tune_idx) \
+  { \
+    size_t t = 0; \
+    seq_for(gpu_block_sizes_type{}, [&](auto block_size) { \
+      if (run_params.numValidGPUBlockSize() == 0u || \
+          run_params.validGPUBlockSize(block_size)) { \
+        if (tune_idx == t) { \
+          run##variant##VariantImpl(vid); \
+        } \
+        t += 1; \
+      } \
+    }); \
+  } \
+  \
+  void kernel::set##variant##TuningDefinitions(VariantID vid) \
+  { \
+    seq_for(gpu_block_sizes_type{}, [&](auto block_size) { \
+      if (run_params.numValidGPUBlockSize() == 0u || \
+          run_params.validGPUBlockSize(block_size)) { \
+        addVariantTuningName(vid, "block_"+std::to_string(block_size)); \
+      } \
+    }); \
+  }
+
 #endif // closing endif for header file include guard
diff --git a/src/lcals/DIFF_PREDICT-Cuda.cpp b/src/lcals/DIFF_PREDICT-Cuda.cpp
index df3b6b7f2..0ef286507 100644
--- a/src/lcals/DIFF_PREDICT-Cuda.cpp
+++ b/src/lcals/DIFF_PREDICT-Cuda.cpp
@@ -92,29 +92,7 @@ void DIFF_PREDICT::runCudaVariantImpl(VariantID vid)
 }
 }
-void DIFF_PREDICT::runCudaVariant(VariantID vid, size_t tune_idx)
-{
-  size_t t = 0;
-  seq_for(gpu_block_sizes_type{}, [&](auto block_size) {
-    if (run_params.numValidGPUBlockSize() == 0u ||
-        run_params.validGPUBlockSize(block_size)) {
-      if (tune_idx == t) {
-        runCudaVariantImpl(vid);
-      }
-      t += 1;
-    }
-  });
-}
-
-void DIFF_PREDICT::setCudaTuningDefinitions(VariantID vid)
-{
-  seq_for(gpu_block_sizes_type{}, [&](auto block_size) {
-    if (run_params.numValidGPUBlockSize() == 0u ||
-        run_params.validGPUBlockSize(block_size)) {
-      addVariantTuningName(vid, "block_"+std::to_string(block_size));
-    }
-  });
-}
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(DIFF_PREDICT, Cuda)
 } // end namespace lcals
 } // end namespace rajaperf
diff --git a/src/lcals/DIFF_PREDICT-Hip.cpp b/src/lcals/DIFF_PREDICT-Hip.cpp
index ce14aa340..4f076157d 100644
--- a/src/lcals/DIFF_PREDICT-Hip.cpp
+++ b/src/lcals/DIFF_PREDICT-Hip.cpp
@@ -92,29 +92,7 @@ void
DIFF_PREDICT::runHipVariantImpl(VariantID vid) } } -void DIFF_PREDICT::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void DIFF_PREDICT::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(DIFF_PREDICT, Hip) } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/EOS-Cuda.cpp b/src/lcals/EOS-Cuda.cpp index deefa8c07..f99828d46 100644 --- a/src/lcals/EOS-Cuda.cpp +++ b/src/lcals/EOS-Cuda.cpp @@ -96,29 +96,7 @@ void EOS::runCudaVariantImpl(VariantID vid) } } -void EOS::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void EOS::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(EOS, Cuda) } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/EOS-Hip.cpp b/src/lcals/EOS-Hip.cpp index 58e9bbd61..0912ce5ce 100644 --- a/src/lcals/EOS-Hip.cpp +++ b/src/lcals/EOS-Hip.cpp @@ -96,29 +96,7 @@ void EOS::runHipVariantImpl(VariantID vid) } } -void EOS::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void EOS::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(EOS, Hip) } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_DIFF-Cuda.cpp b/src/lcals/FIRST_DIFF-Cuda.cpp index 4f375c2f4..b195c0d46 100644 --- a/src/lcals/FIRST_DIFF-Cuda.cpp +++ b/src/lcals/FIRST_DIFF-Cuda.cpp @@ -90,29 +90,7 @@ void FIRST_DIFF::runCudaVariantImpl(VariantID vid) } } -void FIRST_DIFF::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void FIRST_DIFF::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(FIRST_DIFF, 
Cuda) } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_DIFF-Hip.cpp b/src/lcals/FIRST_DIFF-Hip.cpp index 0f6161bbc..382fa107e 100644 --- a/src/lcals/FIRST_DIFF-Hip.cpp +++ b/src/lcals/FIRST_DIFF-Hip.cpp @@ -90,29 +90,7 @@ void FIRST_DIFF::runHipVariantImpl(VariantID vid) } } -void FIRST_DIFF::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void FIRST_DIFF::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(FIRST_DIFF, Hip) } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_MIN-Cuda.cpp b/src/lcals/FIRST_MIN-Cuda.cpp index 1dc35a255..f98982860 100644 --- a/src/lcals/FIRST_MIN-Cuda.cpp +++ b/src/lcals/FIRST_MIN-Cuda.cpp @@ -130,29 +130,7 @@ void FIRST_MIN::runCudaVariantImpl(VariantID vid) } } -void FIRST_MIN::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void FIRST_MIN::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(FIRST_MIN, Cuda) } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_MIN-Hip.cpp b/src/lcals/FIRST_MIN-Hip.cpp index 1c09f9cb0..e2b2763cf 100644 --- a/src/lcals/FIRST_MIN-Hip.cpp +++ b/src/lcals/FIRST_MIN-Hip.cpp @@ -130,29 +130,7 @@ void FIRST_MIN::runHipVariantImpl(VariantID vid) } } -void FIRST_MIN::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void FIRST_MIN::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(FIRST_MIN, Hip) } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_SUM-Cuda.cpp b/src/lcals/FIRST_SUM-Cuda.cpp index 5b968c221..85db3d39d 100644 --- a/src/lcals/FIRST_SUM-Cuda.cpp +++ b/src/lcals/FIRST_SUM-Cuda.cpp @@ -90,29 +90,7 @@ void FIRST_SUM::runCudaVariantImpl(VariantID vid) } } -void FIRST_SUM::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void 
FIRST_SUM::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(FIRST_SUM, Cuda) } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_SUM-Hip.cpp b/src/lcals/FIRST_SUM-Hip.cpp index 01c3eeabf..1a03619e5 100644 --- a/src/lcals/FIRST_SUM-Hip.cpp +++ b/src/lcals/FIRST_SUM-Hip.cpp @@ -90,29 +90,7 @@ void FIRST_SUM::runHipVariantImpl(VariantID vid) } } -void FIRST_SUM::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void FIRST_SUM::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(FIRST_SUM, Hip) } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/GEN_LIN_RECUR-Cuda.cpp b/src/lcals/GEN_LIN_RECUR-Cuda.cpp index 9bec47e46..76f840294 100644 --- a/src/lcals/GEN_LIN_RECUR-Cuda.cpp +++ b/src/lcals/GEN_LIN_RECUR-Cuda.cpp @@ -119,29 +119,7 @@ void GEN_LIN_RECUR::runCudaVariantImpl(VariantID vid) } } -void GEN_LIN_RECUR::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void GEN_LIN_RECUR::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(GEN_LIN_RECUR, Cuda) } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/GEN_LIN_RECUR-Hip.cpp b/src/lcals/GEN_LIN_RECUR-Hip.cpp index 3faaec08f..65fef4e8b 100644 --- a/src/lcals/GEN_LIN_RECUR-Hip.cpp +++ b/src/lcals/GEN_LIN_RECUR-Hip.cpp @@ -121,29 +121,7 @@ void GEN_LIN_RECUR::runHipVariantImpl(VariantID vid) } } -void GEN_LIN_RECUR::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void GEN_LIN_RECUR::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(GEN_LIN_RECUR, Hip) } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/HYDRO_1D-Cuda.cpp b/src/lcals/HYDRO_1D-Cuda.cpp index c1a85361d..901ca786b 100644 --- a/src/lcals/HYDRO_1D-Cuda.cpp +++ 
b/src/lcals/HYDRO_1D-Cuda.cpp @@ -94,29 +94,7 @@ void HYDRO_1D::runCudaVariantImpl(VariantID vid) } } -void HYDRO_1D::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void HYDRO_1D::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(HYDRO_1D, Cuda) } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/HYDRO_1D-Hip.cpp b/src/lcals/HYDRO_1D-Hip.cpp index 391c49a93..d39ec0f7e 100644 --- a/src/lcals/HYDRO_1D-Hip.cpp +++ b/src/lcals/HYDRO_1D-Hip.cpp @@ -94,29 +94,7 @@ void HYDRO_1D::runHipVariantImpl(VariantID vid) } } -void HYDRO_1D::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void HYDRO_1D::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(HYDRO_1D, Hip) } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/HYDRO_2D-Cuda.cpp b/src/lcals/HYDRO_2D-Cuda.cpp index 30acca046..21c320a60 100644 --- a/src/lcals/HYDRO_2D-Cuda.cpp +++ b/src/lcals/HYDRO_2D-Cuda.cpp @@ -221,29 +221,7 @@ void HYDRO_2D::runCudaVariantImpl(VariantID vid) } } -void HYDRO_2D::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void HYDRO_2D::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(HYDRO_2D, Cuda) } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/HYDRO_2D-Hip.cpp b/src/lcals/HYDRO_2D-Hip.cpp index 1e86920dc..3180c5c10 100644 --- a/src/lcals/HYDRO_2D-Hip.cpp +++ b/src/lcals/HYDRO_2D-Hip.cpp @@ -223,29 +223,7 @@ void HYDRO_2D::runHipVariantImpl(VariantID vid) } } -void HYDRO_2D::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void HYDRO_2D::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, 
"block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(HYDRO_2D, Hip) } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/INT_PREDICT-Cuda.cpp b/src/lcals/INT_PREDICT-Cuda.cpp index 7e6e76158..aaed2219e 100644 --- a/src/lcals/INT_PREDICT-Cuda.cpp +++ b/src/lcals/INT_PREDICT-Cuda.cpp @@ -95,29 +95,7 @@ void INT_PREDICT::runCudaVariantImpl(VariantID vid) } } -void INT_PREDICT::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void INT_PREDICT::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(INT_PREDICT, Cuda) } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/INT_PREDICT-Hip.cpp b/src/lcals/INT_PREDICT-Hip.cpp index af5a7f3f6..22914bff3 100644 --- a/src/lcals/INT_PREDICT-Hip.cpp +++ b/src/lcals/INT_PREDICT-Hip.cpp @@ -95,29 +95,7 @@ void INT_PREDICT::runHipVariantImpl(VariantID vid) } } -void INT_PREDICT::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void INT_PREDICT::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(INT_PREDICT, Hip) } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/PLANCKIAN-Cuda.cpp b/src/lcals/PLANCKIAN-Cuda.cpp index 574e37d1a..c831aab2e 100644 --- a/src/lcals/PLANCKIAN-Cuda.cpp +++ b/src/lcals/PLANCKIAN-Cuda.cpp @@ -99,29 +99,7 @@ void PLANCKIAN::runCudaVariantImpl(VariantID vid) } } -void PLANCKIAN::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void PLANCKIAN::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(PLANCKIAN, Cuda) } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/PLANCKIAN-Hip.cpp b/src/lcals/PLANCKIAN-Hip.cpp index cb784480e..1b8c6050b 100644 --- a/src/lcals/PLANCKIAN-Hip.cpp +++ b/src/lcals/PLANCKIAN-Hip.cpp @@ -99,29 +99,7 @@ void PLANCKIAN::runHipVariantImpl(VariantID vid) } } -void PLANCKIAN::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || 
- run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void PLANCKIAN::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(PLANCKIAN, Hip) } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/TRIDIAG_ELIM-Cuda.cpp b/src/lcals/TRIDIAG_ELIM-Cuda.cpp index 3d72d7d9a..654d027a9 100644 --- a/src/lcals/TRIDIAG_ELIM-Cuda.cpp +++ b/src/lcals/TRIDIAG_ELIM-Cuda.cpp @@ -95,29 +95,7 @@ void TRIDIAG_ELIM::runCudaVariantImpl(VariantID vid) } } -void TRIDIAG_ELIM::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void TRIDIAG_ELIM::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(TRIDIAG_ELIM, Cuda) } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/TRIDIAG_ELIM-Hip.cpp b/src/lcals/TRIDIAG_ELIM-Hip.cpp index 867c1fefa..dab19cc07 100644 --- a/src/lcals/TRIDIAG_ELIM-Hip.cpp +++ b/src/lcals/TRIDIAG_ELIM-Hip.cpp @@ -94,29 +94,7 @@ void TRIDIAG_ELIM::runHipVariantImpl(VariantID vid) } } -void TRIDIAG_ELIM::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void TRIDIAG_ELIM::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(TRIDIAG_ELIM, Hip) } // end namespace lcals } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_2MM-Cuda.cpp b/src/polybench/POLYBENCH_2MM-Cuda.cpp index ae33e8c12..40b1f5ca3 100644 --- a/src/polybench/POLYBENCH_2MM-Cuda.cpp +++ b/src/polybench/POLYBENCH_2MM-Cuda.cpp @@ -275,29 +275,7 @@ void POLYBENCH_2MM::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_2MM::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void POLYBENCH_2MM::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_2MM, Cuda) } // end namespace polybench } // end namespace rajaperf 
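Each one-line replacement above relies on the RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE macro added to src/common/GPUUtils.hpp in this patch. As a rough sketch of what a single invocation generates, the POLYBENCH_2MM/Cuda case just above would expand to approximately the following (indentation and comments are added here for illustration only; the expansion mirrors the per-kernel functions being deleted, so behavior is unchanged):

// Approximate expansion of
// RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_2MM, Cuda)

// Walk the compile-time list of candidate GPU block sizes; the tune_idx-th
// valid size selects which tuning of the kernel implementation is run.
void POLYBENCH_2MM::runCudaVariant(VariantID vid, size_t tune_idx)
{
  size_t t = 0;
  seq_for(gpu_block_sizes_type{}, [&](auto block_size) {
    if (run_params.numValidGPUBlockSize() == 0u ||
        run_params.validGPUBlockSize(block_size)) {
      if (tune_idx == t) {
        runCudaVariantImpl(vid);
      }
      t += 1;
    }
  });
}

// Register one tuning name per valid block size, e.g. "block_256".
void POLYBENCH_2MM::setCudaTuningDefinitions(VariantID vid)
{
  seq_for(gpu_block_sizes_type{}, [&](auto block_size) {
    if (run_params.numValidGPUBlockSize() == 0u ||
        run_params.validGPUBlockSize(block_size)) {
      addVariantTuningName(vid, "block_"+std::to_string(block_size));
    }
  });
}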
diff --git a/src/polybench/POLYBENCH_2MM-Hip.cpp b/src/polybench/POLYBENCH_2MM-Hip.cpp index 2bf6f1912..15ffa80df 100644 --- a/src/polybench/POLYBENCH_2MM-Hip.cpp +++ b/src/polybench/POLYBENCH_2MM-Hip.cpp @@ -278,29 +278,7 @@ void POLYBENCH_2MM::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_2MM::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void POLYBENCH_2MM::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_2MM, Hip) } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_3MM-Cuda.cpp b/src/polybench/POLYBENCH_3MM-Cuda.cpp index e81105c91..f9b151ebf 100644 --- a/src/polybench/POLYBENCH_3MM-Cuda.cpp +++ b/src/polybench/POLYBENCH_3MM-Cuda.cpp @@ -353,29 +353,7 @@ void POLYBENCH_3MM::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_3MM::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void POLYBENCH_3MM::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_3MM, Cuda) } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_3MM-Hip.cpp b/src/polybench/POLYBENCH_3MM-Hip.cpp index 9318fa363..4199f0c44 100644 --- a/src/polybench/POLYBENCH_3MM-Hip.cpp +++ b/src/polybench/POLYBENCH_3MM-Hip.cpp @@ -358,29 +358,7 @@ void POLYBENCH_3MM::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_3MM::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void POLYBENCH_3MM::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_3MM, Hip) } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_ADI-Cuda.cpp b/src/polybench/POLYBENCH_ADI-Cuda.cpp index d2c913cf8..a4f92f213 100644 --- a/src/polybench/POLYBENCH_ADI-Cuda.cpp +++ b/src/polybench/POLYBENCH_ADI-Cuda.cpp @@ -248,29 +248,7 @@ void POLYBENCH_ADI::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_ADI::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - 
run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void POLYBENCH_ADI::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_ADI, Cuda) } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_ADI-Hip.cpp b/src/polybench/POLYBENCH_ADI-Hip.cpp index cbc8ea5cb..f87ec84f1 100644 --- a/src/polybench/POLYBENCH_ADI-Hip.cpp +++ b/src/polybench/POLYBENCH_ADI-Hip.cpp @@ -257,29 +257,7 @@ void POLYBENCH_ADI::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_ADI::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void POLYBENCH_ADI::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_ADI, Hip) } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_ATAX-Cuda.cpp b/src/polybench/POLYBENCH_ATAX-Cuda.cpp index 02a623129..66b0d3218 100644 --- a/src/polybench/POLYBENCH_ATAX-Cuda.cpp +++ b/src/polybench/POLYBENCH_ATAX-Cuda.cpp @@ -230,29 +230,7 @@ void POLYBENCH_ATAX::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_ATAX::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void POLYBENCH_ATAX::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_ATAX, Cuda) } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_ATAX-Hip.cpp b/src/polybench/POLYBENCH_ATAX-Hip.cpp index 780ff4616..8e1078c89 100644 --- a/src/polybench/POLYBENCH_ATAX-Hip.cpp +++ b/src/polybench/POLYBENCH_ATAX-Hip.cpp @@ -237,29 +237,7 @@ void POLYBENCH_ATAX::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_ATAX::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void POLYBENCH_ATAX::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} 
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_ATAX, Hip) } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp b/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp index 5657674da..6b4e8c636 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp @@ -320,29 +320,7 @@ void POLYBENCH_FDTD_2D::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_FDTD_2D::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void POLYBENCH_FDTD_2D::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_FDTD_2D, Cuda) } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp b/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp index d6012a66f..0ca25f1e0 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp @@ -332,29 +332,7 @@ void POLYBENCH_FDTD_2D::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_FDTD_2D::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void POLYBENCH_FDTD_2D::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_FDTD_2D, Hip) } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp index be2e3ccda..30e9a54b4 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp @@ -179,29 +179,7 @@ void POLYBENCH_FLOYD_WARSHALL::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_FLOYD_WARSHALL::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void POLYBENCH_FLOYD_WARSHALL::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_FLOYD_WARSHALL, Cuda) } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp index 5d9999a72..99b8ea303 100644 --- 
a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp @@ -183,29 +183,7 @@ void POLYBENCH_FLOYD_WARSHALL::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_FLOYD_WARSHALL::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void POLYBENCH_FLOYD_WARSHALL::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_FLOYD_WARSHALL, Hip) } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_GEMM-Cuda.cpp b/src/polybench/POLYBENCH_GEMM-Cuda.cpp index e5818f3e3..5101ebc00 100644 --- a/src/polybench/POLYBENCH_GEMM-Cuda.cpp +++ b/src/polybench/POLYBENCH_GEMM-Cuda.cpp @@ -205,29 +205,7 @@ void POLYBENCH_GEMM::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_GEMM::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void POLYBENCH_GEMM::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_GEMM, Cuda) } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_GEMM-Hip.cpp b/src/polybench/POLYBENCH_GEMM-Hip.cpp index 4d99c5b3a..ed2c7fcff 100644 --- a/src/polybench/POLYBENCH_GEMM-Hip.cpp +++ b/src/polybench/POLYBENCH_GEMM-Hip.cpp @@ -206,29 +206,7 @@ void POLYBENCH_GEMM::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_GEMM::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void POLYBENCH_GEMM::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_GEMM, Hip) } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_GEMVER-Cuda.cpp b/src/polybench/POLYBENCH_GEMVER-Cuda.cpp index aaf7f765f..652bbf761 100644 --- a/src/polybench/POLYBENCH_GEMVER-Cuda.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Cuda.cpp @@ -342,29 +342,7 @@ void POLYBENCH_GEMVER::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_GEMVER::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - 
run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void POLYBENCH_GEMVER::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_GEMVER, Cuda) } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_GEMVER-Hip.cpp b/src/polybench/POLYBENCH_GEMVER-Hip.cpp index efaeebd0f..943958e31 100644 --- a/src/polybench/POLYBENCH_GEMVER-Hip.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Hip.cpp @@ -350,29 +350,7 @@ void POLYBENCH_GEMVER::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_GEMVER::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void POLYBENCH_GEMVER::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_GEMVER, Hip) } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp b/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp index c29a5fa3a..535e24efa 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp @@ -137,29 +137,7 @@ void POLYBENCH_GESUMMV::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_GESUMMV::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void POLYBENCH_GESUMMV::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_GESUMMV, Cuda) } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_GESUMMV-Hip.cpp b/src/polybench/POLYBENCH_GESUMMV-Hip.cpp index 642b20ee4..ee39f9c6e 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Hip.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Hip.cpp @@ -139,29 +139,7 @@ void POLYBENCH_GESUMMV::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_GESUMMV::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void POLYBENCH_GESUMMV::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, 
"block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_GESUMMV, Hip) } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp b/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp index 6670f7289..ce6e7769e 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp @@ -221,29 +221,7 @@ void POLYBENCH_HEAT_3D::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_HEAT_3D::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void POLYBENCH_HEAT_3D::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_HEAT_3D, Cuda) } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp b/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp index e6cdc193e..00e68aebd 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp @@ -227,29 +227,7 @@ void POLYBENCH_HEAT_3D::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_HEAT_3D::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void POLYBENCH_HEAT_3D::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_HEAT_3D, Hip) } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp index d5643e929..a48e70a84 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp @@ -120,29 +120,7 @@ void POLYBENCH_JACOBI_1D::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_JACOBI_1D::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void POLYBENCH_JACOBI_1D::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_JACOBI_1D, Cuda) } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp index 346801ca6..a5ff60dfc 100644 --- 
a/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp @@ -122,29 +122,7 @@ void POLYBENCH_JACOBI_1D::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_JACOBI_1D::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void POLYBENCH_JACOBI_1D::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_JACOBI_1D, Hip) } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp index a4ed964cb..ca6a485ec 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp @@ -211,29 +211,7 @@ void POLYBENCH_JACOBI_2D::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_JACOBI_2D::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void POLYBENCH_JACOBI_2D::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_JACOBI_2D, Cuda) } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp index 4755e3a59..bf03f9b86 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp @@ -217,29 +217,7 @@ void POLYBENCH_JACOBI_2D::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_JACOBI_2D::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void POLYBENCH_JACOBI_2D::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_JACOBI_2D, Hip) } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_MVT-Cuda.cpp b/src/polybench/POLYBENCH_MVT-Cuda.cpp index 11195118d..2795cadbb 100644 --- a/src/polybench/POLYBENCH_MVT-Cuda.cpp +++ b/src/polybench/POLYBENCH_MVT-Cuda.cpp @@ -178,29 +178,7 @@ void POLYBENCH_MVT::runCudaVariantImpl(VariantID vid) } } -void POLYBENCH_MVT::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 
0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void POLYBENCH_MVT::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_MVT, Cuda) } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_MVT-Hip.cpp b/src/polybench/POLYBENCH_MVT-Hip.cpp index 40defe841..176c41710 100644 --- a/src/polybench/POLYBENCH_MVT-Hip.cpp +++ b/src/polybench/POLYBENCH_MVT-Hip.cpp @@ -176,29 +176,7 @@ void POLYBENCH_MVT::runHipVariantImpl(VariantID vid) } } -void POLYBENCH_MVT::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void POLYBENCH_MVT::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(POLYBENCH_MVT, Hip) } // end namespace polybench } // end namespace rajaperf diff --git a/src/stream/ADD-Cuda.cpp b/src/stream/ADD-Cuda.cpp index d32f4c904..102774a13 100644 --- a/src/stream/ADD-Cuda.cpp +++ b/src/stream/ADD-Cuda.cpp @@ -111,29 +111,7 @@ void ADD::runCudaVariantImpl(VariantID vid) } } -void ADD::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void ADD::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(ADD, Cuda) } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/ADD-Hip.cpp b/src/stream/ADD-Hip.cpp index 28cf6d9c9..5e53500c8 100644 --- a/src/stream/ADD-Hip.cpp +++ b/src/stream/ADD-Hip.cpp @@ -113,29 +113,7 @@ void ADD::runHipVariantImpl(VariantID vid) } } -void ADD::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void ADD::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(ADD, Hip) } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/COPY-Cuda.cpp b/src/stream/COPY-Cuda.cpp index b4763b6aa..cddf986ac 100644 --- 
a/src/stream/COPY-Cuda.cpp +++ b/src/stream/COPY-Cuda.cpp @@ -109,29 +109,7 @@ void COPY::runCudaVariantImpl(VariantID vid) } } -void COPY::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void COPY::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(COPY, Cuda) } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/COPY-Hip.cpp b/src/stream/COPY-Hip.cpp index 4ea444f63..fe302a7fc 100644 --- a/src/stream/COPY-Hip.cpp +++ b/src/stream/COPY-Hip.cpp @@ -111,29 +111,7 @@ void COPY::runHipVariantImpl(VariantID vid) } } -void COPY::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void COPY::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(COPY, Hip) } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/DOT-Cuda.cpp b/src/stream/DOT-Cuda.cpp index 431668bff..de23c290b 100644 --- a/src/stream/DOT-Cuda.cpp +++ b/src/stream/DOT-Cuda.cpp @@ -130,29 +130,7 @@ void DOT::runCudaVariantImpl(VariantID vid) } } -void DOT::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void DOT::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(DOT, Cuda) } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/DOT-Hip.cpp b/src/stream/DOT-Hip.cpp index f587099a5..3e75e64ef 100644 --- a/src/stream/DOT-Hip.cpp +++ b/src/stream/DOT-Hip.cpp @@ -132,29 +132,7 @@ void DOT::runHipVariantImpl(VariantID vid) } } -void DOT::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void DOT::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} 
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(DOT, Hip) } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/MUL-Cuda.cpp b/src/stream/MUL-Cuda.cpp index 2009c5b99..8db12d087 100644 --- a/src/stream/MUL-Cuda.cpp +++ b/src/stream/MUL-Cuda.cpp @@ -109,29 +109,7 @@ void MUL::runCudaVariantImpl(VariantID vid) } } -void MUL::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void MUL::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MUL, Cuda) } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/MUL-Hip.cpp b/src/stream/MUL-Hip.cpp index 9d0dd6e1f..3e5e3f9f0 100644 --- a/src/stream/MUL-Hip.cpp +++ b/src/stream/MUL-Hip.cpp @@ -111,29 +111,7 @@ void MUL::runHipVariantImpl(VariantID vid) } } -void MUL::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void MUL::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MUL, Hip) } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/TRIAD-Cuda.cpp b/src/stream/TRIAD-Cuda.cpp index 686749df6..234683493 100644 --- a/src/stream/TRIAD-Cuda.cpp +++ b/src/stream/TRIAD-Cuda.cpp @@ -111,29 +111,7 @@ void TRIAD::runCudaVariantImpl(VariantID vid) } } -void TRIAD::runCudaVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantImpl(vid); - } - t += 1; - } - }); -} - -void TRIAD::setCudaTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(TRIAD, Cuda) } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/TRIAD-Hip.cpp b/src/stream/TRIAD-Hip.cpp index 69a9880e6..740727530 100644 --- a/src/stream/TRIAD-Hip.cpp +++ b/src/stream/TRIAD-Hip.cpp @@ -113,29 +113,7 @@ void TRIAD::runHipVariantImpl(VariantID vid) } } -void TRIAD::runHipVariant(VariantID vid, size_t tune_idx) -{ - size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantImpl(vid); - } - t += 1; - } - }); -} - -void TRIAD::setHipTuningDefinitions(VariantID vid) -{ - seq_for(gpu_block_sizes_type{}, 
[&](auto block_size) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - } - }); -} +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(TRIAD, Hip) } // end namespace stream } // end namespace rajaperf From ce85657713a4e9b269f6efce6ba7c3912251f04f Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Tue, 29 Mar 2022 11:51:49 -0700 Subject: [PATCH 283/392] fix policies in hip variant of MASS3DPA --- src/apps/MASS3DPA-Hip.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/apps/MASS3DPA-Hip.cpp b/src/apps/MASS3DPA-Hip.cpp index 919a7e7b6..8803a98f2 100644 --- a/src/apps/MASS3DPA-Hip.cpp +++ b/src/apps/MASS3DPA-Hip.cpp @@ -138,11 +138,11 @@ void MASS3DPA::runHipVariant(VariantID vid) { using launch_policy = RAJA::expt::LaunchPolicy>; - using outer_x = RAJA::expt::LoopPolicy; + using outer_x = RAJA::expt::LoopPolicy; - using inner_x = RAJA::expt::LoopPolicy; + using inner_x = RAJA::expt::LoopPolicy; - using inner_y = RAJA::expt::LoopPolicy; + using inner_y = RAJA::expt::LoopPolicy; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { From 16c8640fbd19ea9e62f3e2ab6c1cdbac0dec878e Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Thu, 31 Mar 2022 12:58:11 -0400 Subject: [PATCH 284/392] Update REDUCE_STRUCT-Cuda.cpp --- src/basic/REDUCE_STRUCT-Cuda.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index d3db1fa2c..acb2b302d 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -122,7 +122,7 @@ void REDUCE_STRUCT::runCudaVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - cudaErrchk(cudaMemset(mem, 0.0, 6*sizeof(Real_type))); + cudaErrchk(cudaMemsetAsync(mem, 0.0, 6*sizeof(Real_type))); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(particles.N, block_size); reduce_struct<<>> From ec5cef849948f822e653b0cf50ef4482afa26c7e Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Thu, 31 Mar 2022 13:24:32 -0400 Subject: [PATCH 285/392] Update REDUCE_STRUCT-Hip.cpp --- src/basic/REDUCE_STRUCT-Hip.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index 6884844b2..15d890b1b 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -122,7 +122,7 @@ void REDUCE_STRUCT::runHipVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipErrchk(hipMemset(mem, 0.0, 6*sizeof(Real_type))); + hipErrchk(hipMemsetAsync(mem, 0.0, 6*sizeof(Real_type))); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(particles.N, block_size); hipLaunchKernelGGL((reduce_struct), dim3(grid_size), dim3(block_size), 6*sizeof(Real_type)*block_size, 0, particles.x, particles.y, From f2bd5cf1cbb8576e63b56c4f5588eb9be51fa384 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 31 Mar 2022 13:49:05 -0700 Subject: [PATCH 286/392] Update kernels for block size tuning --- src/basic/INDEXLIST-Cuda.cpp | 8 +++++--- src/basic/INDEXLIST-Hip.cpp | 8 +++++--- src/basic/INDEXLIST.hpp | 28 +++++++++++++++++++--------- src/basic/INDEXLIST_3LOOP-Cuda.cpp | 23 ++++++++++++----------- src/basic/INDEXLIST_3LOOP-Hip.cpp | 23 ++++++++++++----------- src/basic/INDEXLIST_3LOOP.hpp | 28 
+++++++++++++++++++--------- 6 files changed, 72 insertions(+), 46 deletions(-) diff --git a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp index 50b243e44..fc7f3d7b8 100644 --- a/src/basic/INDEXLIST-Cuda.cpp +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -27,9 +27,8 @@ namespace basic { // - // Define thread block size for CUDA execution + // Define magic numbers for CUDA execution // - const size_t block_size = 256; const size_t warp_size = 32; const size_t items_per_thread = 15; @@ -257,7 +256,8 @@ __global__ void indexlist(Real_ptr x, } } -void INDEXLIST::runCudaVariant(VariantID vid) +template < size_t block_size > +void INDEXLIST::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -310,6 +310,8 @@ void INDEXLIST::runCudaVariant(VariantID vid) } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(INDEXLIST, Cuda) + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INDEXLIST-Hip.cpp b/src/basic/INDEXLIST-Hip.cpp index b76901608..6c1b3a7b2 100644 --- a/src/basic/INDEXLIST-Hip.cpp +++ b/src/basic/INDEXLIST-Hip.cpp @@ -27,9 +27,8 @@ namespace basic { // - // Define thread block size for HIP execution + // Define magic numbers for HIP execution // - const size_t block_size = 256; const size_t warp_size = 64; const size_t items_per_thread = 8; @@ -257,7 +256,8 @@ __global__ void indexlist(Real_ptr x, } } -void INDEXLIST::runHipVariant(VariantID vid) +template < size_t block_size > +void INDEXLIST::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -310,6 +310,8 @@ void INDEXLIST::runHipVariant(VariantID vid) } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(INDEXLIST, Hip) + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INDEXLIST.hpp b/src/basic/INDEXLIST.hpp index f9c09058f..30a60be97 100644 --- a/src/basic/INDEXLIST.hpp +++ b/src/basic/INDEXLIST.hpp @@ -51,17 +51,27 @@ class INDEXLIST : public KernelBase ~INDEXLIST(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Real_ptr m_x; Int_ptr m_list; Index_type m_len; diff --git a/src/basic/INDEXLIST_3LOOP-Cuda.cpp b/src/basic/INDEXLIST_3LOOP-Cuda.cpp index 8c9f1120e..e3c4607a9 100644 --- a/src/basic/INDEXLIST_3LOOP-Cuda.cpp +++ b/src/basic/INDEXLIST_3LOOP-Cuda.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define 
INDEXLIST_3LOOP_DATA_SETUP_CUDA \ Index_type* counts; \ allocCudaDeviceData(counts, getActualProblemSize()+1); \ @@ -40,23 +34,27 @@ namespace basic deallocCudaDeviceData(list); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void indexlist_conditional(Real_ptr x, Int_ptr list, Index_type* counts, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { counts[i] = (INDEXLIST_3LOOP_CONDITIONAL) ? 1 : 0; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void indexlist_make_list(Int_ptr list, Index_type* counts, Index_type* len, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { INDEXLIST_3LOOP_MAKE_LIST; if (i == iend-1) { @@ -66,7 +64,8 @@ __global__ void indexlist_make_list(Int_ptr list, } -void INDEXLIST_3LOOP::runCudaVariant(VariantID vid) +template < size_t block_size > +void INDEXLIST_3LOOP::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -105,7 +104,7 @@ void INDEXLIST_3LOOP::runCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - indexlist_conditional<<>>( + indexlist_conditional<<>>( x, list, counts, iend ); cudaErrchk( cudaGetLastError() ); @@ -118,7 +117,7 @@ void INDEXLIST_3LOOP::runCudaVariant(VariantID vid) scan_size, stream)); - indexlist_make_list<<>>( + indexlist_make_list<<>>( list, counts, len, iend ); cudaErrchk( cudaGetLastError() ); @@ -172,6 +171,8 @@ void INDEXLIST_3LOOP::runCudaVariant(VariantID vid) } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(INDEXLIST_3LOOP, Cuda) + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INDEXLIST_3LOOP-Hip.cpp b/src/basic/INDEXLIST_3LOOP-Hip.cpp index d44d36238..2a411709e 100644 --- a/src/basic/INDEXLIST_3LOOP-Hip.cpp +++ b/src/basic/INDEXLIST_3LOOP-Hip.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define INDEXLIST_3LOOP_DATA_SETUP_HIP \ Index_type* counts; \ allocHipDeviceData(counts, getActualProblemSize()+1); \ @@ -40,23 +34,27 @@ namespace basic deallocHipDeviceData(list); +template < size_t block_size > +__launch_bounds__(block_size) __global__ void indexlist_conditional(Real_ptr x, Int_ptr list, Index_type* counts, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { counts[i] = (INDEXLIST_3LOOP_CONDITIONAL) ? 
1 : 0; } } +template < size_t block_size > +__launch_bounds__(block_size) __global__ void indexlist_make_list(Int_ptr list, Index_type* counts, Index_type* len, Index_type iend) { - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { INDEXLIST_3LOOP_MAKE_LIST; if (i == iend-1) { @@ -66,7 +64,8 @@ __global__ void indexlist_make_list(Int_ptr list, } -void INDEXLIST_3LOOP::runHipVariant(VariantID vid) +template < size_t block_size > +void INDEXLIST_3LOOP::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -116,7 +115,7 @@ void INDEXLIST_3LOOP::runHipVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((indexlist_conditional), grid_size, block_size, 0, stream, + hipLaunchKernelGGL((indexlist_conditional), grid_size, block_size, 0, stream, x, list, counts, iend ); hipErrchk( hipGetLastError() ); @@ -140,7 +139,7 @@ void INDEXLIST_3LOOP::runHipVariant(VariantID vid) stream)); #endif - hipLaunchKernelGGL((indexlist_make_list), grid_size, block_size, 0, stream, + hipLaunchKernelGGL((indexlist_make_list), grid_size, block_size, 0, stream, list, counts, len, iend ); hipErrchk( hipGetLastError() ); @@ -194,6 +193,8 @@ void INDEXLIST_3LOOP::runHipVariant(VariantID vid) } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(INDEXLIST_3LOOP, Hip) + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INDEXLIST_3LOOP.hpp b/src/basic/INDEXLIST_3LOOP.hpp index 8974ee3bf..2939d7329 100644 --- a/src/basic/INDEXLIST_3LOOP.hpp +++ b/src/basic/INDEXLIST_3LOOP.hpp @@ -62,17 +62,27 @@ class INDEXLIST_3LOOP : public KernelBase ~INDEXLIST_3LOOP(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::list_type; + Real_ptr m_x; Int_ptr m_list; Index_type m_len; From 9b05fcfad002b2b25e3047004a7320b927632cb0 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Thu, 31 Mar 2022 15:21:01 -0700 Subject: [PATCH 287/392] sort out RAJA unroll in perf suite --- src/apps/FEM_MACROS.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/apps/FEM_MACROS.hpp b/src/apps/FEM_MACROS.hpp index db98f5ff3..474ada22b 100644 --- a/src/apps/FEM_MACROS.hpp +++ b/src/apps/FEM_MACROS.hpp @@ -8,9 +8,9 @@ #ifndef RAJAPerf_FEM_MACROS_HPP #define RAJAPerf_FEM_MACROS_HPP -#define RAJAPERF_DIRECT_PRAGMA(X) _Pragma(#X) #if 
defined(USE_RAJAPERF_UNROLL) -#define RAJAPERF_UNROLL(N) RAJAPERF_DIRECT_PRAGMA(unroll(N)) +// If enabled uses RAJA's RAJA_UNROLL_COUNT which is always on +#define RAJAPERF_UNROLL(N) RAJA_UNROLL_COUNT(N) #else #define RAJAPERF_UNROLL(N) #endif From 17e079c6bc1934513d0b7b89b4d1af49667471ee Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 31 Mar 2022 18:14:30 -0700 Subject: [PATCH 288/392] fixup executor --- src/common/Executor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index 587ea071f..7b5db6887 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -796,7 +796,7 @@ void Executor::runSuite() warmup_kernels.push_back(makeKernel()); warmup_kernels.push_back(makeKernel()); - warmup_kernels.push_back(makeKernel()); warmup_kernels.push_back(makeKernel()); warmup_kernels.push_back(makeKernel()); From 919ff4a5ab767d17b4c56d083c38b4e9e85faa9b Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 31 Mar 2022 18:14:40 -0700 Subject: [PATCH 289/392] fixup output utils --- src/common/OutputUtils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/OutputUtils.cpp b/src/common/OutputUtils.cpp index 3d02360b1..96b09c542 100644 --- a/src/common/OutputUtils.cpp +++ b/src/common/OutputUtils.cpp @@ -31,8 +31,8 @@ namespace rajaperf */ std::string recursiveMkdir(const std::string& in_path) { - int rank = 0; #ifdef RAJA_PERFSUITE_ENABLE_MPI + int rank = 0; MPI_Comm_rank(MPI_COMM_WORLD, &rank); // Processes wait for rank 0 to make the directories before proceeding From d9212886627f087498dbe03d5d58ff924fae1435 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 31 Mar 2022 18:15:45 -0700 Subject: [PATCH 290/392] Fix warning in DISSUSOIN3DPA --- src/apps/DIFFUSION3DPA-Cuda.cpp | 2 +- src/apps/DIFFUSION3DPA-Hip.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/apps/DIFFUSION3DPA-Cuda.cpp b/src/apps/DIFFUSION3DPA-Cuda.cpp index 47ddf66f9..b7dd446a6 100644 --- a/src/apps/DIFFUSION3DPA-Cuda.cpp +++ b/src/apps/DIFFUSION3DPA-Cuda.cpp @@ -120,7 +120,7 @@ __global__ void Diffusion3DPA(const Real_ptr Basis, } } -void DIFFUSION3DPA::runCudaVariant(VariantID vid, size_t tune_idx) { +void DIFFUSION3DPA::runCudaVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); DIFFUSION3DPA_DATA_SETUP; diff --git a/src/apps/DIFFUSION3DPA-Hip.cpp b/src/apps/DIFFUSION3DPA-Hip.cpp index 8dd03a3ae..8448d1957 100644 --- a/src/apps/DIFFUSION3DPA-Hip.cpp +++ b/src/apps/DIFFUSION3DPA-Hip.cpp @@ -120,7 +120,7 @@ __global__ void Diffusion3DPA(const Real_ptr Basis, } } -void DIFFUSION3DPA::runHipVariant(VariantID vid, size_t tune_idx) { +void DIFFUSION3DPA::runHipVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); DIFFUSION3DPA_DATA_SETUP; From dddc645d68046591b88a7be708de9e1df8df0e35 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 31 Mar 2022 18:20:47 -0700 Subject: [PATCH 291/392] Add tune_idx to various methods --- src/algorithm/SCAN-Cuda.cpp | 2 +- src/algorithm/SCAN-Hip.cpp | 2 +- src/algorithm/SCAN-OMP.cpp | 2 +- src/algorithm/SCAN-Seq.cpp | 2 +- src/algorithm/SCAN.cpp | 8 ++++---- src/algorithm/SCAN.hpp | 21 +++++++++------------ src/basic/INDEXLIST-OMP.cpp | 2 +- src/basic/INDEXLIST-Seq.cpp | 2 +- src/basic/INDEXLIST.cpp | 10 +++++----- src/basic/INDEXLIST_3LOOP-OMP.cpp | 2 +- src/basic/INDEXLIST_3LOOP-Seq.cpp | 2 +- src/basic/INDEXLIST_3LOOP.cpp | 10 +++++----- 12 files changed, 31 insertions(+), 34 
deletions(-) diff --git a/src/algorithm/SCAN-Cuda.cpp b/src/algorithm/SCAN-Cuda.cpp index a28f9dc26..e6819884f 100644 --- a/src/algorithm/SCAN-Cuda.cpp +++ b/src/algorithm/SCAN-Cuda.cpp @@ -40,7 +40,7 @@ namespace algorithm deallocCudaDeviceData(y); -void SCAN::runCudaVariant(VariantID vid) +void SCAN::runCudaVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/algorithm/SCAN-Hip.cpp b/src/algorithm/SCAN-Hip.cpp index c8313b68a..217f60a86 100644 --- a/src/algorithm/SCAN-Hip.cpp +++ b/src/algorithm/SCAN-Hip.cpp @@ -45,7 +45,7 @@ namespace algorithm deallocHipDeviceData(y); -void SCAN::runHipVariant(VariantID vid) +void SCAN::runHipVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/algorithm/SCAN-OMP.cpp b/src/algorithm/SCAN-OMP.cpp index 8ddec7f36..82eec6006 100644 --- a/src/algorithm/SCAN-OMP.cpp +++ b/src/algorithm/SCAN-OMP.cpp @@ -18,7 +18,7 @@ namespace rajaperf namespace algorithm { -void SCAN::runOpenMPVariant(VariantID vid) +void SCAN::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/algorithm/SCAN-Seq.cpp b/src/algorithm/SCAN-Seq.cpp index 38c546454..bc7769360 100644 --- a/src/algorithm/SCAN-Seq.cpp +++ b/src/algorithm/SCAN-Seq.cpp @@ -18,7 +18,7 @@ namespace algorithm { -void SCAN::runSeqVariant(VariantID vid) +void SCAN::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/algorithm/SCAN.cpp b/src/algorithm/SCAN.cpp index 9a309b039..47fe9d76c 100644 --- a/src/algorithm/SCAN.cpp +++ b/src/algorithm/SCAN.cpp @@ -58,18 +58,18 @@ SCAN::~SCAN() { } -void SCAN::setUp(VariantID vid) +void SCAN::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitDataRandValue(m_x, getActualProblemSize(), vid); allocAndInitDataConst(m_y, getActualProblemSize(), 0.0, vid); } -void SCAN::updateChecksum(VariantID vid) +void SCAN::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_y, getActualProblemSize(), checksum_scale_factor); + checksum[vid][tune_idx] += calcChecksum(m_y, getActualProblemSize(), checksum_scale_factor); } -void SCAN::tearDown(VariantID vid) +void SCAN::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_x); diff --git a/src/algorithm/SCAN.hpp b/src/algorithm/SCAN.hpp index 25c5556af..fe54e1673 100644 --- a/src/algorithm/SCAN.hpp +++ b/src/algorithm/SCAN.hpp @@ -52,18 +52,15 @@ class SCAN : public KernelBase ~SCAN(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); - - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid) - { - std::cout << "\n SCAN : Unknown OMP Target variant id = " << vid << std::endl; - } + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); private: Real_ptr m_x; diff --git a/src/basic/INDEXLIST-OMP.cpp 
b/src/basic/INDEXLIST-OMP.cpp index 576002da7..a1fe5888a 100644 --- a/src/basic/INDEXLIST-OMP.cpp +++ b/src/basic/INDEXLIST-OMP.cpp @@ -17,7 +17,7 @@ namespace rajaperf namespace basic { -void INDEXLIST::runOpenMPVariant(VariantID vid) +void INDEXLIST::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/INDEXLIST-Seq.cpp b/src/basic/INDEXLIST-Seq.cpp index e895ab73e..36b8f4803 100644 --- a/src/basic/INDEXLIST-Seq.cpp +++ b/src/basic/INDEXLIST-Seq.cpp @@ -18,7 +18,7 @@ namespace basic { -void INDEXLIST::runSeqVariant(VariantID vid) +void INDEXLIST::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/INDEXLIST.cpp b/src/basic/INDEXLIST.cpp index d1ce5cc77..e19a6a15f 100644 --- a/src/basic/INDEXLIST.cpp +++ b/src/basic/INDEXLIST.cpp @@ -52,20 +52,20 @@ INDEXLIST::~INDEXLIST() { } -void INDEXLIST::setUp(VariantID vid) +void INDEXLIST::setUp(VariantID vid, size_t /*tune_idx*/) { allocAndInitDataRandSign(m_x, getActualProblemSize(), vid); allocAndInitData(m_list, getActualProblemSize(), vid); m_len = -1; } -void INDEXLIST::updateChecksum(VariantID vid) +void INDEXLIST::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_list, getActualProblemSize()); - checksum[vid] += Checksum_type(m_len); + checksum[vid][tune_idx] += calcChecksum(m_list, getActualProblemSize()); + checksum[vid][tune_idx] += Checksum_type(m_len); } -void INDEXLIST::tearDown(VariantID vid) +void INDEXLIST::tearDown(VariantID vid, size_t /*tune_idx*/) { (void) vid; deallocData(m_x); diff --git a/src/basic/INDEXLIST_3LOOP-OMP.cpp b/src/basic/INDEXLIST_3LOOP-OMP.cpp index df8eec8b6..6a5ee22a8 100644 --- a/src/basic/INDEXLIST_3LOOP-OMP.cpp +++ b/src/basic/INDEXLIST_3LOOP-OMP.cpp @@ -24,7 +24,7 @@ namespace basic delete[] counts; counts = nullptr; -void INDEXLIST_3LOOP::runOpenMPVariant(VariantID vid) +void INDEXLIST_3LOOP::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/INDEXLIST_3LOOP-Seq.cpp b/src/basic/INDEXLIST_3LOOP-Seq.cpp index cfdc9dac7..d352f922b 100644 --- a/src/basic/INDEXLIST_3LOOP-Seq.cpp +++ b/src/basic/INDEXLIST_3LOOP-Seq.cpp @@ -25,7 +25,7 @@ namespace basic -void INDEXLIST_3LOOP::runSeqVariant(VariantID vid) +void INDEXLIST_3LOOP::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/INDEXLIST_3LOOP.cpp b/src/basic/INDEXLIST_3LOOP.cpp index fff81d7fe..78ab796c1 100644 --- a/src/basic/INDEXLIST_3LOOP.cpp +++ b/src/basic/INDEXLIST_3LOOP.cpp @@ -60,20 +60,20 @@ INDEXLIST_3LOOP::~INDEXLIST_3LOOP() { } -void INDEXLIST_3LOOP::setUp(VariantID vid) +void INDEXLIST_3LOOP::setUp(VariantID vid, size_t tune_idx) { allocAndInitDataRandSign(m_x, getActualProblemSize(), vid); allocAndInitData(m_list, getActualProblemSize(), vid); m_len = -1; } -void INDEXLIST_3LOOP::updateChecksum(VariantID vid) +void INDEXLIST_3LOOP::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += calcChecksum(m_list, getActualProblemSize()); - checksum[vid] += Checksum_type(m_len); + checksum[vid][tune_idx] += calcChecksum(m_list, getActualProblemSize()); + checksum[vid][tune_idx] += Checksum_type(m_len); } -void INDEXLIST_3LOOP::tearDown(VariantID vid) +void INDEXLIST_3LOOP::tearDown(VariantID vid, size_t tune_idx) { (void) vid; deallocData(m_x); From 
9b44eaacae384566a588443bfe64441380c5b523 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 31 Mar 2022 18:21:19 -0700 Subject: [PATCH 292/392] Fixup spacing --- src/basic/INDEXLIST_3LOOP.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/basic/INDEXLIST_3LOOP.hpp b/src/basic/INDEXLIST_3LOOP.hpp index 2939d7329..e81783778 100644 --- a/src/basic/INDEXLIST_3LOOP.hpp +++ b/src/basic/INDEXLIST_3LOOP.hpp @@ -36,10 +36,10 @@ Real_ptr x = m_x; \ Int_ptr list = m_list; -#define INDEXLIST_3LOOP_CONDITIONAL \ +#define INDEXLIST_3LOOP_CONDITIONAL \ x[i] < 0.0 -#define INDEXLIST_3LOOP_MAKE_LIST \ +#define INDEXLIST_3LOOP_MAKE_LIST \ if (counts[i] != counts[i+1]) { \ list[counts[i]] = i ; \ } From 190660d1cd121a01c7d35560f2405be8d3c37510 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 31 Mar 2022 18:26:28 -0700 Subject: [PATCH 293/392] Remove INDEXLIST RAJA_Seq impl Remove this case because there is no support for scan in loop in raja, so the RAJA_Seq implementation is not portable. --- src/basic/INDEXLIST-Seq.cpp | 20 -------------------- src/basic/INDEXLIST.cpp | 1 - 2 files changed, 21 deletions(-) diff --git a/src/basic/INDEXLIST-Seq.cpp b/src/basic/INDEXLIST-Seq.cpp index 36b8f4803..c9d9e6369 100644 --- a/src/basic/INDEXLIST-Seq.cpp +++ b/src/basic/INDEXLIST-Seq.cpp @@ -70,26 +70,6 @@ void INDEXLIST::runSeqVariant(VariantID vid, size_t /*tune_idx*/) break; } - - case RAJA_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - Index_type count = 0; - - RAJA::forall( RAJA::RangeSegment(ibegin, iend), - [=, &count](Index_type i) { - INDEXLIST_BODY; - }); - - m_len = count; - - } - stopTimer(); - - break; - } #endif default : { diff --git a/src/basic/INDEXLIST.cpp b/src/basic/INDEXLIST.cpp index e19a6a15f..f0c4c7d32 100644 --- a/src/basic/INDEXLIST.cpp +++ b/src/basic/INDEXLIST.cpp @@ -38,7 +38,6 @@ INDEXLIST::INDEXLIST(const RunParams& params) setVariantDefined( Base_Seq ); setVariantDefined( Lambda_Seq ); - setVariantDefined( RAJA_Seq ); setVariantDefined( Base_OpenMP ); setVariantDefined( Lambda_OpenMP ); From bcb82672ac4d81e4af7dfaac7b0a516ab9e1db0d Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 31 Mar 2022 18:27:20 -0700 Subject: [PATCH 294/392] Use iend in INDEXLIST_3LOOP --- src/basic/INDEXLIST_3LOOP-Cuda.cpp | 2 +- src/basic/INDEXLIST_3LOOP-Hip.cpp | 2 +- src/basic/INDEXLIST_3LOOP-OMP.cpp | 2 +- src/basic/INDEXLIST_3LOOP-Seq.cpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/basic/INDEXLIST_3LOOP-Cuda.cpp b/src/basic/INDEXLIST_3LOOP-Cuda.cpp index e3c4607a9..40b6db128 100644 --- a/src/basic/INDEXLIST_3LOOP-Cuda.cpp +++ b/src/basic/INDEXLIST_3LOOP-Cuda.cpp @@ -23,7 +23,7 @@ namespace basic #define INDEXLIST_3LOOP_DATA_SETUP_CUDA \ Index_type* counts; \ - allocCudaDeviceData(counts, getActualProblemSize()+1); \ + allocCudaDeviceData(counts, iend+1); \ allocAndInitCudaDeviceData(x, m_x, iend); \ allocAndInitCudaDeviceData(list, m_list, iend); diff --git a/src/basic/INDEXLIST_3LOOP-Hip.cpp b/src/basic/INDEXLIST_3LOOP-Hip.cpp index 2a411709e..42e3211fd 100644 --- a/src/basic/INDEXLIST_3LOOP-Hip.cpp +++ b/src/basic/INDEXLIST_3LOOP-Hip.cpp @@ -23,7 +23,7 @@ namespace basic #define INDEXLIST_3LOOP_DATA_SETUP_HIP \ Index_type* counts; \ - allocHipDeviceData(counts, getActualProblemSize()+1); \ + allocHipDeviceData(counts, iend+1); \ allocAndInitHipDeviceData(x, m_x, iend); \ allocAndInitHipDeviceData(list, m_list, iend); diff --git a/src/basic/INDEXLIST_3LOOP-OMP.cpp 
b/src/basic/INDEXLIST_3LOOP-OMP.cpp index 6a5ee22a8..b8f223b0d 100644 --- a/src/basic/INDEXLIST_3LOOP-OMP.cpp +++ b/src/basic/INDEXLIST_3LOOP-OMP.cpp @@ -18,7 +18,7 @@ namespace basic { #define INDEXLIST_3LOOP_DATA_SETUP_OMP \ - Index_type* counts = new Index_type[getActualProblemSize()+1]; + Index_type* counts = new Index_type[iend+1]; #define INDEXLIST_3LOOP_DATA_TEARDOWN_OMP \ delete[] counts; counts = nullptr; diff --git a/src/basic/INDEXLIST_3LOOP-Seq.cpp b/src/basic/INDEXLIST_3LOOP-Seq.cpp index d352f922b..f6505bf89 100644 --- a/src/basic/INDEXLIST_3LOOP-Seq.cpp +++ b/src/basic/INDEXLIST_3LOOP-Seq.cpp @@ -18,7 +18,7 @@ namespace basic { #define INDEXLIST_3LOOP_DATA_SETUP_Seq \ - Index_type* counts = new Index_type[getActualProblemSize()+1]; + Index_type* counts = new Index_type[iend+1]; #define INDEXLIST_3LOOP_DATA_TEARDOWN_Seq \ delete[] counts; counts = nullptr; From 73d3d1668441b71a2c8aa2e513d3e806aab8f871 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 31 Mar 2022 18:29:05 -0700 Subject: [PATCH 295/392] Add openmp scan target impl This relies on the openmp scan support so it is only enabled with RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN --- src/CMakeLists.txt | 3 + src/algorithm/SCAN-OMPTarget.cpp | 80 ++++++++++++++++++ src/algorithm/SCAN.cpp | 4 + src/basic/CMakeLists.txt | 2 + src/basic/INDEXLIST-OMPTarget.cpp | 98 +++++++++++++++++++++ src/basic/INDEXLIST.cpp | 4 + src/basic/INDEXLIST_3LOOP-OMPTarget.cpp | 108 ++++++++++++++++++++++++ src/basic/INDEXLIST_3LOOP.cpp | 4 + 8 files changed, 303 insertions(+) create mode 100644 src/algorithm/SCAN-OMPTarget.cpp create mode 100644 src/basic/INDEXLIST-OMPTarget.cpp create mode 100644 src/basic/INDEXLIST_3LOOP-OMPTarget.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 380ef93f4..ff681a136 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -81,8 +81,10 @@ blt_add_executable( basic/IF_QUAD-OMPTarget.cpp basic/INDEXLIST.cpp basic/INDEXLIST-Seq.cpp + basic/INDEXLIST-OMPTarget.cpp basic/INDEXLIST_3LOOP.cpp basic/INDEXLIST_3LOOP-Seq.cpp + basic/INDEXLIST_3LOOP-OMPTarget.cpp basic/INIT3.cpp basic/INIT3-Seq.cpp basic/INIT3-OMPTarget.cpp @@ -209,6 +211,7 @@ blt_add_executable( common/RunParams.cpp algorithm/SCAN.cpp algorithm/SCAN-Seq.cpp + algorithm/SCAN-OMPTarget.cpp algorithm/SORT.cpp algorithm/SORT-Seq.cpp algorithm/SORTPAIRS.cpp diff --git a/src/algorithm/SCAN-OMPTarget.cpp b/src/algorithm/SCAN-OMPTarget.cpp new file mode 100644 index 000000000..0f453cac0 --- /dev/null +++ b/src/algorithm/SCAN-OMPTarget.cpp @@ -0,0 +1,80 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "SCAN.hpp" + +#include "RAJA/RAJA.hpp" + +#include +#include + +namespace rajaperf +{ +namespace algorithm +{ + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + +#define SCAN_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ + \ + allocAndInitOpenMPDeviceData(x, m_x, iend, did, hid); \ + allocAndInitOpenMPDeviceData(y, m_y, iend, did, hid); + +#define SCAN_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_y, y, iend, hid, did); \ + deallocOpenMPDeviceData(x, did); \ + deallocOpenMPDeviceData(y, did); + + +void SCAN::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) \ + && _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + SCAN_DATA_SETUP; + + switch ( vid ) { + + case Base_OpenMPTarget : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + SCAN_PROLOGUE; + + #pragma omp target is_device_ptr(x,y) device( did ) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) \ + reduction(inscan, +:scan_var) + for (Index_type i = ibegin; i < iend; ++i ) { + y[i] = scan_var; + #pragma omp scan exclusive(scan_var) + scan_var += x[i]; + } + + } + stopTimer(); + + break; + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace algorithm +} // end namespace rajaperf diff --git a/src/algorithm/SCAN.cpp b/src/algorithm/SCAN.cpp index 47fe9d76c..cda919ec2 100644 --- a/src/algorithm/SCAN.cpp +++ b/src/algorithm/SCAN.cpp @@ -47,6 +47,10 @@ SCAN::SCAN(const RunParams& params) setVariantDefined( Lambda_OpenMP ); setVariantDefined( RAJA_OpenMP ); +#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) + setVariantDefined( Base_OpenMPTarget ); +#endif + setVariantDefined( Base_CUDA ); setVariantDefined( RAJA_CUDA ); diff --git a/src/basic/CMakeLists.txt b/src/basic/CMakeLists.txt index df08bcbe7..1c7583b5d 100644 --- a/src/basic/CMakeLists.txt +++ b/src/basic/CMakeLists.txt @@ -31,11 +31,13 @@ blt_add_library( INDEXLIST-Hip.cpp INDEXLIST-Cuda.cpp INDEXLIST-OMP.cpp + INDEXLIST-OMPTarget.cpp INDEXLIST_3LOOP.cpp INDEXLIST_3LOOP-Seq.cpp INDEXLIST_3LOOP-Hip.cpp INDEXLIST_3LOOP-Cuda.cpp INDEXLIST_3LOOP-OMP.cpp + INDEXLIST_3LOOP-OMPTarget.cpp INIT3.cpp INIT3-Seq.cpp INIT3-Hip.cpp diff --git a/src/basic/INDEXLIST-OMPTarget.cpp b/src/basic/INDEXLIST-OMPTarget.cpp new file mode 100644 index 000000000..b15c46602 --- /dev/null +++ b/src/basic/INDEXLIST-OMPTarget.cpp @@ -0,0 +1,98 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INDEXLIST.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) \ + && _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + +#define INDEXLIST_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ + \ + allocAndInitOpenMPDeviceData(x, m_x, iend, did, hid); \ + allocAndInitOpenMPDeviceData(list, m_list, iend, did, hid); + +#define INDEXLIST_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_list, list, iend, hid, did); \ + deallocOpenMPDeviceData(x, did); \ + deallocOpenMPDeviceData(list, did); +#endif + + +void INDEXLIST::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) \ + && _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + INDEXLIST_DATA_SETUP; + + switch ( vid ) { + + case Base_OpenMPTarget : { + + INDEXLIST_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Index_type count = 0; + #pragma omp target is_device_ptr(x, list) device( did ) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) \ + reduction(inscan, +:count) + for (Index_type i = ibegin; i < iend; ++i ) { + Index_type inc = 0; + if (INDEXLIST_CONDITIONAL) { + list[count] = i ; + inc = 1; + } + #pragma omp scan exclusive(count) + count += inc; + } + + m_len = count; + + } + stopTimer(); + + INDEXLIST_DATA_TEARDOWN_OMP_TARGET; + + break; + } + + default : { + ignore_unused(run_reps, ibegin, iend, x, list); + std::cout << "\n INDEXLIST : Unknown variant id = " << vid << std::endl; + } + + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/INDEXLIST.cpp b/src/basic/INDEXLIST.cpp index f0c4c7d32..c5367487b 100644 --- a/src/basic/INDEXLIST.cpp +++ b/src/basic/INDEXLIST.cpp @@ -42,6 +42,10 @@ INDEXLIST::INDEXLIST(const RunParams& params) setVariantDefined( Base_OpenMP ); setVariantDefined( Lambda_OpenMP ); +#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) + setVariantDefined( Base_OpenMPTarget ); +#endif + setVariantDefined( Base_CUDA ); setVariantDefined( Base_HIP ); diff --git a/src/basic/INDEXLIST_3LOOP-OMPTarget.cpp b/src/basic/INDEXLIST_3LOOP-OMPTarget.cpp new file mode 100644 index 000000000..7c6ff734a --- /dev/null +++ b/src/basic/INDEXLIST_3LOOP-OMPTarget.cpp @@ -0,0 +1,108 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INDEXLIST_3LOOP.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + +#define INDEXLIST_3LOOP_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ + \ + Index_type* counts = nullptr; \ + allocOpenMPDeviceData(counts, iend+1, did); \ + allocAndInitOpenMPDeviceData(x, m_x, iend, did, hid); \ + allocAndInitOpenMPDeviceData(list, m_list, iend, did, hid); + +#define INDEXLIST_3LOOP_DATA_TEARDOWN_OMP_TARGET \ + deallocOpenMPDeviceData(counts, did); \ + getOpenMPDeviceData(m_list, list, iend, hid, did); \ + deallocOpenMPDeviceData(x, did); \ + deallocOpenMPDeviceData(list, did); + + +void INDEXLIST_3LOOP::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) \ + && _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + INDEXLIST_3LOOP_DATA_SETUP; + + switch ( vid ) { + + case Base_OpenMPTarget : { + + INDEXLIST_3LOOP_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for + + #pragma omp target is_device_ptr(counts, x) device( did ) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + counts[i] = (INDEXLIST_3LOOP_CONDITIONAL) ? 1 : 0; + } + + Index_type count = 0; + #pragma omp target is_device_ptr(counts) device( did ) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) \ + reduction(inscan, +:count) + for (Index_type i = ibegin; i < iend+1; ++i ) { + Index_type inc = counts[i]; + counts[i] = count; + #pragma omp scan exclusive(count) + count += inc; + } + + #pragma omp target is_device_ptr(counts, list) device( did ) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + INDEXLIST_3LOOP_MAKE_LIST; + } + + m_len = counts[iend]; + + } + stopTimer(); + + INDEXLIST_3LOOP_DATA_TEARDOWN_OMP_TARGET; + + break; + } + + default : { + std::cout << "\n INDEXLIST_3LOOP : Unknown variant id = " << vid << std::endl; + } + + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/INDEXLIST_3LOOP.cpp b/src/basic/INDEXLIST_3LOOP.cpp index 78ab796c1..213c936aa 100644 --- a/src/basic/INDEXLIST_3LOOP.cpp +++ b/src/basic/INDEXLIST_3LOOP.cpp @@ -49,6 +49,10 @@ INDEXLIST_3LOOP::INDEXLIST_3LOOP(const RunParams& params) setVariantDefined( Lambda_OpenMP ); setVariantDefined( RAJA_OpenMP ); +#if _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) + setVariantDefined( Base_OpenMPTarget ); +#endif + setVariantDefined( Base_CUDA ); setVariantDefined( RAJA_CUDA ); From 751ed76745f428bdd9a69885285b604a95818f45 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 31 Mar 2022 18:47:27 -0700 Subject: [PATCH 296/392] Fix warnings in INDEXLIST Cuda/Hip --- src/basic/INDEXLIST-Cuda.cpp | 10 +++++----- src/basic/INDEXLIST-Hip.cpp | 12 ++++++------ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git 
a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp index fc7f3d7b8..f0eb60e96 100644 --- a/src/basic/INDEXLIST-Cuda.cpp +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -82,7 +82,7 @@ __device__ void grid_scan(const int block_id, BlockScan(s_temp_storage.block_scan_storage).ExclusiveSum(val, exclusive); __syncthreads(); - for (int ti = 0; ti < items_per_thread; ++ti) { + for (size_t ti = 0; ti < items_per_thread; ++ti) { inclusive[ti] = exclusive[ti] + val[ti]; } @@ -195,13 +195,13 @@ __device__ void grid_scan(const int block_id, __syncthreads(); Index_type prev_grid_count = s_temp_storage.prev_grid_count; - for (int ti = 0; ti < items_per_thread; ++ti) { + for (size_t ti = 0; ti < items_per_thread; ++ti) { exclusive[ti] = prev_grid_count + exclusive[ti]; inclusive[ti] = prev_grid_count + inclusive[ti]; } if (last_block) { - for (int i = threadIdx.x; i < gridDim.x-1; i += block_size) { + for (unsigned i = threadIdx.x; i < gridDim.x-1; i += block_size) { while (atomicCAS(&block_readys[i], 2u, 0u) != 2u); } } @@ -225,7 +225,7 @@ __global__ void indexlist(Real_ptr x, Index_type vals[items_per_thread]; - for (Index_type ti = 0; ti < items_per_thread; ++ti) { + for (size_t ti = 0; ti < items_per_thread; ++ti) { Index_type i = block_id * block_size * items_per_thread + ti * block_size + threadIdx.x; Index_type val = 0; if (i < iend) { @@ -241,7 +241,7 @@ __global__ void indexlist(Real_ptr x, grid_scan( block_id, vals, exclusives, inclusives, block_counts, grid_counts, block_readys); - for (Index_type ti = 0; ti < items_per_thread; ++ti) { + for (size_t ti = 0; ti < items_per_thread; ++ti) { Index_type i = block_id * block_size * items_per_thread + ti * block_size + threadIdx.x; Index_type exclusive = exclusives[ti]; Index_type inclusive = inclusives[ti]; diff --git a/src/basic/INDEXLIST-Hip.cpp b/src/basic/INDEXLIST-Hip.cpp index 6c1b3a7b2..6875dc5c1 100644 --- a/src/basic/INDEXLIST-Hip.cpp +++ b/src/basic/INDEXLIST-Hip.cpp @@ -55,7 +55,7 @@ __device__ void grid_scan(const int block_id, unsigned* block_readys) { const bool first_block = (block_id == 0); - const bool last_block = (block_id == gridDim.x-1); + const bool last_block = (block_id == static_cast(gridDim.x-1)); const bool last_thread = (threadIdx.x == block_size-1); const bool last_warp = (threadIdx.x >= block_size - warp_size); const int warp_index = (threadIdx.x % warp_size); @@ -82,7 +82,7 @@ __device__ void grid_scan(const int block_id, BlockScan().exclusive_scan(val, exclusive, Index_type{0}, s_temp_storage.block_scan_storage); __syncthreads(); - for (int ti = 0; ti < items_per_thread; ++ti) { + for (size_t ti = 0; ti < items_per_thread; ++ti) { inclusive[ti] = exclusive[ti] + val[ti]; } @@ -195,13 +195,13 @@ __device__ void grid_scan(const int block_id, __syncthreads(); Index_type prev_grid_count = s_temp_storage.prev_grid_count; - for (int ti = 0; ti < items_per_thread; ++ti) { + for (size_t ti = 0; ti < items_per_thread; ++ti) { exclusive[ti] = prev_grid_count + exclusive[ti]; inclusive[ti] = prev_grid_count + inclusive[ti]; } if (last_block) { - for (int i = threadIdx.x; i < gridDim.x-1; i += block_size) { + for (unsigned i = threadIdx.x; i < gridDim.x-1; i += block_size) { while (atomicCAS(&block_readys[i], 2u, 0u) != 2u); } } @@ -225,7 +225,7 @@ __global__ void indexlist(Real_ptr x, Index_type vals[items_per_thread]; - for (Index_type ti = 0; ti < items_per_thread; ++ti) { + for (size_t ti = 0; ti < items_per_thread; ++ti) { Index_type i = block_id * block_size * items_per_thread + ti * block_size + threadIdx.x; Index_type 
val = 0; if (i < iend) { @@ -241,7 +241,7 @@ __global__ void indexlist(Real_ptr x, grid_scan( block_id, vals, exclusives, inclusives, block_counts, grid_counts, block_readys); - for (Index_type ti = 0; ti < items_per_thread; ++ti) { + for (size_t ti = 0; ti < items_per_thread; ++ti) { Index_type i = block_id * block_size * items_per_thread + ti * block_size + threadIdx.x; Index_type exclusive = exclusives[ti]; Index_type inclusive = inclusives[ti]; From 851dce431a008c275499856edff3a34997d412f0 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 1 Apr 2022 10:12:49 -0700 Subject: [PATCH 297/392] Add tunings to readme --- README.md | 414 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 229 insertions(+), 185 deletions(-) diff --git a/README.md b/README.md index 161e4c77b..a85ad0f5f 100644 --- a/README.md +++ b/README.md @@ -12,28 +12,32 @@ RAJA Performance Suite [![Build Status](https://travis-ci.org/LLNL/RAJAPerf.svg?branch=develop)](https://travis-ci.org/LLNL/RAJAPerf) -The RAJA Performance Suite is designed to explore performance of loop-based +The RAJA Performance Suite is designed to explore performance of loop-based computational kernels found in HPC applications. Specifically, it can be -used to assess and monitor runtime performance of kernels implemented using -[RAJA] C++ performance portability abstractions and compare those to variants -implemented using common parallel programming models, such as OpenMP and CUDA, +used to assess and monitor runtime performance of kernels implemented using +[RAJA] C++ performance portability abstractions and compare those to variants +implemented using common parallel programming models, such as OpenMP and CUDA, directly. Some important terminology used in the Suite includes: * `Kernel` is a distinct loop-based computation that appears in the Suite in - multiple variants (or implementations), each of which performs the same + multiple variants (or implementations), each of which performs the same computation. - * `Variant` is a particular implementation of a kernel in the Suite, + * `Variant` is an implementation or set of implementations of a kernel in the + Suite that share the same approach/abstraction and programming model, such as baseline OpenMP, RAJA OpenMP, etc. + * `Tuning` is a particular implementation of a variant of a kernel in the + Suite, such as gpu block size 128, gpu block size 256, etc. * `Group` is a collection of kernels in the Suite that are grouped together - because they originate from the same source, such as a specific benchmark + because they originate from the same source, such as a specific benchmark suite. Each kernel in the Suite appears in multiple RAJA and non-RAJA (i.e., baseline) -variants using parallel programming models that RAJA supports. The kernels -originate from various HPC benchmark suites and applications. For example, -the "Stream" group contains kernels from the Babel Stream benchmark, the "Apps" -group contains kernels extracted from real scientific computing applications, -and so forth. +variants using parallel programming models that RAJA supports. Some kernels have +multiple tunings of a variant to explore some of the parametrization that the +programming model supports. The kernels originate from various HPC benchmark +suites and applications. For example, the "Stream" group contains kernels from +the Babel Stream benchmark, the "Apps" group contains kernels extracted from +real scientific computing applications, and so forth. 
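To make the variant/tuning distinction above concrete, here is a small illustrative sketch; it is not code from the Suite, and all names in it are hypothetical. It only mirrors the shape of the `(VariantID vid, size_t tune_idx)` interface these patches introduce: a variant may expose several tunings (for example, one per GPU thread-block size), the order of the tuning names defines the tuning indices, and the tuning index passed to a run method selects which tuning executes, with each tuning timed and checksummed separately.

```cpp
// Illustrative sketch only -- hypothetical names, not the Suite's machinery.
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

struct ExampleKernel {
  // One name per tuning; the order of the names defines the tuning indices.
  std::vector<std::string> gpuTuningNames() const {
    return {"block_128", "block_256", "block_512"};
  }

  // A variant run method receives the tuning index and dispatches on it.
  void runGpuVariant(std::size_t tune_idx) const {
    static const std::size_t block_sizes[] = {128, 256, 512};
    std::cout << "tuning " << gpuTuningNames()[tune_idx]
              << ": launch with " << block_sizes[tune_idx]
              << " threads per block\n";
    // A real implementation would launch the kernel with that block size here.
  }
};

int main() {
  ExampleKernel k;
  // Each tuning of the variant is run (and, in the Suite, timed and
  // checksummed) separately.
  for (std::size_t t = 0; t < k.gpuTuningNames().size(); ++t) {
    k.runGpuVariant(t);
  }
  return 0;
}
```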
The suite can be run as a single process or with multiple processes when configured with MPI support. Running with MPI in the same configuration used @@ -61,7 +65,7 @@ Table of Contents # Building the Suite -To build the Suite, you must first obtain a copy of the source code by cloning +To build the Suite, you must first obtain a copy of the source code by cloning the GitHub repository. For example, ``` @@ -70,13 +74,13 @@ the GitHub repository. For example, > git clone --recursive https://github.com/llnl/RAJAPerf.git ``` -The repository will reside in a `RAJAPerf` sub-directory in the directory into +The repository will reside in a `RAJAPerf` sub-directory in the directory into which is was cloned. -The Performance Suite has two Git submodules, [RAJA] and the CMake-based [BLT] +The Performance Suite has two Git submodules, [RAJA] and the CMake-based [BLT] build system. The `--recursive` option tells Git to clone the submodules as well as any submodules that they use. If you switch to a different branch -in your working copy of the repository, you should update the submodules to +in your working copy of the repository, you should update the submodules to make sure you have the right versions of them for the branch. For example, ``` @@ -85,18 +89,18 @@ make sure you have the right versions of them for the branch. For example, > git submodule update --recursive ``` -Note that the `--recursive` option will update submodules within submodules, +Note that the `--recursive` option will update submodules within submodules, similar to usage with the `git clone` as described above. RAJA and the Performance Suite are built together using the same CMake configuration. For convenience, we include scripts in the `scripts` -directory that invoke corresponding configuration files (CMake cache files) -in the RAJA submodule. For example, the `scripts/lc-builds` directory +directory that invoke corresponding configuration files (CMake cache files) +in the RAJA submodule. For example, the `scripts/lc-builds` directory contains scripts that show how we build code for testing on platforms in -the Lawrence Livermore Computing Center. Each build script creates a -descriptively-named build space directory in the top-level Performance Suite -directory and runs CMake with a configuration appropriate for the platform and -compilers used. After CMake completes, enter the build directory and type +the Lawrence Livermore Computing Center. Each build script creates a +descriptively-named build space directory in the top-level Performance Suite +directory and runs CMake with a configuration appropriate for the platform and +compilers used. After CMake completes, enter the build directory and type `make` (or `make -j ` for a parallel build using N processor cores; if you omit the number of cores, the code will build in parallel using all available cores on the node you are running on) to compile the code. For example, @@ -107,7 +111,7 @@ cores on the node you are running on) to compile the code. For example, > make -j ``` -The build scripts and associated CMake `host-config` files in RAJA are +The build scripts and associated CMake `host-config` files in RAJA are useful sources of information for building the Suite on various platforms. For example, they show how to enable specific back-end kernel variants and compiler options we use for testing. @@ -124,9 +128,9 @@ options from there. 
For example, : The provided configurations will only build the Performance Suite code by default; i.e., it will not build any RAJA test or example codes. If you -want to build the RAJA tests, for example, to verify your build of RAJA is +want to build the RAJA tests, for example, to verify your build of RAJA is working properly, just pass the `-DENABLE_TESTS=On` option to CMake, either -on the command line if you run CMake directly or edit the script you are +on the command line if you run CMake directly or edit the script you are running to do this. Then, when the build completes, you can type `make test` to run the RAJA tests. @@ -153,35 +157,59 @@ options. For example, > make -j ``` +## Building with specific GPU block size tunings + +Some of the provided configurations will build the Performance Suite with +GPU support enabled. This will build with the default GPU block size tuning for +each kernel. For example, + +``` +> ./scripts/blueos_nvcc_clang.sh 10.2.89 sm_70 10.0.1 +> cd build_lc_blueos-nvcc10.2.89-sm_70-clang10.0.1 +> make -j +``` + +Using a specific set of GPU block sizes is done by by passing the +`-DRAJA_PERFSUITE_GPU_BLOCKSIZES=` option to CMake via the +`-DMPI_CXX_COMPILER=/path/to/mpic++` option to CMake in addition to other CMake +options. For example, + +``` +> mkdir my-gpu-build +> cd my-gpu-build +> cmake -DRAJA_PERFSUITE_GPU_BLOCKSIZES=64,128,256,512,1024 ../ +> make -j +``` + * * * # Running the Suite -The Suite is run by invoking the executable in the `bin` sub-directory in the +The Suite is run by invoking the executable in the `bin` sub-directory in the build space directory. For example, if you provide no command line options, ``` > ./bin/raja-perf.exe ``` -the entire Suite (all kernels and variants) will execute in their default -configurations. How the Suite will run and some details about each kernel +the entire Suite (all kernels and variants) will execute in their default +configurations. How the Suite will run and some details about each kernel will appear on the screen before it is run. Kernel detail information will also appear in a run report file generated in your run directory -after the Suite executes. You can pass the ''--dryrun'' option along with +after the Suite executes. You can pass the ''--dryrun'' option along with any other runtime options to see a summary of how the Suite will execute without actually running it. The Suite can be run in a variety of ways via options passed to the executable. -For example, you can run subsets of kernels and variants by specifying -variants, groups, or individual kernels explicitly. Other configuration -options to set problem sizes, number of times each kernel is run, etc. can -also be specified. You build the code once and use scripts or other mechanisms +For example, you can run subsets of kernels and variants by specifying +variants, groups, or individual kernels explicitly. Other configuration +options to set problem sizes, number of times each kernel is run, etc. can +also be specified. You build the code once and use scripts or other mechanisms to run the Suite in different ways for analyses you want to perform. All options appear in a 'long form' with a double hyphen prefix (i.e., '--'). -Some options are available in a one or two character 'short form' with a -single hyphen prefix (i.e., '-') for convenience. To see available options +Some options are available in a one or two character 'short form' with a +single hyphen prefix (i.e., '-') for convenience. 
To see available options along with a brief description of each, pass the `--help` or `-h` option: ``` @@ -194,9 +222,9 @@ or > ./bin/raja-perf.exe -h ``` -Lastly, the program will generate a summary of provided input if it is given +Lastly, the program will generate a summary of provided input if it is given input that the code does not know how to parse. Ill-formed input will be noted -in the summary output. Hopefully, this will make it easy for users to correct +in the summary output. Hopefully, this will make it easy for users to correct erroneous usage, such as mis-spelled option names. ## Running with MPI @@ -207,7 +235,7 @@ For example, ``` > srun -n 2 ./bin/raja-perf.exe ``` -the entire Suite (all kernels and variants) will execute in their default +the entire Suite (all kernels and variants) will execute in their default configurations on each of the 2 ranks. The kernel information output shows how each kernel is run on each rank. The total problem size across all MPI ranks can be calculated by multiplying the number of MPI ranks by the problem @@ -217,24 +245,24 @@ doing an MPI barrier, and then stopping the timer. ## Important note - * The OpenMP target offload variants of the kernels in the Suite are a + * The OpenMP target offload variants of the kernels in the Suite are a work-in-progress since the RAJA OpenMP target offload back-end is also - a work-in-progress. If you configure them to build, they can be run with - the executable `./bin/raja-perf-omptarget.exe` which is distinct from + a work-in-progress. If you configure them to build, they can be run with + the executable `./bin/raja-perf-omptarget.exe` which is distinct from the one described above. At the time the OpenMP target offload variants were developed, it was not possible for them to co-exist in the same executable as the CUDA variants, for example. In the future, the build system may - be reworked so that the OpenMP target variants can be run from the same + be reworked so that the OpenMP target variants can be run from the same executable as the other variants. * * * # Generated output -When the Suite is run, several output files are generated that contain -data describing the run. The file names start with the file prefix +When the Suite is run, several output files are generated that contain +data describing the run. The file names start with the file prefix provided via a command line option in the output directory, also specified -on the command line. If no such options are provided, files will be located +on the command line. If no such options are provided, files will be located in the current run directory and be named `RAJAPerf-*`, where '*' is a string indicating the contents of the file. @@ -247,14 +275,14 @@ Currently, there are five files generated: 5. Kernel -- Basic information about each kernel that is run, which is the same for each variant of the kernel that is run. See description of output information below. -All output files are text files. Other than the checksum file, all are in +All output files are text files. Other than the checksum file, all are in 'csv' format for easy processing by common tools and generating plots. ## Kernel information definitions Information about kernels that are run is located in the ''RAJAPerf-kernels.csv'' file. This information is for each process individually, so when running with MPI the total problem size aggregated across all ranks is the number of ranks times the problem size shown in the kernel information. 
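For example, if each rank reports a problem size of 1,000,000 and the Suite is run with 4 MPI ranks, the total problem size across all ranks is 4,000,000.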
Kernel information includes the following: -1. Kernel name -- Format is group name followed by kernel name, separated by an underscore. +1. Kernel name -- Format is group name followed by kernel name, separated by an underscore. 2. Feature -- RAJA feature(s) exercised in RAJA variants of kernel. 3. Problem size -- Size of the problem represented by a kernel. Please see notes below for more information. 4. Reps -- Number of times a kernel runs in a single pass through the Suite. @@ -265,41 +293,41 @@ Information about kernels that are run is located in the ''RAJAPerf-kernels.csv' ### Notes about 'problem size' - * Problem size is always ouput per process/MPI rank. To get the total problem + * Problem size is always output per process/MPI rank. To get the total problem size across all ranks when running with MPI multiply the problem size by the number of MPI ranks. - * The Suite uses three notions of problem size for each kernel: 'default', - 'target', and 'actual'. Default is the 'default' problem size defined for a - kernel and the size that will be run if no runtime options are + * The Suite uses three notions of problem size for each kernel: 'default', + 'target', and 'actual'. Default is the 'default' problem size defined for a + kernel and the size that will be run if no runtime options are provided to run a different size. Target is the desired problem size to run based on default settings and alterations to that if input is provided to - change the default. Actual is the problem size that is run based on how + change the default. Actual is the problem size that is run based on how each kernel calculates this. * The concept of problem size is subjective and can be interpreted differently - depending on the kernel structure and what one is trying to measure. For - example, problem size could refer to the amount of data needed to be stored - in memory to run the problem, or it could refer to the amount of parallel - work that is possible, etc. - * We employ the following, admittedly loose definition, which depends on the - particular kernel structure. Of all the 'loop structures' (e.g., single - loop, nested loops, etc.) that are run for a kernel (note that some kernels - run multiple loops, possibly with different sizes or loop structures), - problem size refers to the size of the data set required to generate the - kernel result. The interpretation of this and the definition of problem - size for each kernel in the suite is determined by the kernel developer + depending on the kernel structure and what one is trying to measure. For + example, problem size could refer to the amount of data needed to be stored + in memory to run the problem, or it could refer to the amount of parallel + work that is possible, etc. + * We employ the following, admittedly loose definition, which depends on the + particular kernel structure. Of all the 'loop structures' (e.g., single + loop, nested loops, etc.) that are run for a kernel (note that some kernels + run multiple loops, possibly with different sizes or loop structures), + problem size refers to the size of the data set required to generate the + kernel result. The interpretation of this and the definition of problem + size for each kernel in the suite is determined by the kernel developer and team discussion. -Here are a few examples to give a better sense of how we determine problem +Here are a few examples to give a better sense of how we determine problem size for various kernels in the Suite. Vector addition. 
```cpp for (int i = 0; i < 0; i < N; ++i) { - c[i] = a[i] + b[i]; + c[i] = a[i] + b[i]; } ``` -The problem size for this kernel is 'N', the loop length. Note that this -happens to match the size of the vectors a, b, c and the total amount of +The problem size for this kernel is 'N', the loop length. Note that this +happens to match the size of the vectors a, b, c and the total amount of parallel work in the kernel. This is common for simple, data parallel kernels. Matrix-vector multiplication. @@ -317,7 +345,7 @@ work is N_r, the number of rows in the matrix and the length of the vector b. Matrix-matrix multiplication. ```cpp -for (int i = 0; i < N_i; ++i) { +for (int i = 0; i < N_i; ++i) { for (int j = 0; j < N_j; ++j) { A[i][j] = 0; for (int k = 0; k < N_k; ++k) { @@ -328,19 +356,19 @@ for (int i = 0; i < N_i; ++i) { ``` Here, we are multiplying matrix B (N_i x N_k) and matrix C (N_k x N_j) and storing the result in matrix A (N_i X N_j). Problem size could be chosen to -be the maximum number of entries in matrix B or C. We choose the size of -matrix A (N_i * N_j), which is more closely aligned with the number of -independent operations (i.e., the amount of parallel work) in the kernels. +be the maximum number of entries in matrix B or C. We choose the size of +matrix A (N_i * N_j), which is more closely aligned with the number of +independent operations (i.e., the amount of parallel work) in the kernels. * * * -# Adding kernels and variants +# Adding kernels, variants, and Tunings -This section describes how to add new kernels and/or variants to the Suite. -*Group* and *feature* modifications are not required unless a new group or -exercised RAJA feature is added when a new kernel is introduced. The -information in this section also provides insight into how the performance +This section describes how to add new kernels, variants and/or tunings to the +Suite. *Group* and *feature* modifications are not required unless a new group +or exercised RAJA feature is added when a new kernel is introduced. The +information in this section also provides insight into how the performance Suite operates. It is essential that the appropriate targets are updated in the appropriate @@ -351,7 +379,7 @@ be compiled. Adding a new kernel to the Suite involves three main steps: -1. Add a unique kernel ID and a unique kernel name to the Suite. +1. Add a unique kernel ID and a unique kernel name to the Suite. 2. If the kernel is part of a new kernel group or exercises a new RAJA feature, also add a unique group ID and name for the group. Similarly, if a new RAJA feature is exercised by a new kernel. 3. Implement a kernel class that contains all operations needed to run it, with source files organized as described below. @@ -360,14 +388,14 @@ These steps are described in the following sections. ### Add the kernel ID and name -Two key pieces of information identify a kernel: the group in which it +Two key pieces of information identify a kernel: the group in which it resides and the name of the kernel itself. For concreteness, we describe -how to add a kernel "FOO" that lives in the kernel group "Basic". The files +how to add a kernel "FOO" that lives in the kernel group "Basic". The files `RAJAPerfSuite.hpp` and `RAJAPerfSuite.cpp` in the `src/common` directory -define enumeration values and arrays of string names for the kernels, -respectively. +define enumeration values and arrays of string names for the kernels, +respectively. 
-First, add an enumeration value identifier for the kernel, that is unique +First, add an enumeration value identifier for the kernel, that is unique among all kernels, in the enum 'KernelID' in the header file `RAJAPerfSuite.hpp`: ```cpp @@ -381,7 +409,7 @@ enum KernelID { Note: the enumeration value for the kernel is the group name followed by the kernel name, separated by an underscore. It is important to follow this convention so that the kernel works properly with the Performance -Suite machinery. +Suite machinery. Second, add the kernel name to the array of strings `KernelNames` in the file `RAJAPerfSuite.cpp`: @@ -406,7 +434,7 @@ and IDs in alphabetical order to make the organization clear. ### Add new group if needed If a kernel is added as part of a new group of kernels in the Suite, a -new value must be added to the `GroupID` enum in the header file +new value must be added to the `GroupID` enum in the header file `RAJAPerfSuite.hpp` and an associated group string name must be added to the `GroupNames` array of strings in the file `RAJAPerfSuite.cpp`. Again, the enumeration values and items in the string array must be kept @@ -417,32 +445,32 @@ Adding a new RAJA feature is similar. ### Add the kernel class -Each kernel in the Suite is implemented in a class whose header and +Each kernel in the Suite is implemented in a class whose header and implementation files live in the directory named for the group in which the kernel lives. The kernel class is responsible for implementing -all operations needed to manage data, execute, and record execution timing and -checksum information for each variant of the kernel. To properly plug in to -the Performance Suite framework, the kernel class must be a subclass of the -`KernelBase` base class that defines the interface for kernels in the Suite. +all operations needed to manage data, execute, and record execution timing and +checksum information for each variant and tuning of the kernel. To properly plug +in to the Performance Suite framework, the kernel class must be a subclass of +the `KernelBase` base class that defines the interface for kernels in the Suite. -Continuing with our example, we add a 'FOO' class header file `FOO.hpp`, -and multiple implementation files described in the following sections: +Continuing with our example, we add a 'FOO' class header file `FOO.hpp`, +and multiple implementation files described in the following sections: * `FOO.cpp` contains the methods to setup and teardown the memory for the - 'FOO' kernel, and compute and record a checksum on the result after it - executes. It also specifies kernel information in the kernel class + 'FOO' kernel, and compute and record a checksum on the result after it + executes. It also specifies kernel information in the kernel class constructor. - * `FOO-Seq.cpp` contains sequential CPU variants of the kernel. - * `FOO-OMP.cpp` contains OpenMP CPU multithreading variants of the kernel. - * `FOO-OMPTarget.cpp` contains OpenMP target offload variants of the kernel. - * `FOO-Cuda.cpp` contains CUDA GPU variants of the kernel. - * `FOO-Hip.cpp` contains HIP GPU variants of the kernel. + * `FOO-Seq.cpp` contains sequential CPU variants and tunings of the kernel. + * `FOO-OMP.cpp` contains OpenMP CPU multithreading variants and tunings of the kernel. + * `FOO-OMPTarget.cpp` contains OpenMP target offload variants and tunings of the kernel. + * `FOO-Cuda.cpp` contains CUDA GPU variants and tunings of the kernel. + * `FOO-Hip.cpp` contains HIP GPU variants and tunings of the kernel. 
All kernels in the Suite follow the same implementation pattern. Inspect the files for any kernel to understand the overall organization. - + Note: if a new execution back-end variant is added that is not listed here, -that variant should go in the file `FOO-.cpp`. Keeping the +that variant should go in the file `FOO-.cpp`. Keeping the back-end variants in separate files helps to understand compiler optimizations when looking at generated assembly code, for example. @@ -470,11 +498,11 @@ Here is what a header file for the FOO kernel object should look like: #include "common/KernelBase.hpp" -namespace rajaperf +namespace rajaperf { class RunParams; // Forward declaration for ctor arg. -namespace basic +namespace basic { class FOO : public KernelBase @@ -485,15 +513,15 @@ public: ~FOO(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tuning_idx); + void updateChecksum(VariantID vid, size_t tuning_idx); + void tearDown(VariantID vid, size_t tuning_idx); - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); - void runHipVariant(VariantID vid); - void runOpenMPTargetVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tuning_idx); + void runOpenMPVariant(VariantID vid, size_t tuning_idx); + void runCudaVariant(VariantID vid, size_t tuning_idx); + void runHipVariant(VariantID vid, size_t tuning_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tuning_idx); private: // Kernel-specific data (pointers, scalars, etc.) as needed... @@ -506,39 +534,39 @@ private: ``` The kernel object header has a uniquely-named header file include guard and -the class is nested within the `rajaperf` and `basic` namespaces. The +the class is nested within the `rajaperf` and `basic` namespaces. The constructor takes a reference to a `RunParams` object, which contains the -input parameters for running the Suite -- we'll say more about this later. -The methods that take a variant ID argument must be provided as they are -pure virtual in the KernelBase class. Their names are descriptive of what they -do and we'll provide more details about them when we describe the class -implementation next. +input parameters for running the Suite -- we'll say more about this later. +The methods that take a variant ID and tuning index arguments must be provided +as they are pure virtual in the KernelBase class. Their names are descriptive of +what they do and we'll provide more details about them when we describe the +class implementation next. #### Kernel class implementation -Each kernel in the Suite follows a similar implementation pattern for -consistency and ease of analysis and understanding. Here, we describe several -key steps and conventions that must be followed to ensure that all kernels +Each kernel in the Suite follows a similar implementation pattern for +consistency and ease of analysis and understanding. Here, we describe several +key steps and conventions that must be followed to ensure that all kernels interact with the performance Suite machinery in the same way: 1. Initialize the `KernelBase` class object with `KernelID` and `RunParams` object passed to the FOO class constructor. -2. In the class constructor, define kernel information. 
This includes: default problem size, default run repetition count, iterations per rep, kernels per rep, bytes per rep, FLOPs per rep, the RAJA features used by the kernel, and kernel variants defined (i.e., implemented) by calling the appropriate members in the `KernelBase`` class. See the *.cpp file for any existing kernel in the suite for examples of how this is done. +2. In the class constructor, define kernel information. This includes: default problem size, default run repetition count, iterations per rep, kernels per rep, bytes per rep, FLOPs per rep, the RAJA features used by the kernel, and kernel variants defined (i.e., implemented) by calling the appropriate members in the `KernelBase` class. See the *.cpp file for any existing kernel in the suite for examples of how this is done. Note that tuning names are added in step 6. 3. Implement data allocation and initialization operations for each kernel variant in the `setUp` method. 4. Compute the checksum for each variant in the `updateChecksum` method. 5. Deallocate and reset any data that will be allocated and/or initialized in subsequent kernel executions in the `tearDown` method. -6. Implement kernel execution for the associated variants in the `run*Variant` methods in the proper source files. +6. Implement kernel execution for the associated variants and tunings in the `run*Variant` methods in the proper source files. Add tuning names for the tunings of each variant by overriding the `KernelBase` methods `set*TuningDefinitions`. Note that this is not necessary if there is only one tuning. ##### Constructor and destructor It is important to note that there will only be one instance of each kernel -class created by the program. Thus, each kernel class constructor and -destructor must only perform operations that are not specific to any kernel +class created by the program. Thus, each kernel class constructor and +destructor must only perform operations that are not specific to any kernel variant. The constructor must pass the kernel ID and RunParams object to the base class `KernelBase` constructor. The body of the constructor must also call -base class methods to set kernel information described above. Note that -the arguments passed to each method are specific to each kernel, in general. +base class methods to set kernel information described above. Note that +the arguments passed to each method are specific to each kernel, in general. This code snippets shows a typical way this looks for a simple single for-loop data parallel kernel. @@ -551,11 +579,11 @@ FOO::FOO(const RunParams& params) // to generate an execution run time value setActualProblemSize( getTargetProblemSize() ); // actual problem size may - // be different than the + // be different than the // default size based on // user-provided run time // options - + setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); setBytesPerRep( ... ); // value set based on data read and written when @@ -580,32 +608,34 @@ owned by the class object as needed. Often, it is empty. ##### setUp() method -The `setUp()` method is responsible for allocating and initializing data -necessary to run the kernel for the variant specified by its variant ID +The `setUp()` method is responsible for allocating and initializing data +necessary to run the kernel for the variant specified by its variant ID argument. 
For example, a baseline variant may have aligned data allocation to help enable SIMD optimizations, an OpenMP variant may initialize arrays -following a pattern of "first touch" based on how memory and threads are -mapped to CPU cores, a CUDA variant may initialize data in host memory, +following a pattern of "first touch" based on how memory and threads are +mapped to CPU cores, a CUDA variant may initialize data in host memory, which will be copied to device memory when a CUDA variant executes, etc. It is important to use the same data allocation and initialization operations for all kernel variants so that checksums can be compared at the end of a run. -Note: to simplify these operations and help ensure consistency, there exist +Note: to simplify these operations and help ensure consistency, there exist utility methods to allocate, initialize, deallocate, and copy data, and compute checksums defined in the `DataUtils.hpp` `CudaDataUtils.hpp`, `OpenMPTargetDataUtils.hpp`, etc. header files in the `common` directory. ##### run methods -Which files contain which 'run' methods and associated variant implementations -is described above. Each method takes a variant ID argument which identifies -the variant to be run. Each method is also responsible for calling base class -methods to start and stop execution timers when a loop variant is run. A -typical kernel execution code section may look like: +Which files contain which 'run' methods and associated variant and tuning +implementations is described above. Each method takes a variant ID argument +which identifies the variant to be run and a tuning index which identifies +the tuning of the variant to run. Note that the tuning index can be ignored +when there is only one tuning. Each method is also responsible for calling base +class methods to start and stop execution timers when a loop variant is run. +A typical kernel execution code section may look like: ```cpp -void Foo::runSeqVariant(VariantID vid) +void Foo::runSeqVariant(VariantID vid, size_t /*tuning_idx*/) { const Index_type run_reps = getRunReps(); // ... @@ -622,7 +652,7 @@ void Foo::runSeqVariant(VariantID vid) } stopTimer(); - break; + break; } #if defined(RUN_RAJA_SEQ) @@ -631,7 +661,7 @@ void Foo::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - // Implementation of Lambda_Seq kernel variant... + // Implementation of Lambda_Seq kernel variant... } stopTimer(); @@ -666,61 +696,66 @@ pattern will ensure all new additions are consistent. Important notes: - * As mentioned earlier, there are multiple source files for each kernel. - The reason for this is that it makes it easier to apply unique compiler - flags to different variants and to manage compilation and linking issues - that arise when some kernel variants are combined in the same translation + * As mentioned earlier, there are multiple source files for each kernel. + The reason for this is that it makes it easier to apply unique compiler + flags to different variants and to manage compilation and linking issues + that arise when some kernel variants are combined in the same translation unit. - * For convenience, we make heavy use of macros to define data declarations + * For convenience, we make heavy use of macros to define data declarations and kernel bodies in the Suite. While seemingly cryptic, this significantly reduces the amount of redundant code required to implement multiple variants - for each kernel and make sure things are the same as much as possible. 
The - kernel class implementation files in the Suite provide many examples of + for each kernel and make sure things are the same as much as possible. The + kernel class implementation files in the Suite provide many examples of the basic pattern we use. + * We also use macros to define some methods used with GPU block size tunings. + While seemingly cryptic, this significantly reduces the amount of redundant + code required to implement calling and naming each of the multiple tunings + for each kernel and make sure things are the same as much as possible. + ##### updateChecksum() method The `updateChecksum()` method is responsible for adding the checksum -for the current kernel (based on the data the kernel computes) to the -checksum value for the variant of the kernel just executed, which is held -in the KernelBase base class object. +for the current kernel (based on the data the kernel computes) to the +checksum value for the variant and tuning of the kernel just executed, which is +held in the KernelBase base class object. It is important that the checksum be computed in the same way for -each variant of the kernel so that checksums for different variants can be -compared to help identify differences, and potential errors in +each variant of the kernel so that checksums for different variants can be +compared to help identify differences, and potential errors in implementations, compiler optimizations, programming model execution, etc. -Note: to simplify checksum computations and help ensure consistency, there +Note: to simplify checksum computations and help ensure consistency, there are methods to compute checksums, a weighted sum of array values for example, are defined in the `DataUtils.hpp` header file in the `common` directory. ##### tearDown() method The `tearDown()` method frees and/or resets all kernel data that is -allocated and/or initialized in the `setUp()` method execution to prepare for +allocated and/or initialized in the `setUp()` method execution to prepare for other kernel variants run subsequently. ### Add object construction operation -The `Executor` class in the `common` directory is responsible for creating -kernel objects for the kernels to be run based on the Suite input options. -To ensure a new kernel object will be created properly, add a call to its -class constructor based on its `KernelID` in the `getKernelObject()` +The `Executor` class in the `common` directory is responsible for creating +kernel objects for the kernels to be run based on the Suite input options. +To ensure a new kernel object will be created properly, add a call to its +class constructor based on its `KernelID` in the `getKernelObject()` method in the `RAJAPerfSuite.cpp` file. - + ## Adding a variant Each variant in the RAJA Performance Suite is identified by an enumeration value and a string name. Adding a new variant requires adding these two -items similarly to adding those for a kernel as described above. +items similarly to adding those for a kernel as described above. ### Add the variant ID and name -First, add an enumeration value identifier for the variant, that is unique -among all variants, in the enum 'VariantID' in the header file +First, add an enumeration value identifier for the variant, that is unique +among all variants, in the enum 'VariantID' in the header file `RAJAPerfSuite.hpp`: ```cpp @@ -751,14 +786,23 @@ and matching one-to-one). 
### Add kernel variant implementations -In the classes containing kernels to which the new variant applies, -add implementations for the variant in the setup, kernel execution, -checksum computation, and teardown methods as needed. Also, make sure to -define the variant for those kernels in the kernel class constructors by -calling `setVariantDefined(NewVariant)` so that the variant can be run. -These operations are described in earlier sections for adding a new kernel +In the classes containing kernels to which the new variant applies, +add implementations for the variant in the setup, kernel execution, +checksum computation, and teardown methods as needed. Also, make sure to +define the variant for those kernels in the kernel class constructors by +calling `setVariantDefined(NewVariant)` so that the variant can be run. +These operations are described in earlier sections for adding a new kernel above. +### Add kernel tuning implementations + +In the classes containing kernels to which the new tuning applies, +add implementations for the tuning in the kernel execution and tuning naming +methods as needed. Note that the tuning indices are determined by the order that +the tuning names are added in the `set*TuningDefinitions` method. Therefore +the `run*Variant` methods should have similar logic in order to run the correct +tuning based on the index. + * * * # Continuous Integration @@ -771,20 +815,20 @@ RAJAPerf Suite shares its Gitlab CI workflow with other projects. The documentat # Contributions -The RAJA Performance Suite is a work-in-progress, with new kernels and variants -added as new features and back-end support are developed in RAJA. We encourage -interested parties to contribute to it so that C++ compiler optimizations and -support for programming models like RAJA continue to improve. +The RAJA Performance Suite is a work-in-progress, with new kernels and variants +added as new features and back-end support are developed in RAJA. We encourage +interested parties to contribute to it so that C++ compiler optimizations and +support for programming models like RAJA continue to improve. The Suite developers follow the [GitFlow](http://nvie.com/posts/a-successful-git-branching-model/) development model. Folks wishing to contribute to the Suite, -should include their work in a feature branch created from the Performance -Suite `develop` branch. Then, create a pull request with the `develop` branch -as the destination when it is ready to be reviewed. The `develop` branch +should include their work in a feature branch created from the Performance +Suite `develop` branch. Then, create a pull request with the `develop` branch +as the destination when it is ready to be reviewed. The `develop` branch contains the latest work in RAJA Performance Suite. Periodically, we merge the develop branch into the `main` branch and tag a new release. -If you would like to contribute to the RAJA Performance Suite, or have -questions about doing so, please contact the maintainer of the Suite listed +If you would like to contribute to the RAJA Performance Suite, or have +questions about doing so, please contact the maintainer of the Suite listed below. 
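To make the tuning mechanism described in the "Add kernel tuning implementations" section above more concrete, the sketch below shows one way to keep the ordering contract between tuning definitions and tuning dispatch consistent for a GPU block-size tuning. This is a minimal, illustrative sketch only; the class `Foo`, the helpers `addVariantTuningName` and `runCudaVariantImpl`, and the use of plain `int` for the variant ID are simplifying assumptions, not the exact Suite API.

```cpp
#include <cstddef>
#include <initializer_list>
#include <string>

// Minimal stand-in for a Suite kernel class, used only to illustrate that
// the tuning index is defined by the order in which tuning names are added.
struct Foo
{
  // Register one tuning name per supported GPU block size. The order of
  // these calls determines the tuning indices: 0 for 128, 1 for 256, ...
  void setCudaTuningDefinitions(int vid)
  {
    for (size_t block_size : {128, 256, 512}) {
      addVariantTuningName(vid, "block_" + std::to_string(block_size));
    }
  }

  // Dispatch walks the block sizes in the same order, so the tune_idx passed
  // in selects exactly the tuning that was named above.
  void runCudaVariant(int vid, size_t tune_idx)
  {
    size_t t = 0;
    for (size_t block_size : {128, 256, 512}) {
      if (t == tune_idx) {
        runCudaVariantImpl(vid, block_size);  // hypothetical per-block-size launch
      }
      ++t;
    }
  }

  // Stubs standing in for functionality the real base class would provide.
  void addVariantTuningName(int /*vid*/, const std::string& /*name*/) {}
  void runCudaVariantImpl(int /*vid*/, size_t /*block_size*/) {}
};
```

Keeping the loop over block sizes identical in both methods is what guarantees that a tuning index reported by the Suite always maps back to the same block size.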
* * * @@ -795,7 +839,7 @@ The primary developer/maintainer of the RAJA Performance Suite: * Rich Hornung (hornung1@llnl.gov) -Please see the {RAJA Performance Suite Contributors Page](https://github.com/LLNL/RAJAPerf/graphs/contributors), to see the full list of contributors to the +Please see the {RAJA Performance Suite Contributors Page](https://github.com/LLNL/RAJAPerf/graphs/contributors), to see the full list of contributors to the project. * * * @@ -834,18 +878,18 @@ text in the license header: # External Packages -The RAJA Performance Suite has some external dependencies, which are included +The RAJA Performance Suite has some external dependencies, which are included as Git submodules. These packages are covered by various permissive licenses. A summary listing follows. See the license included with each package for full details. -PackageName: BLT -PackageHomePage: https://github.com/LLNL/blt/ +PackageName: BLT +PackageHomePage: https://github.com/LLNL/blt/ PackageLicenseDeclared: BSD-3-Clause -PackageName: RAJA -PackageHomePage: http://github.com/LLNL/RAJA/ -PackageLicenseDeclared: BSD-3-Clause +PackageName: RAJA +PackageHomePage: http://github.com/LLNL/RAJA/ +PackageLicenseDeclared: BSD-3-Clause * * * From 8fa97ac3650772c3151145efaf75d71bf1c0396b Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 1 Apr 2022 10:39:51 -0700 Subject: [PATCH 298/392] Use UNUSED_ARG instead of commenting out tune_idx --- src/algorithm/SORT-Cuda.cpp | 2 +- src/algorithm/SORT-Hip.cpp | 2 +- src/algorithm/SORT-OMP.cpp | 2 +- src/algorithm/SORT-Seq.cpp | 2 +- src/algorithm/SORT.cpp | 4 ++-- src/algorithm/SORT.hpp | 2 +- src/algorithm/SORTPAIRS-Cuda.cpp | 2 +- src/algorithm/SORTPAIRS-Hip.cpp | 2 +- src/algorithm/SORTPAIRS-OMP.cpp | 2 +- src/algorithm/SORTPAIRS-Seq.cpp | 2 +- src/algorithm/SORTPAIRS.cpp | 4 ++-- src/algorithm/SORTPAIRS.hpp | 2 +- src/apps/DEL_DOT_VEC_2D-OMP.cpp | 2 +- src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp | 2 +- src/apps/DEL_DOT_VEC_2D-Seq.cpp | 2 +- src/apps/DEL_DOT_VEC_2D.cpp | 4 ++-- src/apps/DIFFUSION3DPA-OMP.cpp | 2 +- src/apps/DIFFUSION3DPA-OMPTarget.cpp | 2 +- src/apps/DIFFUSION3DPA-Seq.cpp | 2 +- src/apps/DIFFUSION3DPA.cpp | 4 ++-- src/apps/ENERGY-OMP.cpp | 2 +- src/apps/ENERGY-OMPTarget.cpp | 2 +- src/apps/ENERGY-Seq.cpp | 2 +- src/apps/ENERGY.cpp | 4 ++-- src/apps/FIR-OMP.cpp | 2 +- src/apps/FIR-OMPTarget.cpp | 2 +- src/apps/FIR-Seq.cpp | 2 +- src/apps/FIR.cpp | 4 ++-- src/apps/HALOEXCHANGE-OMP.cpp | 2 +- src/apps/HALOEXCHANGE-OMPTarget.cpp | 2 +- src/apps/HALOEXCHANGE-Seq.cpp | 2 +- src/apps/HALOEXCHANGE.cpp | 4 ++-- src/apps/HALOEXCHANGE_FUSED-OMP.cpp | 2 +- src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp | 2 +- src/apps/HALOEXCHANGE_FUSED-Seq.cpp | 2 +- src/apps/HALOEXCHANGE_FUSED.cpp | 4 ++-- src/apps/LTIMES-OMP.cpp | 2 +- src/apps/LTIMES-OMPTarget.cpp | 2 +- src/apps/LTIMES-Seq.cpp | 2 +- src/apps/LTIMES.cpp | 4 ++-- src/apps/LTIMES_NOVIEW-OMP.cpp | 2 +- src/apps/LTIMES_NOVIEW-OMPTarget.cpp | 2 +- src/apps/LTIMES_NOVIEW-Seq.cpp | 2 +- src/apps/LTIMES_NOVIEW.cpp | 4 ++-- src/apps/MASS3DPA-Cuda.cpp | 2 +- src/apps/MASS3DPA-Hip.cpp | 2 +- src/apps/MASS3DPA-OMP.cpp | 2 +- src/apps/MASS3DPA-OMPTarget.cpp | 2 +- src/apps/MASS3DPA-Seq.cpp | 2 +- src/apps/MASS3DPA.cpp | 4 ++-- src/apps/NODAL_ACCUMULATION_3D-OMP.cpp | 2 +- src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp | 2 +- src/apps/NODAL_ACCUMULATION_3D-Seq.cpp | 2 +- src/apps/NODAL_ACCUMULATION_3D.cpp | 4 ++-- src/apps/PRESSURE-OMP.cpp | 2 +- src/apps/PRESSURE-OMPTarget.cpp | 2 +- src/apps/PRESSURE-Seq.cpp | 2 +- 
src/apps/PRESSURE.cpp | 4 ++-- src/apps/VOL3D-OMP.cpp | 2 +- src/apps/VOL3D-OMPTarget.cpp | 2 +- src/apps/VOL3D-Seq.cpp | 2 +- src/apps/VOL3D.cpp | 4 ++-- src/apps/WIP-COUPLE.cpp | 4 ++-- src/apps/WIP-COUPLE.hpp | 10 +++++----- src/basic/DAXPY-OMP.cpp | 2 +- src/basic/DAXPY-OMPTarget.cpp | 2 +- src/basic/DAXPY-Seq.cpp | 2 +- src/basic/DAXPY.cpp | 4 ++-- src/basic/DAXPY_ATOMIC-OMP.cpp | 2 +- src/basic/DAXPY_ATOMIC-OMPTarget.cpp | 2 +- src/basic/DAXPY_ATOMIC-Seq.cpp | 2 +- src/basic/DAXPY_ATOMIC.cpp | 4 ++-- src/basic/IF_QUAD-OMP.cpp | 2 +- src/basic/IF_QUAD-OMPTarget.cpp | 2 +- src/basic/IF_QUAD-Seq.cpp | 2 +- src/basic/IF_QUAD.cpp | 4 ++-- src/basic/INIT3-OMP.cpp | 2 +- src/basic/INIT3-OMPTarget.cpp | 2 +- src/basic/INIT3-Seq.cpp | 2 +- src/basic/INIT3.cpp | 4 ++-- src/basic/INIT_VIEW1D-OMP.cpp | 2 +- src/basic/INIT_VIEW1D-OMPTarget.cpp | 2 +- src/basic/INIT_VIEW1D-Seq.cpp | 2 +- src/basic/INIT_VIEW1D.cpp | 4 ++-- src/basic/INIT_VIEW1D_OFFSET-OMP.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET-Seq.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET.cpp | 4 ++-- src/basic/MAT_MAT_SHARED-OMP.cpp | 2 +- src/basic/MAT_MAT_SHARED-OMPTarget.cpp | 2 +- src/basic/MAT_MAT_SHARED-Seq.cpp | 2 +- src/basic/MAT_MAT_SHARED.cpp | 4 ++-- src/basic/MULADDSUB-OMP.cpp | 2 +- src/basic/MULADDSUB-OMPTarget.cpp | 2 +- src/basic/MULADDSUB-Seq.cpp | 2 +- src/basic/MULADDSUB.cpp | 4 ++-- src/basic/NESTED_INIT-OMP.cpp | 2 +- src/basic/NESTED_INIT-OMPTarget.cpp | 2 +- src/basic/NESTED_INIT-Seq.cpp | 2 +- src/basic/NESTED_INIT.cpp | 4 ++-- src/basic/PI_ATOMIC-OMP.cpp | 2 +- src/basic/PI_ATOMIC-OMPTarget.cpp | 2 +- src/basic/PI_ATOMIC-Seq.cpp | 2 +- src/basic/PI_ATOMIC.cpp | 4 ++-- src/basic/PI_REDUCE-OMP.cpp | 2 +- src/basic/PI_REDUCE-OMPTarget.cpp | 2 +- src/basic/PI_REDUCE-Seq.cpp | 2 +- src/basic/PI_REDUCE.cpp | 4 ++-- src/basic/REDUCE3_INT-OMP.cpp | 2 +- src/basic/REDUCE3_INT-OMPTarget.cpp | 2 +- src/basic/REDUCE3_INT-Seq.cpp | 2 +- src/basic/REDUCE3_INT.cpp | 4 ++-- src/basic/TRAP_INT-OMP.cpp | 2 +- src/basic/TRAP_INT-OMPTarget.cpp | 2 +- src/basic/TRAP_INT-Seq.cpp | 2 +- src/basic/TRAP_INT.cpp | 4 ++-- src/lcals/DIFF_PREDICT-OMP.cpp | 2 +- src/lcals/DIFF_PREDICT-OMPTarget.cpp | 2 +- src/lcals/DIFF_PREDICT-Seq.cpp | 2 +- src/lcals/DIFF_PREDICT.cpp | 4 ++-- src/lcals/EOS-OMP.cpp | 2 +- src/lcals/EOS-OMPTarget.cpp | 2 +- src/lcals/EOS-Seq.cpp | 2 +- src/lcals/EOS.cpp | 4 ++-- src/lcals/FIRST_DIFF-OMP.cpp | 2 +- src/lcals/FIRST_DIFF-OMPTarget.cpp | 2 +- src/lcals/FIRST_DIFF-Seq.cpp | 2 +- src/lcals/FIRST_DIFF.cpp | 4 ++-- src/lcals/FIRST_MIN-OMP.cpp | 2 +- src/lcals/FIRST_MIN-OMPTarget.cpp | 2 +- src/lcals/FIRST_MIN-Seq.cpp | 2 +- src/lcals/FIRST_MIN.cpp | 4 ++-- src/lcals/FIRST_SUM-OMP.cpp | 2 +- src/lcals/FIRST_SUM-OMPTarget.cpp | 2 +- src/lcals/FIRST_SUM-Seq.cpp | 2 +- src/lcals/FIRST_SUM.cpp | 4 ++-- src/lcals/GEN_LIN_RECUR-OMP.cpp | 2 +- src/lcals/GEN_LIN_RECUR-OMPTarget.cpp | 2 +- src/lcals/GEN_LIN_RECUR-Seq.cpp | 2 +- src/lcals/GEN_LIN_RECUR.cpp | 4 ++-- src/lcals/HYDRO_1D-OMP.cpp | 2 +- src/lcals/HYDRO_1D-OMPTarget.cpp | 2 +- src/lcals/HYDRO_1D-Seq.cpp | 2 +- src/lcals/HYDRO_1D.cpp | 4 ++-- src/lcals/HYDRO_2D-OMP.cpp | 2 +- src/lcals/HYDRO_2D-OMPTarget.cpp | 2 +- src/lcals/HYDRO_2D-Seq.cpp | 2 +- src/lcals/HYDRO_2D.cpp | 4 ++-- src/lcals/INT_PREDICT-OMP.cpp | 2 +- src/lcals/INT_PREDICT-OMPTarget.cpp | 2 +- src/lcals/INT_PREDICT-Seq.cpp | 2 +- src/lcals/INT_PREDICT.cpp | 4 ++-- src/lcals/PLANCKIAN-OMP.cpp | 2 +- src/lcals/PLANCKIAN-OMPTarget.cpp | 2 +- 
src/lcals/PLANCKIAN-Seq.cpp | 2 +- src/lcals/PLANCKIAN.cpp | 4 ++-- src/lcals/TRIDIAG_ELIM-OMP.cpp | 2 +- src/lcals/TRIDIAG_ELIM-OMPTarget.cpp | 2 +- src/lcals/TRIDIAG_ELIM-Seq.cpp | 2 +- src/lcals/TRIDIAG_ELIM.cpp | 4 ++-- src/polybench/POLYBENCH_2MM-OMP.cpp | 2 +- src/polybench/POLYBENCH_2MM-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_2MM-Seq.cpp | 2 +- src/polybench/POLYBENCH_2MM.cpp | 4 ++-- src/polybench/POLYBENCH_3MM-OMP.cpp | 2 +- src/polybench/POLYBENCH_3MM-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_3MM-Seq.cpp | 2 +- src/polybench/POLYBENCH_3MM.cpp | 4 ++-- src/polybench/POLYBENCH_ADI-OMP.cpp | 2 +- src/polybench/POLYBENCH_ADI-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_ADI-Seq.cpp | 2 +- src/polybench/POLYBENCH_ADI.cpp | 4 ++-- src/polybench/POLYBENCH_ATAX-OMP.cpp | 2 +- src/polybench/POLYBENCH_ATAX-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_ATAX-Seq.cpp | 2 +- src/polybench/POLYBENCH_ATAX.cpp | 4 ++-- src/polybench/POLYBENCH_FDTD_2D-OMP.cpp | 2 +- src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_FDTD_2D-Seq.cpp | 2 +- src/polybench/POLYBENCH_FDTD_2D.cpp | 4 ++-- src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp | 2 +- src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp | 2 +- src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp | 4 ++-- src/polybench/POLYBENCH_GEMM-OMP.cpp | 2 +- src/polybench/POLYBENCH_GEMM-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_GEMM-Seq.cpp | 2 +- src/polybench/POLYBENCH_GEMM.cpp | 4 ++-- src/polybench/POLYBENCH_GEMVER-OMP.cpp | 2 +- src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_GEMVER-Seq.cpp | 2 +- src/polybench/POLYBENCH_GEMVER.cpp | 4 ++-- src/polybench/POLYBENCH_GESUMMV-OMP.cpp | 2 +- src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_GESUMMV-Seq.cpp | 2 +- src/polybench/POLYBENCH_GESUMMV.cpp | 4 ++-- src/polybench/POLYBENCH_HEAT_3D-OMP.cpp | 2 +- src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_HEAT_3D-Seq.cpp | 2 +- src/polybench/POLYBENCH_HEAT_3D.cpp | 4 ++-- src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_1D.cpp | 4 ++-- src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_2D.cpp | 4 ++-- src/polybench/POLYBENCH_MVT-OMP.cpp | 2 +- src/polybench/POLYBENCH_MVT-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_MVT-Seq.cpp | 2 +- src/polybench/POLYBENCH_MVT.cpp | 4 ++-- src/stream/ADD-OMP.cpp | 2 +- src/stream/ADD-OMPTarget.cpp | 2 +- src/stream/ADD-Seq.cpp | 2 +- src/stream/ADD.cpp | 4 ++-- src/stream/COPY-OMP.cpp | 2 +- src/stream/COPY-OMPTarget.cpp | 2 +- src/stream/COPY-Seq.cpp | 2 +- src/stream/COPY.cpp | 4 ++-- src/stream/DOT-OMP.cpp | 2 +- src/stream/DOT-OMPTarget.cpp | 2 +- src/stream/DOT-Seq.cpp | 2 +- src/stream/DOT.cpp | 4 ++-- src/stream/MUL-OMP.cpp | 2 +- src/stream/MUL-OMPTarget.cpp | 2 +- src/stream/MUL-Seq.cpp | 2 +- src/stream/MUL.cpp | 4 ++-- src/stream/TRIAD-OMP.cpp | 2 +- src/stream/TRIAD-OMPTarget.cpp | 2 +- src/stream/TRIAD-Seq.cpp | 2 +- src/stream/TRIAD.cpp | 4 ++-- 232 files changed, 293 insertions(+), 293 deletions(-) diff --git a/src/algorithm/SORT-Cuda.cpp b/src/algorithm/SORT-Cuda.cpp index abba6336f..1c7a67381 100644 --- a/src/algorithm/SORT-Cuda.cpp +++ b/src/algorithm/SORT-Cuda.cpp @@ -35,7 +35,7 @@ 
namespace algorithm deallocCudaDeviceData(x); -void SORT::runCudaVariant(VariantID vid, size_t /*tune_idx*/) +void SORT::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/algorithm/SORT-Hip.cpp b/src/algorithm/SORT-Hip.cpp index aa6ab1a03..80f173a00 100644 --- a/src/algorithm/SORT-Hip.cpp +++ b/src/algorithm/SORT-Hip.cpp @@ -35,7 +35,7 @@ namespace algorithm deallocHipDeviceData(x); -void SORT::runHipVariant(VariantID vid, size_t /*tune_idx*/) +void SORT::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/algorithm/SORT-OMP.cpp b/src/algorithm/SORT-OMP.cpp index 0528bcbce..f62a7dbf6 100644 --- a/src/algorithm/SORT-OMP.cpp +++ b/src/algorithm/SORT-OMP.cpp @@ -18,7 +18,7 @@ namespace algorithm { -void SORT::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void SORT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/algorithm/SORT-Seq.cpp b/src/algorithm/SORT-Seq.cpp index 0091efdfc..ebba50994 100644 --- a/src/algorithm/SORT-Seq.cpp +++ b/src/algorithm/SORT-Seq.cpp @@ -18,7 +18,7 @@ namespace algorithm { -void SORT::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void SORT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/algorithm/SORT.cpp b/src/algorithm/SORT.cpp index 6a199b53a..b9722c4d7 100644 --- a/src/algorithm/SORT.cpp +++ b/src/algorithm/SORT.cpp @@ -47,7 +47,7 @@ SORT::~SORT() { } -void SORT::setUp(VariantID vid, size_t /*tune_idx*/) +void SORT::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataRandValue(m_x, getActualProblemSize()*getRunReps(), vid); } @@ -57,7 +57,7 @@ void SORT::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_x, getActualProblemSize()*getRunReps()); } -void SORT::tearDown(VariantID vid, size_t /*tune_idx*/) +void SORT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_x); diff --git a/src/algorithm/SORT.hpp b/src/algorithm/SORT.hpp index 5999ea637..0670c9dd0 100644 --- a/src/algorithm/SORT.hpp +++ b/src/algorithm/SORT.hpp @@ -50,7 +50,7 @@ class SORT : public KernelBase void runOpenMPVariant(VariantID vid, size_t tune_idx); void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); - void runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) + void runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { getCout() << "\n SORT : Unknown OMP Target variant id = " << vid << std::endl; } diff --git a/src/algorithm/SORTPAIRS-Cuda.cpp b/src/algorithm/SORTPAIRS-Cuda.cpp index c44186ff2..9e2c1ec93 100644 --- a/src/algorithm/SORTPAIRS-Cuda.cpp +++ b/src/algorithm/SORTPAIRS-Cuda.cpp @@ -38,7 +38,7 @@ namespace algorithm deallocCudaDeviceData(i); -void SORTPAIRS::runCudaVariant(VariantID vid, size_t /*tune_idx*/) +void SORTPAIRS::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/algorithm/SORTPAIRS-Hip.cpp b/src/algorithm/SORTPAIRS-Hip.cpp index 884b401e0..f927a15d9 100644 --- a/src/algorithm/SORTPAIRS-Hip.cpp +++ b/src/algorithm/SORTPAIRS-Hip.cpp @@ -38,7 +38,7 @@ namespace 
algorithm deallocHipDeviceData(i); -void SORTPAIRS::runHipVariant(VariantID vid, size_t /*tune_idx*/) +void SORTPAIRS::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/algorithm/SORTPAIRS-OMP.cpp b/src/algorithm/SORTPAIRS-OMP.cpp index e36ec1466..5fabe18da 100644 --- a/src/algorithm/SORTPAIRS-OMP.cpp +++ b/src/algorithm/SORTPAIRS-OMP.cpp @@ -18,7 +18,7 @@ namespace algorithm { -void SORTPAIRS::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void SORTPAIRS::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/algorithm/SORTPAIRS-Seq.cpp b/src/algorithm/SORTPAIRS-Seq.cpp index 109bc201c..64fee5d1e 100644 --- a/src/algorithm/SORTPAIRS-Seq.cpp +++ b/src/algorithm/SORTPAIRS-Seq.cpp @@ -21,7 +21,7 @@ namespace algorithm { -void SORTPAIRS::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void SORTPAIRS::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/algorithm/SORTPAIRS.cpp b/src/algorithm/SORTPAIRS.cpp index 335ff6e28..df175844e 100644 --- a/src/algorithm/SORTPAIRS.cpp +++ b/src/algorithm/SORTPAIRS.cpp @@ -47,7 +47,7 @@ SORTPAIRS::~SORTPAIRS() { } -void SORTPAIRS::setUp(VariantID vid, size_t /*tune_idx*/) +void SORTPAIRS::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataRandValue(m_x, getActualProblemSize()*getRunReps(), vid); allocAndInitDataRandValue(m_i, getActualProblemSize()*getRunReps(), vid); @@ -59,7 +59,7 @@ void SORTPAIRS::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_i, getActualProblemSize()*getRunReps()); } -void SORTPAIRS::tearDown(VariantID vid, size_t /*tune_idx*/) +void SORTPAIRS::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_x); diff --git a/src/algorithm/SORTPAIRS.hpp b/src/algorithm/SORTPAIRS.hpp index b4a2aa02d..658d3ad4b 100644 --- a/src/algorithm/SORTPAIRS.hpp +++ b/src/algorithm/SORTPAIRS.hpp @@ -49,7 +49,7 @@ class SORTPAIRS : public KernelBase void runOpenMPVariant(VariantID vid, size_t tune_idx); void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); - void runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) + void runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { getCout() << "\n SORTPAIRS : Unknown OMP Target variant id = " << vid << std::endl; } diff --git a/src/apps/DEL_DOT_VEC_2D-OMP.cpp b/src/apps/DEL_DOT_VEC_2D-OMP.cpp index 79ffce156..91250c796 100644 --- a/src/apps/DEL_DOT_VEC_2D-OMP.cpp +++ b/src/apps/DEL_DOT_VEC_2D-OMP.cpp @@ -22,7 +22,7 @@ namespace apps { -void DEL_DOT_VEC_2D::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void DEL_DOT_VEC_2D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp b/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp index 742647fef..479ab795e 100644 --- a/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp +++ b/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp @@ -51,7 +51,7 @@ namespace apps deallocOpenMPDeviceData(real_zones, did); -void DEL_DOT_VEC_2D::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void DEL_DOT_VEC_2D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const 
Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/apps/DEL_DOT_VEC_2D-Seq.cpp b/src/apps/DEL_DOT_VEC_2D-Seq.cpp index 23e4803c6..07100da04 100644 --- a/src/apps/DEL_DOT_VEC_2D-Seq.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Seq.cpp @@ -22,7 +22,7 @@ namespace apps { -void DEL_DOT_VEC_2D::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void DEL_DOT_VEC_2D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/apps/DEL_DOT_VEC_2D.cpp b/src/apps/DEL_DOT_VEC_2D.cpp index b48de0fbc..24121e157 100644 --- a/src/apps/DEL_DOT_VEC_2D.cpp +++ b/src/apps/DEL_DOT_VEC_2D.cpp @@ -69,7 +69,7 @@ DEL_DOT_VEC_2D::~DEL_DOT_VEC_2D() delete m_domain; } -void DEL_DOT_VEC_2D::setUp(VariantID vid, size_t /*tune_idx*/) +void DEL_DOT_VEC_2D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_x, m_array_length, 0.0, vid); allocAndInitDataConst(m_y, m_array_length, 0.0, vid); @@ -92,7 +92,7 @@ void DEL_DOT_VEC_2D::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_div, m_array_length); } -void DEL_DOT_VEC_2D::tearDown(VariantID vid, size_t /*tune_idx*/) +void DEL_DOT_VEC_2D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; diff --git a/src/apps/DIFFUSION3DPA-OMP.cpp b/src/apps/DIFFUSION3DPA-OMP.cpp index fec74ccde..7d32b2b41 100644 --- a/src/apps/DIFFUSION3DPA-OMP.cpp +++ b/src/apps/DIFFUSION3DPA-OMP.cpp @@ -18,7 +18,7 @@ namespace rajaperf { namespace apps { -void DIFFUSION3DPA::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { +void DIFFUSION3DPA::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/apps/DIFFUSION3DPA-OMPTarget.cpp b/src/apps/DIFFUSION3DPA-OMPTarget.cpp index 7f4273747..16cff1087 100644 --- a/src/apps/DIFFUSION3DPA-OMPTarget.cpp +++ b/src/apps/DIFFUSION3DPA-OMPTarget.cpp @@ -19,7 +19,7 @@ namespace rajaperf { namespace apps { -void DIFFUSION3DPA::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) { +void DIFFUSION3DPA::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); switch (vid) { diff --git a/src/apps/DIFFUSION3DPA-Seq.cpp b/src/apps/DIFFUSION3DPA-Seq.cpp index d61151624..a84b4bc61 100644 --- a/src/apps/DIFFUSION3DPA-Seq.cpp +++ b/src/apps/DIFFUSION3DPA-Seq.cpp @@ -18,7 +18,7 @@ namespace rajaperf { namespace apps { -void DIFFUSION3DPA::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { +void DIFFUSION3DPA::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); DIFFUSION3DPA_DATA_SETUP; diff --git a/src/apps/DIFFUSION3DPA.cpp b/src/apps/DIFFUSION3DPA.cpp index 915f324c1..3844668c6 100644 --- a/src/apps/DIFFUSION3DPA.cpp +++ b/src/apps/DIFFUSION3DPA.cpp @@ -71,7 +71,7 @@ DIFFUSION3DPA::~DIFFUSION3DPA() { } -void DIFFUSION3DPA::setUp(VariantID vid, size_t /*tune_idx*/) +void DIFFUSION3DPA::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_B, int(DPA_Q1D*DPA_D1D), Real_type(1.0), vid); @@ -86,7 +86,7 @@ void DIFFUSION3DPA::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_Y, DPA_D1D*DPA_D1D*DPA_D1D*m_NE); } -void DIFFUSION3DPA::tearDown(VariantID vid, size_t /*tune_idx*/) +void DIFFUSION3DPA::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; diff --git 
a/src/apps/ENERGY-OMP.cpp b/src/apps/ENERGY-OMP.cpp index c617a6e0d..f06c2efe9 100644 --- a/src/apps/ENERGY-OMP.cpp +++ b/src/apps/ENERGY-OMP.cpp @@ -18,7 +18,7 @@ namespace apps { -void ENERGY::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void ENERGY::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/apps/ENERGY-OMPTarget.cpp b/src/apps/ENERGY-OMPTarget.cpp index 97b53d6d4..3027bd25f 100644 --- a/src/apps/ENERGY-OMPTarget.cpp +++ b/src/apps/ENERGY-OMPTarget.cpp @@ -65,7 +65,7 @@ namespace apps deallocOpenMPDeviceData(qq_old, did); \ deallocOpenMPDeviceData(vnewc, did); -void ENERGY::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void ENERGY::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/apps/ENERGY-Seq.cpp b/src/apps/ENERGY-Seq.cpp index b24f08a9b..5bc229c6e 100644 --- a/src/apps/ENERGY-Seq.cpp +++ b/src/apps/ENERGY-Seq.cpp @@ -18,7 +18,7 @@ namespace apps { -void ENERGY::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void ENERGY::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/apps/ENERGY.cpp b/src/apps/ENERGY.cpp index 303a6e8b8..9ed11381a 100644 --- a/src/apps/ENERGY.cpp +++ b/src/apps/ENERGY.cpp @@ -68,7 +68,7 @@ ENERGY::~ENERGY() { } -void ENERGY::setUp(VariantID vid, size_t /*tune_idx*/) +void ENERGY::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_e_new, getActualProblemSize(), 0.0, vid); allocAndInitData(m_e_old, getActualProblemSize(), vid); @@ -98,7 +98,7 @@ void ENERGY::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_q_new, getActualProblemSize()); } -void ENERGY::tearDown(VariantID vid, size_t /*tune_idx*/) +void ENERGY::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; diff --git a/src/apps/FIR-OMP.cpp b/src/apps/FIR-OMP.cpp index 195cb22d3..7a5415130 100644 --- a/src/apps/FIR-OMP.cpp +++ b/src/apps/FIR-OMP.cpp @@ -19,7 +19,7 @@ namespace apps { -void FIR::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void FIR::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/apps/FIR-OMPTarget.cpp b/src/apps/FIR-OMPTarget.cpp index 7f2f04265..90be7bd3a 100644 --- a/src/apps/FIR-OMPTarget.cpp +++ b/src/apps/FIR-OMPTarget.cpp @@ -46,7 +46,7 @@ namespace apps deallocOpenMPDeviceData(coeff, did); -void FIR::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void FIR::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/apps/FIR-Seq.cpp b/src/apps/FIR-Seq.cpp index 69b58c9c8..3a196a1f1 100644 --- a/src/apps/FIR-Seq.cpp +++ b/src/apps/FIR-Seq.cpp @@ -19,7 +19,7 @@ namespace apps { -void FIR::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void FIR::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/apps/FIR.cpp b/src/apps/FIR.cpp index 3589debaf..8dd25358e 100644 --- a/src/apps/FIR.cpp +++ b/src/apps/FIR.cpp @@ -62,7 +62,7 @@ FIR::~FIR() { } -void FIR::setUp(VariantID vid, size_t /*tune_idx*/) +void FIR::setUp(VariantID vid, 
size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitData(m_in, getActualProblemSize(), vid); allocAndInitDataConst(m_out, getActualProblemSize(), 0.0, vid); @@ -73,7 +73,7 @@ void FIR::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_out, getActualProblemSize(), checksum_scale_factor ); } -void FIR::tearDown(VariantID vid, size_t /*tune_idx*/) +void FIR::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; diff --git a/src/apps/HALOEXCHANGE-OMP.cpp b/src/apps/HALOEXCHANGE-OMP.cpp index e24d5c294..daa1dbad8 100644 --- a/src/apps/HALOEXCHANGE-OMP.cpp +++ b/src/apps/HALOEXCHANGE-OMP.cpp @@ -18,7 +18,7 @@ namespace apps { -void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/apps/HALOEXCHANGE-OMPTarget.cpp b/src/apps/HALOEXCHANGE-OMPTarget.cpp index 5bab9e060..4c8f1655c 100644 --- a/src/apps/HALOEXCHANGE-OMPTarget.cpp +++ b/src/apps/HALOEXCHANGE-OMPTarget.cpp @@ -51,7 +51,7 @@ namespace apps } -void HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); diff --git a/src/apps/HALOEXCHANGE-Seq.cpp b/src/apps/HALOEXCHANGE-Seq.cpp index 65405741a..755a47390 100644 --- a/src/apps/HALOEXCHANGE-Seq.cpp +++ b/src/apps/HALOEXCHANGE-Seq.cpp @@ -18,7 +18,7 @@ namespace apps { -void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); diff --git a/src/apps/HALOEXCHANGE.cpp b/src/apps/HALOEXCHANGE.cpp index 3f96ee337..890fcf0a9 100644 --- a/src/apps/HALOEXCHANGE.cpp +++ b/src/apps/HALOEXCHANGE.cpp @@ -104,7 +104,7 @@ HALOEXCHANGE::~HALOEXCHANGE() { } -void HALOEXCHANGE::setUp(VariantID vid, size_t /*tune_idx*/) +void HALOEXCHANGE::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { m_vars.resize(m_num_vars, nullptr); for (Index_type v = 0; v < m_num_vars; ++v) { @@ -139,7 +139,7 @@ void HALOEXCHANGE::updateChecksum(VariantID vid, size_t tune_idx) } } -void HALOEXCHANGE::tearDown(VariantID vid, size_t /*tune_idx*/) +void HALOEXCHANGE::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { for (int l = 0; l < s_num_neighbors; ++l) { deallocData(m_buffers[l]); diff --git a/src/apps/HALOEXCHANGE_FUSED-OMP.cpp b/src/apps/HALOEXCHANGE_FUSED-OMP.cpp index 626074e02..0400c20b0 100644 --- a/src/apps/HALOEXCHANGE_FUSED-OMP.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-OMP.cpp @@ -18,7 +18,7 @@ namespace apps { -void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp b/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp index 674cc73d7..7c465681c 100644 --- a/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp @@ -87,7 +87,7 @@ namespace apps delete[] h_unpack_ptrs; -void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); diff --git 
a/src/apps/HALOEXCHANGE_FUSED-Seq.cpp b/src/apps/HALOEXCHANGE_FUSED-Seq.cpp index fc1213b44..984aaf724 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Seq.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Seq.cpp @@ -18,7 +18,7 @@ namespace apps { -void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); diff --git a/src/apps/HALOEXCHANGE_FUSED.cpp b/src/apps/HALOEXCHANGE_FUSED.cpp index f95be24fb..406cc654b 100644 --- a/src/apps/HALOEXCHANGE_FUSED.cpp +++ b/src/apps/HALOEXCHANGE_FUSED.cpp @@ -104,7 +104,7 @@ HALOEXCHANGE_FUSED::~HALOEXCHANGE_FUSED() { } -void HALOEXCHANGE_FUSED::setUp(VariantID vid, size_t /*tune_idx*/) +void HALOEXCHANGE_FUSED::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { m_vars.resize(m_num_vars, nullptr); for (Index_type v = 0; v < m_num_vars; ++v) { @@ -139,7 +139,7 @@ void HALOEXCHANGE_FUSED::updateChecksum(VariantID vid, size_t tune_idx) } } -void HALOEXCHANGE_FUSED::tearDown(VariantID vid, size_t /*tune_idx*/) +void HALOEXCHANGE_FUSED::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { for (int l = 0; l < s_num_neighbors; ++l) { deallocData(m_buffers[l]); diff --git a/src/apps/LTIMES-OMP.cpp b/src/apps/LTIMES-OMP.cpp index 995351ab4..91d0faeac 100644 --- a/src/apps/LTIMES-OMP.cpp +++ b/src/apps/LTIMES-OMP.cpp @@ -18,7 +18,7 @@ namespace apps { -void LTIMES::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void LTIMES::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/apps/LTIMES-OMPTarget.cpp b/src/apps/LTIMES-OMPTarget.cpp index f641082b4..e89e6cbfa 100644 --- a/src/apps/LTIMES-OMPTarget.cpp +++ b/src/apps/LTIMES-OMPTarget.cpp @@ -36,7 +36,7 @@ namespace apps deallocOpenMPDeviceData(psidat, did); -void LTIMES::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void LTIMES::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); diff --git a/src/apps/LTIMES-Seq.cpp b/src/apps/LTIMES-Seq.cpp index 2a27cadf5..92fd7c319 100644 --- a/src/apps/LTIMES-Seq.cpp +++ b/src/apps/LTIMES-Seq.cpp @@ -18,7 +18,7 @@ namespace apps { -void LTIMES::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void LTIMES::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); diff --git a/src/apps/LTIMES.cpp b/src/apps/LTIMES.cpp index 29f26896c..ede451a0a 100644 --- a/src/apps/LTIMES.cpp +++ b/src/apps/LTIMES.cpp @@ -83,7 +83,7 @@ LTIMES::~LTIMES() { } -void LTIMES::setUp(VariantID vid, size_t /*tune_idx*/) +void LTIMES::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_phidat, int(m_philen), Real_type(0.0), vid); allocAndInitData(m_elldat, int(m_elllen), vid); @@ -95,7 +95,7 @@ void LTIMES::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_phidat, m_philen, checksum_scale_factor ); } -void LTIMES::tearDown(VariantID vid, size_t /*tune_idx*/) +void LTIMES::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; diff --git a/src/apps/LTIMES_NOVIEW-OMP.cpp b/src/apps/LTIMES_NOVIEW-OMP.cpp index cc1f95168..e41853651 100644 --- a/src/apps/LTIMES_NOVIEW-OMP.cpp +++ b/src/apps/LTIMES_NOVIEW-OMP.cpp @@ -18,7 +18,7 @@ namespace apps { -void LTIMES_NOVIEW::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void 
LTIMES_NOVIEW::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/apps/LTIMES_NOVIEW-OMPTarget.cpp b/src/apps/LTIMES_NOVIEW-OMPTarget.cpp index 85ae2662e..ca49c8859 100644 --- a/src/apps/LTIMES_NOVIEW-OMPTarget.cpp +++ b/src/apps/LTIMES_NOVIEW-OMPTarget.cpp @@ -36,7 +36,7 @@ namespace apps deallocOpenMPDeviceData(psidat, did); -void LTIMES_NOVIEW::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void LTIMES_NOVIEW::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); diff --git a/src/apps/LTIMES_NOVIEW-Seq.cpp b/src/apps/LTIMES_NOVIEW-Seq.cpp index c1bff6c2a..7da062715 100644 --- a/src/apps/LTIMES_NOVIEW-Seq.cpp +++ b/src/apps/LTIMES_NOVIEW-Seq.cpp @@ -18,7 +18,7 @@ namespace apps { -void LTIMES_NOVIEW::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void LTIMES_NOVIEW::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); diff --git a/src/apps/LTIMES_NOVIEW.cpp b/src/apps/LTIMES_NOVIEW.cpp index db4c99ee4..c0c0f7413 100644 --- a/src/apps/LTIMES_NOVIEW.cpp +++ b/src/apps/LTIMES_NOVIEW.cpp @@ -82,7 +82,7 @@ LTIMES_NOVIEW::~LTIMES_NOVIEW() { } -void LTIMES_NOVIEW::setUp(VariantID vid, size_t /*tune_idx*/) +void LTIMES_NOVIEW::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_phidat, int(m_philen), Real_type(0.0), vid); allocAndInitData(m_elldat, int(m_elllen), vid); @@ -94,7 +94,7 @@ void LTIMES_NOVIEW::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_phidat, m_philen, checksum_scale_factor ); } -void LTIMES_NOVIEW::tearDown(VariantID vid, size_t /*tune_idx*/) +void LTIMES_NOVIEW::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; diff --git a/src/apps/MASS3DPA-Cuda.cpp b/src/apps/MASS3DPA-Cuda.cpp index 5c7b5623e..3f3011fd3 100644 --- a/src/apps/MASS3DPA-Cuda.cpp +++ b/src/apps/MASS3DPA-Cuda.cpp @@ -102,7 +102,7 @@ __global__ void Mass3DPA(const Real_ptr B, const Real_ptr Bt, } } -void MASS3DPA::runCudaVariant(VariantID vid, size_t /*tune_idx*/) { +void MASS3DPA::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); MASS3DPA_DATA_SETUP; diff --git a/src/apps/MASS3DPA-Hip.cpp b/src/apps/MASS3DPA-Hip.cpp index cf8fd95a2..01ffead73 100644 --- a/src/apps/MASS3DPA-Hip.cpp +++ b/src/apps/MASS3DPA-Hip.cpp @@ -102,7 +102,7 @@ __global__ void Mass3DPA(const Real_ptr B, const Real_ptr Bt, } } -void MASS3DPA::runHipVariant(VariantID vid, size_t /*tune_idx*/) { +void MASS3DPA::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); MASS3DPA_DATA_SETUP; diff --git a/src/apps/MASS3DPA-OMP.cpp b/src/apps/MASS3DPA-OMP.cpp index b2520e525..49f74774b 100644 --- a/src/apps/MASS3DPA-OMP.cpp +++ b/src/apps/MASS3DPA-OMP.cpp @@ -19,7 +19,7 @@ namespace rajaperf { namespace apps { -void MASS3DPA::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) { +void MASS3DPA::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/apps/MASS3DPA-OMPTarget.cpp b/src/apps/MASS3DPA-OMPTarget.cpp index 674bee300..86021b52d 100644 --- a/src/apps/MASS3DPA-OMPTarget.cpp +++ b/src/apps/MASS3DPA-OMPTarget.cpp @@ -20,7 +20,7 @@ namespace rajaperf { namespace apps { -void MASS3DPA::runOpenMPTargetVariant(VariantID 
vid, size_t /*tune_idx*/) { +void MASS3DPA::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); switch (vid) { diff --git a/src/apps/MASS3DPA-Seq.cpp b/src/apps/MASS3DPA-Seq.cpp index 55f86d6b7..d276c4a57 100644 --- a/src/apps/MASS3DPA-Seq.cpp +++ b/src/apps/MASS3DPA-Seq.cpp @@ -19,7 +19,7 @@ namespace rajaperf { namespace apps { -void MASS3DPA::runSeqVariant(VariantID vid, size_t /*tune_idx*/) { +void MASS3DPA::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); MASS3DPA_DATA_SETUP; diff --git a/src/apps/MASS3DPA.cpp b/src/apps/MASS3DPA.cpp index 9e3069e62..288e7ff82 100644 --- a/src/apps/MASS3DPA.cpp +++ b/src/apps/MASS3DPA.cpp @@ -67,7 +67,7 @@ MASS3DPA::~MASS3DPA() { } -void MASS3DPA::setUp(VariantID vid, size_t /*tune_idx*/) +void MASS3DPA::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_B, int(MPA_Q1D*MPA_D1D), Real_type(1.0), vid); @@ -82,7 +82,7 @@ void MASS3DPA::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_Y, MPA_D1D*MPA_D1D*MPA_D1D*m_NE); } -void MASS3DPA::tearDown(VariantID vid, size_t /*tune_idx*/) +void MASS3DPA::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; diff --git a/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp b/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp index 94c690742..baaf60664 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp @@ -20,7 +20,7 @@ namespace apps { -void NODAL_ACCUMULATION_3D::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void NODAL_ACCUMULATION_3D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp b/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp index 45c84c096..f19189c64 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp @@ -43,7 +43,7 @@ namespace apps deallocOpenMPDeviceData(real_zones, did); -void NODAL_ACCUMULATION_3D::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void NODAL_ACCUMULATION_3D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp b/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp index 2fdfa0c64..61449d0f6 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp @@ -20,7 +20,7 @@ namespace apps { -void NODAL_ACCUMULATION_3D::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void NODAL_ACCUMULATION_3D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/apps/NODAL_ACCUMULATION_3D.cpp b/src/apps/NODAL_ACCUMULATION_3D.cpp index 895e1e7ba..5fd512fb7 100644 --- a/src/apps/NODAL_ACCUMULATION_3D.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D.cpp @@ -74,7 +74,7 @@ NODAL_ACCUMULATION_3D::~NODAL_ACCUMULATION_3D() delete m_domain; } -void NODAL_ACCUMULATION_3D::setUp(VariantID vid, size_t /*tune_idx*/) +void NODAL_ACCUMULATION_3D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_x, m_nodal_array_length, 0.0, vid); allocAndInitDataConst(m_vol, m_zonal_array_length, 1.0, vid); @@ -85,7 +85,7 @@ void NODAL_ACCUMULATION_3D::updateChecksum(VariantID vid, size_t 
tune_idx) checksum[vid].at(tune_idx) += calcChecksum(m_x, m_nodal_array_length, checksum_scale_factor ); } -void NODAL_ACCUMULATION_3D::tearDown(VariantID vid, size_t /*tune_idx*/) +void NODAL_ACCUMULATION_3D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; diff --git a/src/apps/PRESSURE-OMP.cpp b/src/apps/PRESSURE-OMP.cpp index 9f9a10047..867e72586 100644 --- a/src/apps/PRESSURE-OMP.cpp +++ b/src/apps/PRESSURE-OMP.cpp @@ -18,7 +18,7 @@ namespace apps { -void PRESSURE::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void PRESSURE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/apps/PRESSURE-OMPTarget.cpp b/src/apps/PRESSURE-OMPTarget.cpp index 10d367e99..8c25f44c7 100644 --- a/src/apps/PRESSURE-OMPTarget.cpp +++ b/src/apps/PRESSURE-OMPTarget.cpp @@ -45,7 +45,7 @@ namespace apps deallocOpenMPDeviceData(vnewc, did); -void PRESSURE::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void PRESSURE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/apps/PRESSURE-Seq.cpp b/src/apps/PRESSURE-Seq.cpp index 64a5ac85e..c2f79e977 100644 --- a/src/apps/PRESSURE-Seq.cpp +++ b/src/apps/PRESSURE-Seq.cpp @@ -18,7 +18,7 @@ namespace apps { -void PRESSURE::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void PRESSURE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/apps/PRESSURE.cpp b/src/apps/PRESSURE.cpp index e47e061cd..df2cb744f 100644 --- a/src/apps/PRESSURE.cpp +++ b/src/apps/PRESSURE.cpp @@ -58,7 +58,7 @@ PRESSURE::~PRESSURE() { } -void PRESSURE::setUp(VariantID vid, size_t /*tune_idx*/) +void PRESSURE::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitData(m_compression, getActualProblemSize(), vid); allocAndInitData(m_bvc, getActualProblemSize(), vid); @@ -77,7 +77,7 @@ void PRESSURE::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_p_new, getActualProblemSize()); } -void PRESSURE::tearDown(VariantID vid, size_t /*tune_idx*/) +void PRESSURE::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; diff --git a/src/apps/VOL3D-OMP.cpp b/src/apps/VOL3D-OMP.cpp index 2be10a75b..0f773876c 100644 --- a/src/apps/VOL3D-OMP.cpp +++ b/src/apps/VOL3D-OMP.cpp @@ -20,7 +20,7 @@ namespace apps { -void VOL3D::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void VOL3D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/apps/VOL3D-OMPTarget.cpp b/src/apps/VOL3D-OMPTarget.cpp index 8c9a20cb3..75d8fb2b0 100644 --- a/src/apps/VOL3D-OMPTarget.cpp +++ b/src/apps/VOL3D-OMPTarget.cpp @@ -45,7 +45,7 @@ namespace apps deallocOpenMPDeviceData(vol, did); -void VOL3D::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void VOL3D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = m_domain->fpz; diff --git a/src/apps/VOL3D-Seq.cpp b/src/apps/VOL3D-Seq.cpp index 2435e3030..bb4227280 100644 --- a/src/apps/VOL3D-Seq.cpp +++ b/src/apps/VOL3D-Seq.cpp @@ -20,7 +20,7 @@ namespace apps { -void VOL3D::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void VOL3D::runSeqVariant(VariantID vid, 
size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = m_domain->fpz; diff --git a/src/apps/VOL3D.cpp b/src/apps/VOL3D.cpp index 4555e261c..fd2ebb5aa 100644 --- a/src/apps/VOL3D.cpp +++ b/src/apps/VOL3D.cpp @@ -71,7 +71,7 @@ VOL3D::~VOL3D() delete m_domain; } -void VOL3D::setUp(VariantID vid, size_t /*tune_idx*/) +void VOL3D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_x, m_array_length, 0.0, vid); allocAndInitDataConst(m_y, m_array_length, 0.0, vid); @@ -92,7 +92,7 @@ void VOL3D::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_vol, m_array_length, checksum_scale_factor ); } -void VOL3D::tearDown(VariantID vid, size_t /*tune_idx*/) +void VOL3D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; diff --git a/src/apps/WIP-COUPLE.cpp b/src/apps/WIP-COUPLE.cpp index 3083bca33..0f25f5ee0 100644 --- a/src/apps/WIP-COUPLE.cpp +++ b/src/apps/WIP-COUPLE.cpp @@ -58,7 +58,7 @@ COUPLE::~COUPLE() delete m_domain; } -void COUPLE::setUp(VariantID vid, size_t /*tune_idx*/) +void COUPLE::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { Index_type max_loop_index = m_domain->lrn; @@ -189,7 +189,7 @@ void COUPLE::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_t2, max_loop_index); } -void COUPLE::tearDown(VariantID vid, size_t /*tune_idx*/) +void COUPLE::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; diff --git a/src/apps/WIP-COUPLE.hpp b/src/apps/WIP-COUPLE.hpp index 0c7509096..cdafcd5eb 100644 --- a/src/apps/WIP-COUPLE.hpp +++ b/src/apps/WIP-COUPLE.hpp @@ -166,11 +166,11 @@ class COUPLE : public KernelBase void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid, size_t /*tune_idx*/) {(void) vid;} - void runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) {(void) vid;} - void runCudaVariant(VariantID vid, size_t /*tune_idx*/) {(void) vid;} - void runHipVariant(VariantID vid, size_t /*tune_idx*/) {(void) vid;} - void runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) {(void) vid;} + void runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) {(void) vid;} + void runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) {(void) vid;} + void runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) {(void) vid;} + void runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) {(void) vid;} + void runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) {(void) vid;} private: Complex_ptr m_t0; diff --git a/src/basic/DAXPY-OMP.cpp b/src/basic/DAXPY-OMP.cpp index c72f08d43..a57e1709d 100644 --- a/src/basic/DAXPY-OMP.cpp +++ b/src/basic/DAXPY-OMP.cpp @@ -18,7 +18,7 @@ namespace basic { -void DAXPY::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void DAXPY::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/DAXPY-OMPTarget.cpp b/src/basic/DAXPY-OMPTarget.cpp index 479dcca4e..a3862d80a 100644 --- a/src/basic/DAXPY-OMPTarget.cpp +++ b/src/basic/DAXPY-OMPTarget.cpp @@ -39,7 +39,7 @@ namespace basic deallocOpenMPDeviceData(y, did); -void DAXPY::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void DAXPY::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = 
getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/DAXPY-Seq.cpp b/src/basic/DAXPY-Seq.cpp index 70f0710cb..3a262561f 100644 --- a/src/basic/DAXPY-Seq.cpp +++ b/src/basic/DAXPY-Seq.cpp @@ -18,7 +18,7 @@ namespace basic { -void DAXPY::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void DAXPY::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/DAXPY.cpp b/src/basic/DAXPY.cpp index e6caab998..6d6133eb6 100644 --- a/src/basic/DAXPY.cpp +++ b/src/basic/DAXPY.cpp @@ -57,7 +57,7 @@ DAXPY::~DAXPY() { } -void DAXPY::setUp(VariantID vid, size_t /*tune_idx*/) +void DAXPY::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_y, getActualProblemSize(), 0.0, vid); allocAndInitData(m_x, getActualProblemSize(), vid); @@ -69,7 +69,7 @@ void DAXPY::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid].at(tune_idx) += calcChecksum(m_y, getActualProblemSize()); } -void DAXPY::tearDown(VariantID vid, size_t /*tune_idx*/) +void DAXPY::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_x); diff --git a/src/basic/DAXPY_ATOMIC-OMP.cpp b/src/basic/DAXPY_ATOMIC-OMP.cpp index 0e57d24d3..b28330d7e 100644 --- a/src/basic/DAXPY_ATOMIC-OMP.cpp +++ b/src/basic/DAXPY_ATOMIC-OMP.cpp @@ -18,7 +18,7 @@ namespace basic { -void DAXPY_ATOMIC::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void DAXPY_ATOMIC::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/DAXPY_ATOMIC-OMPTarget.cpp b/src/basic/DAXPY_ATOMIC-OMPTarget.cpp index adb96dca0..7b19b0cf7 100644 --- a/src/basic/DAXPY_ATOMIC-OMPTarget.cpp +++ b/src/basic/DAXPY_ATOMIC-OMPTarget.cpp @@ -39,7 +39,7 @@ namespace basic deallocOpenMPDeviceData(y, did); -void DAXPY_ATOMIC::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void DAXPY_ATOMIC::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/DAXPY_ATOMIC-Seq.cpp b/src/basic/DAXPY_ATOMIC-Seq.cpp index 40aaac3ff..8eabef6cd 100644 --- a/src/basic/DAXPY_ATOMIC-Seq.cpp +++ b/src/basic/DAXPY_ATOMIC-Seq.cpp @@ -18,7 +18,7 @@ namespace basic { -void DAXPY_ATOMIC::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void DAXPY_ATOMIC::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/DAXPY_ATOMIC.cpp b/src/basic/DAXPY_ATOMIC.cpp index 7e5d40c71..1e5d4e00e 100644 --- a/src/basic/DAXPY_ATOMIC.cpp +++ b/src/basic/DAXPY_ATOMIC.cpp @@ -57,7 +57,7 @@ DAXPY_ATOMIC::~DAXPY_ATOMIC() { } -void DAXPY_ATOMIC::setUp(VariantID vid, size_t /*tune_idx*/) +void DAXPY_ATOMIC::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_y, getActualProblemSize(), 0.0, vid); allocAndInitData(m_x, getActualProblemSize(), vid); @@ -69,7 +69,7 @@ void DAXPY_ATOMIC::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid].at(tune_idx) += calcChecksum(m_y, getActualProblemSize()); } -void DAXPY_ATOMIC::tearDown(VariantID vid, size_t /*tune_idx*/) +void DAXPY_ATOMIC::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_x); diff --git a/src/basic/IF_QUAD-OMP.cpp b/src/basic/IF_QUAD-OMP.cpp index d41fb3553..93ea37e88 100644 --- 
a/src/basic/IF_QUAD-OMP.cpp +++ b/src/basic/IF_QUAD-OMP.cpp @@ -18,7 +18,7 @@ namespace basic { -void IF_QUAD::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void IF_QUAD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/IF_QUAD-OMPTarget.cpp b/src/basic/IF_QUAD-OMPTarget.cpp index a75c8ae1f..ca0a4ac0a 100644 --- a/src/basic/IF_QUAD-OMPTarget.cpp +++ b/src/basic/IF_QUAD-OMPTarget.cpp @@ -45,7 +45,7 @@ namespace basic deallocOpenMPDeviceData(x1, did); \ deallocOpenMPDeviceData(x2, did); -void IF_QUAD::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void IF_QUAD::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/IF_QUAD-Seq.cpp b/src/basic/IF_QUAD-Seq.cpp index 1bb733bc9..cb303701d 100644 --- a/src/basic/IF_QUAD-Seq.cpp +++ b/src/basic/IF_QUAD-Seq.cpp @@ -18,7 +18,7 @@ namespace basic { -void IF_QUAD::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void IF_QUAD::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/IF_QUAD.cpp b/src/basic/IF_QUAD.cpp index 789881577..69396d330 100644 --- a/src/basic/IF_QUAD.cpp +++ b/src/basic/IF_QUAD.cpp @@ -61,7 +61,7 @@ IF_QUAD::~IF_QUAD() { } -void IF_QUAD::setUp(VariantID vid, size_t /*tune_idx*/) +void IF_QUAD::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataRandSign(m_a, getActualProblemSize(), vid); allocAndInitData(m_b, getActualProblemSize(), vid); @@ -76,7 +76,7 @@ void IF_QUAD::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_x2, getActualProblemSize(), checksum_scale_factor ); } -void IF_QUAD::tearDown(VariantID vid, size_t /*tune_idx*/) +void IF_QUAD::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_a); diff --git a/src/basic/INIT3-OMP.cpp b/src/basic/INIT3-OMP.cpp index 70cd4df1e..8df233cc5 100644 --- a/src/basic/INIT3-OMP.cpp +++ b/src/basic/INIT3-OMP.cpp @@ -18,7 +18,7 @@ namespace basic { -void INIT3::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void INIT3::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/INIT3-OMPTarget.cpp b/src/basic/INIT3-OMPTarget.cpp index 10271a859..d2b5eb127 100644 --- a/src/basic/INIT3-OMPTarget.cpp +++ b/src/basic/INIT3-OMPTarget.cpp @@ -47,7 +47,7 @@ namespace basic deallocOpenMPDeviceData(in2, did); -void INIT3::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void INIT3::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/INIT3-Seq.cpp b/src/basic/INIT3-Seq.cpp index 429997569..1a1cb228a 100644 --- a/src/basic/INIT3-Seq.cpp +++ b/src/basic/INIT3-Seq.cpp @@ -18,7 +18,7 @@ namespace basic { -void INIT3::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void INIT3::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/INIT3.cpp b/src/basic/INIT3.cpp index 567ded16f..fc3fd024d 100644 --- a/src/basic/INIT3.cpp +++ b/src/basic/INIT3.cpp @@ -57,7 +57,7 @@ INIT3::~INIT3() { } -void INIT3::setUp(VariantID vid, size_t 
/*tune_idx*/)
+void INIT3::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
 {
   allocAndInitDataConst(m_out1, getActualProblemSize(), 0.0, vid);
   allocAndInitDataConst(m_out2, getActualProblemSize(), 0.0, vid);
@@ -73,7 +73,7 @@ void INIT3::updateChecksum(VariantID vid, size_t tune_idx)
   checksum[vid][tune_idx] += calcChecksum(m_out3, getActualProblemSize());
 }
-void INIT3::tearDown(VariantID vid, size_t /*tune_idx*/)
+void INIT3::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
 {
   (void) vid;
   deallocData(m_out1);
diff --git a/src/basic/INIT_VIEW1D-OMP.cpp b/src/basic/INIT_VIEW1D-OMP.cpp
index b09cd2656..a0544574d 100644
--- a/src/basic/INIT_VIEW1D-OMP.cpp
+++ b/src/basic/INIT_VIEW1D-OMP.cpp
@@ -18,7 +18,7 @@ namespace basic
 {
-void INIT_VIEW1D::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/)
+void INIT_VIEW1D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
 {
 #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP)
diff --git a/src/basic/INIT_VIEW1D-OMPTarget.cpp b/src/basic/INIT_VIEW1D-OMPTarget.cpp
index bedd30737..fba84b747 100644
--- a/src/basic/INIT_VIEW1D-OMPTarget.cpp
+++ b/src/basic/INIT_VIEW1D-OMPTarget.cpp
@@ -37,7 +37,7 @@ namespace basic
   deallocOpenMPDeviceData(a, did);
-void INIT_VIEW1D::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/)
+void INIT_VIEW1D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
 {
   const Index_type run_reps = getRunReps();
   const Index_type ibegin = 0;
diff --git a/src/basic/INIT_VIEW1D-Seq.cpp b/src/basic/INIT_VIEW1D-Seq.cpp
index 483786c8f..f6df5969b 100644
--- a/src/basic/INIT_VIEW1D-Seq.cpp
+++ b/src/basic/INIT_VIEW1D-Seq.cpp
@@ -18,7 +18,7 @@ namespace basic
 {
-void INIT_VIEW1D::runSeqVariant(VariantID vid, size_t /*tune_idx*/)
+void INIT_VIEW1D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
 {
   const Index_type run_reps = getRunReps();
   const Index_type ibegin = 0;
diff --git a/src/basic/INIT_VIEW1D.cpp b/src/basic/INIT_VIEW1D.cpp
index 49545a622..bd752aa06 100644
--- a/src/basic/INIT_VIEW1D.cpp
+++ b/src/basic/INIT_VIEW1D.cpp
@@ -58,7 +58,7 @@ INIT_VIEW1D::~INIT_VIEW1D()
 {
 }
-void INIT_VIEW1D::setUp(VariantID vid, size_t /*tune_idx*/)
+void INIT_VIEW1D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
 {
   allocAndInitDataConst(m_a, getActualProblemSize(), 0.0, vid);
   m_val = 0.00000123;
@@ -69,7 +69,7 @@ void INIT_VIEW1D::updateChecksum(VariantID vid, size_t tune_idx)
   checksum[vid][tune_idx] += calcChecksum(m_a, getActualProblemSize());
 }
-void INIT_VIEW1D::tearDown(VariantID vid, size_t /*tune_idx*/)
+void INIT_VIEW1D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
 {
   (void) vid;
   deallocData(m_a);
diff --git a/src/basic/INIT_VIEW1D_OFFSET-OMP.cpp b/src/basic/INIT_VIEW1D_OFFSET-OMP.cpp
index 783cfbcd1..23a1c4e6f 100644
--- a/src/basic/INIT_VIEW1D_OFFSET-OMP.cpp
+++ b/src/basic/INIT_VIEW1D_OFFSET-OMP.cpp
@@ -18,7 +18,7 @@ namespace basic
 {
-void INIT_VIEW1D_OFFSET::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/)
+void INIT_VIEW1D_OFFSET::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
 {
 #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP)
diff --git a/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp b/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp
index 75663f3bd..a3091a076 100644
--- a/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp
+++ b/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp
@@ -37,7 +37,7 @@ namespace basic
   deallocOpenMPDeviceData(a, did);
-void INIT_VIEW1D_OFFSET::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/)
+void INIT_VIEW1D_OFFSET::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
 {
   const Index_type run_reps = getRunReps();
   const Index_type ibegin = 1;
diff --git a/src/basic/INIT_VIEW1D_OFFSET-Seq.cpp b/src/basic/INIT_VIEW1D_OFFSET-Seq.cpp
index 435b54f3a..8b4db722b 100644
--- a/src/basic/INIT_VIEW1D_OFFSET-Seq.cpp
+++ b/src/basic/INIT_VIEW1D_OFFSET-Seq.cpp
@@ -18,7 +18,7 @@ namespace basic
 {
-void INIT_VIEW1D_OFFSET::runSeqVariant(VariantID vid, size_t /*tune_idx*/)
+void INIT_VIEW1D_OFFSET::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
 {
   const Index_type run_reps = getRunReps();
   const Index_type ibegin = 1;
diff --git a/src/basic/INIT_VIEW1D_OFFSET.cpp b/src/basic/INIT_VIEW1D_OFFSET.cpp
index ae880290e..165cd5544 100644
--- a/src/basic/INIT_VIEW1D_OFFSET.cpp
+++ b/src/basic/INIT_VIEW1D_OFFSET.cpp
@@ -58,7 +58,7 @@ INIT_VIEW1D_OFFSET::~INIT_VIEW1D_OFFSET()
 {
 }
-void INIT_VIEW1D_OFFSET::setUp(VariantID vid, size_t /*tune_idx*/)
+void INIT_VIEW1D_OFFSET::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
 {
   allocAndInitDataConst(m_a, getActualProblemSize(), 0.0, vid);
   m_val = 0.00000123;
@@ -69,7 +69,7 @@ void INIT_VIEW1D_OFFSET::updateChecksum(VariantID vid, size_t tune_idx)
   checksum[vid][tune_idx] += calcChecksum(m_a, getActualProblemSize());
 }
-void INIT_VIEW1D_OFFSET::tearDown(VariantID vid, size_t /*tune_idx*/)
+void INIT_VIEW1D_OFFSET::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
 {
   (void) vid;
   deallocData(m_a);
diff --git a/src/basic/MAT_MAT_SHARED-OMP.cpp b/src/basic/MAT_MAT_SHARED-OMP.cpp
index 3773033ac..484550704 100644
--- a/src/basic/MAT_MAT_SHARED-OMP.cpp
+++ b/src/basic/MAT_MAT_SHARED-OMP.cpp
@@ -15,7 +15,7 @@ namespace rajaperf {
 namespace basic {
-void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) {
+void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) {
 #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP)
   const Index_type run_reps = getRunReps();
diff --git a/src/basic/MAT_MAT_SHARED-OMPTarget.cpp b/src/basic/MAT_MAT_SHARED-OMPTarget.cpp
index fdaab0d00..6dac3ee94 100644
--- a/src/basic/MAT_MAT_SHARED-OMPTarget.cpp
+++ b/src/basic/MAT_MAT_SHARED-OMPTarget.cpp
@@ -20,7 +20,7 @@ namespace rajaperf {
 namespace basic {
- void MAT_MAT_SHARED::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) {
+ void MAT_MAT_SHARED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) {
   const Index_type run_reps = getRunReps();
   switch (vid) {
diff --git a/src/basic/MAT_MAT_SHARED-Seq.cpp b/src/basic/MAT_MAT_SHARED-Seq.cpp
index 6ef835c1a..b412daa32 100644
--- a/src/basic/MAT_MAT_SHARED-Seq.cpp
+++ b/src/basic/MAT_MAT_SHARED-Seq.cpp
@@ -13,7 +13,7 @@ namespace rajaperf {
 namespace basic {
-void MAT_MAT_SHARED::runSeqVariant(VariantID vid, size_t /*tune_idx*/) {
+void MAT_MAT_SHARED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) {
   const Index_type run_reps = getRunReps();
   const Index_type N = m_N;
diff --git a/src/basic/MAT_MAT_SHARED.cpp b/src/basic/MAT_MAT_SHARED.cpp
index 09be79506..98cd878ce 100644
--- a/src/basic/MAT_MAT_SHARED.cpp
+++ b/src/basic/MAT_MAT_SHARED.cpp
@@ -64,7 +64,7 @@ MAT_MAT_SHARED::MAT_MAT_SHARED(const RunParams &params)
 MAT_MAT_SHARED::~MAT_MAT_SHARED() {}
-void MAT_MAT_SHARED::setUp(VariantID vid, size_t /*tune_idx*/) {
+void MAT_MAT_SHARED::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) {
   const Index_type NN = m_N * m_N;
   allocAndInitDataConst(m_A, NN, 1.0, vid);
@@
-76,7 +76,7 @@ void MAT_MAT_SHARED::updateChecksum(VariantID vid, size_t tune_idx) { checksum[vid][tune_idx] += calcChecksum(m_C, m_N*m_N, checksum_scale_factor ); } -void MAT_MAT_SHARED::tearDown(VariantID vid, size_t /*tune_idx*/) { +void MAT_MAT_SHARED::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void)vid; deallocData(m_A); deallocData(m_B); diff --git a/src/basic/MULADDSUB-OMP.cpp b/src/basic/MULADDSUB-OMP.cpp index 313c07d2c..1204e9018 100644 --- a/src/basic/MULADDSUB-OMP.cpp +++ b/src/basic/MULADDSUB-OMP.cpp @@ -18,7 +18,7 @@ namespace basic { -void MULADDSUB::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void MULADDSUB::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/MULADDSUB-OMPTarget.cpp b/src/basic/MULADDSUB-OMPTarget.cpp index cf4909a26..2048284b5 100644 --- a/src/basic/MULADDSUB-OMPTarget.cpp +++ b/src/basic/MULADDSUB-OMPTarget.cpp @@ -47,7 +47,7 @@ namespace basic deallocOpenMPDeviceData(in2, did); -void MULADDSUB::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void MULADDSUB::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/MULADDSUB-Seq.cpp b/src/basic/MULADDSUB-Seq.cpp index d67809ffb..e93da7871 100644 --- a/src/basic/MULADDSUB-Seq.cpp +++ b/src/basic/MULADDSUB-Seq.cpp @@ -18,7 +18,7 @@ namespace basic { -void MULADDSUB::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void MULADDSUB::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/MULADDSUB.cpp b/src/basic/MULADDSUB.cpp index 8aca466b5..d1c180b8e 100644 --- a/src/basic/MULADDSUB.cpp +++ b/src/basic/MULADDSUB.cpp @@ -57,7 +57,7 @@ MULADDSUB::~MULADDSUB() { } -void MULADDSUB::setUp(VariantID vid, size_t /*tune_idx*/) +void MULADDSUB::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_out1, getActualProblemSize(), 0.0, vid); allocAndInitDataConst(m_out2, getActualProblemSize(), 0.0, vid); @@ -73,7 +73,7 @@ void MULADDSUB::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_out3, getActualProblemSize()); } -void MULADDSUB::tearDown(VariantID vid, size_t /*tune_idx*/) +void MULADDSUB::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_out1); diff --git a/src/basic/NESTED_INIT-OMP.cpp b/src/basic/NESTED_INIT-OMP.cpp index 4819bb69d..4471740df 100644 --- a/src/basic/NESTED_INIT-OMP.cpp +++ b/src/basic/NESTED_INIT-OMP.cpp @@ -21,7 +21,7 @@ namespace basic #undef USE_OMP_COLLAPSE -void NESTED_INIT::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void NESTED_INIT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/NESTED_INIT-OMPTarget.cpp b/src/basic/NESTED_INIT-OMPTarget.cpp index 3ca91e5fc..2c0b2389f 100644 --- a/src/basic/NESTED_INIT-OMPTarget.cpp +++ b/src/basic/NESTED_INIT-OMPTarget.cpp @@ -32,7 +32,7 @@ namespace basic deallocOpenMPDeviceData(array, did); -void NESTED_INIT::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void NESTED_INIT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); diff --git a/src/basic/NESTED_INIT-Seq.cpp 
b/src/basic/NESTED_INIT-Seq.cpp index 57a948c64..48da1b37a 100644 --- a/src/basic/NESTED_INIT-Seq.cpp +++ b/src/basic/NESTED_INIT-Seq.cpp @@ -18,7 +18,7 @@ namespace basic { -void NESTED_INIT::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void NESTED_INIT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); diff --git a/src/basic/NESTED_INIT.cpp b/src/basic/NESTED_INIT.cpp index 9604e39ac..ef9550d97 100644 --- a/src/basic/NESTED_INIT.cpp +++ b/src/basic/NESTED_INIT.cpp @@ -68,7 +68,7 @@ NESTED_INIT::~NESTED_INIT() { } -void NESTED_INIT::setUp(VariantID vid, size_t /*tune_idx*/) +void NESTED_INIT::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_array, m_array_length, 0.0, vid); } @@ -78,7 +78,7 @@ void NESTED_INIT::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_array, m_array_length); } -void NESTED_INIT::tearDown(VariantID vid, size_t /*tune_idx*/) +void NESTED_INIT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; RAJA::free_aligned(m_array); diff --git a/src/basic/PI_ATOMIC-OMP.cpp b/src/basic/PI_ATOMIC-OMP.cpp index 75e6cb493..4296ed845 100644 --- a/src/basic/PI_ATOMIC-OMP.cpp +++ b/src/basic/PI_ATOMIC-OMP.cpp @@ -18,7 +18,7 @@ namespace basic { -void PI_ATOMIC::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void PI_ATOMIC::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/PI_ATOMIC-OMPTarget.cpp b/src/basic/PI_ATOMIC-OMPTarget.cpp index e395d46d9..2a059f99b 100644 --- a/src/basic/PI_ATOMIC-OMPTarget.cpp +++ b/src/basic/PI_ATOMIC-OMPTarget.cpp @@ -36,7 +36,7 @@ namespace basic deallocOpenMPDeviceData(pi, did); -void PI_ATOMIC::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void PI_ATOMIC::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/PI_ATOMIC-Seq.cpp b/src/basic/PI_ATOMIC-Seq.cpp index b92bf8c54..486201caa 100644 --- a/src/basic/PI_ATOMIC-Seq.cpp +++ b/src/basic/PI_ATOMIC-Seq.cpp @@ -18,7 +18,7 @@ namespace basic { -void PI_ATOMIC::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void PI_ATOMIC::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/PI_ATOMIC.cpp b/src/basic/PI_ATOMIC.cpp index 005a40ccd..776883232 100644 --- a/src/basic/PI_ATOMIC.cpp +++ b/src/basic/PI_ATOMIC.cpp @@ -59,7 +59,7 @@ PI_ATOMIC::~PI_ATOMIC() { } -void PI_ATOMIC::setUp(VariantID vid, size_t /*tune_idx*/) +void PI_ATOMIC::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { m_dx = 1.0 / double(getActualProblemSize()); allocAndInitDataConst(m_pi, 1, 0.0, vid); @@ -71,7 +71,7 @@ void PI_ATOMIC::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += Checksum_type(*m_pi); } -void PI_ATOMIC::tearDown(VariantID vid, size_t /*tune_idx*/) +void PI_ATOMIC::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_pi); diff --git a/src/basic/PI_REDUCE-OMP.cpp b/src/basic/PI_REDUCE-OMP.cpp index cefa9dc9f..bc03012c3 100644 --- a/src/basic/PI_REDUCE-OMP.cpp +++ b/src/basic/PI_REDUCE-OMP.cpp @@ -18,7 +18,7 @@ namespace basic { -void PI_REDUCE::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void PI_REDUCE::runOpenMPVariant(VariantID vid, 
size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/PI_REDUCE-OMPTarget.cpp b/src/basic/PI_REDUCE-OMPTarget.cpp index f1740d159..a942839b4 100644 --- a/src/basic/PI_REDUCE-OMPTarget.cpp +++ b/src/basic/PI_REDUCE-OMPTarget.cpp @@ -27,7 +27,7 @@ namespace basic const size_t threads_per_team = 256; -void PI_REDUCE::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void PI_REDUCE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/PI_REDUCE-Seq.cpp b/src/basic/PI_REDUCE-Seq.cpp index 58cb06e80..6d6f885fe 100644 --- a/src/basic/PI_REDUCE-Seq.cpp +++ b/src/basic/PI_REDUCE-Seq.cpp @@ -18,7 +18,7 @@ namespace basic { -void PI_REDUCE::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void PI_REDUCE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/PI_REDUCE.cpp b/src/basic/PI_REDUCE.cpp index c093d16cb..16d0770ba 100644 --- a/src/basic/PI_REDUCE.cpp +++ b/src/basic/PI_REDUCE.cpp @@ -57,7 +57,7 @@ PI_REDUCE::~PI_REDUCE() { } -void PI_REDUCE::setUp(VariantID vid, size_t /*tune_idx*/) +void PI_REDUCE::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; m_dx = 1.0 / double(getActualProblemSize()); @@ -70,7 +70,7 @@ void PI_REDUCE::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += Checksum_type(m_pi); } -void PI_REDUCE::tearDown(VariantID vid, size_t /*tune_idx*/) +void PI_REDUCE::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; } diff --git a/src/basic/REDUCE3_INT-OMP.cpp b/src/basic/REDUCE3_INT-OMP.cpp index 01c26977d..0f759180b 100644 --- a/src/basic/REDUCE3_INT-OMP.cpp +++ b/src/basic/REDUCE3_INT-OMP.cpp @@ -19,7 +19,7 @@ namespace basic { -void REDUCE3_INT::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void REDUCE3_INT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/REDUCE3_INT-OMPTarget.cpp b/src/basic/REDUCE3_INT-OMPTarget.cpp index 98efe668a..7db4bbdd6 100644 --- a/src/basic/REDUCE3_INT-OMPTarget.cpp +++ b/src/basic/REDUCE3_INT-OMPTarget.cpp @@ -36,7 +36,7 @@ namespace basic deallocOpenMPDeviceData(vec, did); \ -void REDUCE3_INT::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void REDUCE3_INT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/REDUCE3_INT-Seq.cpp b/src/basic/REDUCE3_INT-Seq.cpp index 3ad721f04..4690319cc 100644 --- a/src/basic/REDUCE3_INT-Seq.cpp +++ b/src/basic/REDUCE3_INT-Seq.cpp @@ -19,7 +19,7 @@ namespace basic { -void REDUCE3_INT::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void REDUCE3_INT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/REDUCE3_INT.cpp b/src/basic/REDUCE3_INT.cpp index 14e47e201..dee6d3a5e 100644 --- a/src/basic/REDUCE3_INT.cpp +++ b/src/basic/REDUCE3_INT.cpp @@ -62,7 +62,7 @@ REDUCE3_INT::~REDUCE3_INT() { } -void REDUCE3_INT::setUp(VariantID vid, size_t /*tune_idx*/) +void REDUCE3_INT::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitData(m_vec, getActualProblemSize(), vid); @@ -81,7 +81,7 @@ 
void REDUCE3_INT::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += m_vmax; } -void REDUCE3_INT::tearDown(VariantID vid, size_t /*tune_idx*/) +void REDUCE3_INT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_vec); diff --git a/src/basic/TRAP_INT-OMP.cpp b/src/basic/TRAP_INT-OMP.cpp index b18beff6c..bd4c3c24b 100644 --- a/src/basic/TRAP_INT-OMP.cpp +++ b/src/basic/TRAP_INT-OMP.cpp @@ -32,7 +32,7 @@ Real_type trap_int_func(Real_type x, } -void TRAP_INT::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void TRAP_INT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/TRAP_INT-OMPTarget.cpp b/src/basic/TRAP_INT-OMPTarget.cpp index 78b57c1db..53dab376a 100644 --- a/src/basic/TRAP_INT-OMPTarget.cpp +++ b/src/basic/TRAP_INT-OMPTarget.cpp @@ -46,7 +46,7 @@ Real_type trap_int_func(Real_type x, #define TRAP_INT_DATA_TEARDOWN_OMP_TARGET // nothing to do here... -void TRAP_INT::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void TRAP_INT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/TRAP_INT-Seq.cpp b/src/basic/TRAP_INT-Seq.cpp index b8af73291..310d5e9ef 100644 --- a/src/basic/TRAP_INT-Seq.cpp +++ b/src/basic/TRAP_INT-Seq.cpp @@ -32,7 +32,7 @@ Real_type trap_int_func(Real_type x, } -void TRAP_INT::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void TRAP_INT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/TRAP_INT.cpp b/src/basic/TRAP_INT.cpp index 8e5612c19..3bf939f38 100644 --- a/src/basic/TRAP_INT.cpp +++ b/src/basic/TRAP_INT.cpp @@ -57,7 +57,7 @@ TRAP_INT::~TRAP_INT() { } -void TRAP_INT::setUp(VariantID vid, size_t /*tune_idx*/) +void TRAP_INT::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { Real_type xn; initData(xn, vid); @@ -79,7 +79,7 @@ void TRAP_INT::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += m_sumx; } -void TRAP_INT::tearDown(VariantID vid, size_t /*tune_idx*/) +void TRAP_INT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; } diff --git a/src/lcals/DIFF_PREDICT-OMP.cpp b/src/lcals/DIFF_PREDICT-OMP.cpp index 2175d7f61..e83c208bd 100644 --- a/src/lcals/DIFF_PREDICT-OMP.cpp +++ b/src/lcals/DIFF_PREDICT-OMP.cpp @@ -18,7 +18,7 @@ namespace lcals { -void DIFF_PREDICT::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void DIFF_PREDICT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/lcals/DIFF_PREDICT-OMPTarget.cpp b/src/lcals/DIFF_PREDICT-OMPTarget.cpp index 809138bec..44e78452f 100644 --- a/src/lcals/DIFF_PREDICT-OMPTarget.cpp +++ b/src/lcals/DIFF_PREDICT-OMPTarget.cpp @@ -39,7 +39,7 @@ namespace lcals deallocOpenMPDeviceData(cx, did); -void DIFF_PREDICT::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void DIFF_PREDICT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/DIFF_PREDICT-Seq.cpp b/src/lcals/DIFF_PREDICT-Seq.cpp index 959a79324..bff82a6eb 100644 --- a/src/lcals/DIFF_PREDICT-Seq.cpp +++ b/src/lcals/DIFF_PREDICT-Seq.cpp @@ -18,7 +18,7 @@ namespace lcals { -void 
DIFF_PREDICT::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void DIFF_PREDICT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/DIFF_PREDICT.cpp b/src/lcals/DIFF_PREDICT.cpp index 5d021409f..338ba7d0d 100644 --- a/src/lcals/DIFF_PREDICT.cpp +++ b/src/lcals/DIFF_PREDICT.cpp @@ -55,7 +55,7 @@ DIFF_PREDICT::~DIFF_PREDICT() { } -void DIFF_PREDICT::setUp(VariantID vid, size_t /*tune_idx*/) +void DIFF_PREDICT::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { m_array_length = getActualProblemSize() * 14; m_offset = getActualProblemSize(); @@ -69,7 +69,7 @@ void DIFF_PREDICT::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_px, m_array_length); } -void DIFF_PREDICT::tearDown(VariantID vid, size_t /*tune_idx*/) +void DIFF_PREDICT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_px); diff --git a/src/lcals/EOS-OMP.cpp b/src/lcals/EOS-OMP.cpp index 389fa68db..4a9688f03 100644 --- a/src/lcals/EOS-OMP.cpp +++ b/src/lcals/EOS-OMP.cpp @@ -18,7 +18,7 @@ namespace lcals { -void EOS::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void EOS::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/lcals/EOS-OMPTarget.cpp b/src/lcals/EOS-OMPTarget.cpp index be52900f7..6cc2f832b 100644 --- a/src/lcals/EOS-OMPTarget.cpp +++ b/src/lcals/EOS-OMPTarget.cpp @@ -43,7 +43,7 @@ namespace lcals deallocOpenMPDeviceData(u, did); -void EOS::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void EOS::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/EOS-Seq.cpp b/src/lcals/EOS-Seq.cpp index 1bb362b70..3aaeabdde 100644 --- a/src/lcals/EOS-Seq.cpp +++ b/src/lcals/EOS-Seq.cpp @@ -18,7 +18,7 @@ namespace lcals { -void EOS::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void EOS::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/EOS.cpp b/src/lcals/EOS.cpp index 7e2aaad73..27bc43d06 100644 --- a/src/lcals/EOS.cpp +++ b/src/lcals/EOS.cpp @@ -63,7 +63,7 @@ EOS::~EOS() { } -void EOS::setUp(VariantID vid, size_t /*tune_idx*/) +void EOS::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_x, m_array_length, 0.0, vid); allocAndInitData(m_y, m_array_length, vid); @@ -80,7 +80,7 @@ void EOS::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_x, getActualProblemSize(), checksum_scale_factor ); } -void EOS::tearDown(VariantID vid, size_t /*tune_idx*/) +void EOS::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_x); diff --git a/src/lcals/FIRST_DIFF-OMP.cpp b/src/lcals/FIRST_DIFF-OMP.cpp index 88ffe72e8..73c945f3a 100644 --- a/src/lcals/FIRST_DIFF-OMP.cpp +++ b/src/lcals/FIRST_DIFF-OMP.cpp @@ -18,7 +18,7 @@ namespace lcals { -void FIRST_DIFF::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void FIRST_DIFF::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/lcals/FIRST_DIFF-OMPTarget.cpp b/src/lcals/FIRST_DIFF-OMPTarget.cpp index a40e67cd1..13c9a9888 100644 --- 
a/src/lcals/FIRST_DIFF-OMPTarget.cpp +++ b/src/lcals/FIRST_DIFF-OMPTarget.cpp @@ -39,7 +39,7 @@ namespace lcals deallocOpenMPDeviceData(y, did); -void FIRST_DIFF::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void FIRST_DIFF::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/FIRST_DIFF-Seq.cpp b/src/lcals/FIRST_DIFF-Seq.cpp index d30fc35eb..41837ff90 100644 --- a/src/lcals/FIRST_DIFF-Seq.cpp +++ b/src/lcals/FIRST_DIFF-Seq.cpp @@ -18,7 +18,7 @@ namespace lcals { -void FIRST_DIFF::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void FIRST_DIFF::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/FIRST_DIFF.cpp b/src/lcals/FIRST_DIFF.cpp index 340f2cba9..9272b20d4 100644 --- a/src/lcals/FIRST_DIFF.cpp +++ b/src/lcals/FIRST_DIFF.cpp @@ -59,7 +59,7 @@ FIRST_DIFF::~FIRST_DIFF() { } -void FIRST_DIFF::setUp(VariantID vid, size_t /*tune_idx*/) +void FIRST_DIFF::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_x, m_N, 0.0, vid); allocAndInitData(m_y, m_N, vid); @@ -70,7 +70,7 @@ void FIRST_DIFF::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_x, getActualProblemSize()); } -void FIRST_DIFF::tearDown(VariantID vid, size_t /*tune_idx*/) +void FIRST_DIFF::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_x); diff --git a/src/lcals/FIRST_MIN-OMP.cpp b/src/lcals/FIRST_MIN-OMP.cpp index 7940dd4d2..ef7791739 100644 --- a/src/lcals/FIRST_MIN-OMP.cpp +++ b/src/lcals/FIRST_MIN-OMP.cpp @@ -19,7 +19,7 @@ namespace lcals FIRST_MIN_MINLOC_COMPARE; -void FIRST_MIN::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void FIRST_MIN::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/lcals/FIRST_MIN-OMPTarget.cpp b/src/lcals/FIRST_MIN-OMPTarget.cpp index 4d9c6e1c2..52472c588 100644 --- a/src/lcals/FIRST_MIN-OMPTarget.cpp +++ b/src/lcals/FIRST_MIN-OMPTarget.cpp @@ -37,7 +37,7 @@ namespace lcals FIRST_MIN_MINLOC_COMPARE; -void FIRST_MIN::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void FIRST_MIN::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/FIRST_MIN-Seq.cpp b/src/lcals/FIRST_MIN-Seq.cpp index 8bb0e7fb6..7bb311675 100644 --- a/src/lcals/FIRST_MIN-Seq.cpp +++ b/src/lcals/FIRST_MIN-Seq.cpp @@ -18,7 +18,7 @@ namespace lcals { -void FIRST_MIN::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void FIRST_MIN::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/FIRST_MIN.cpp b/src/lcals/FIRST_MIN.cpp index c5c20eb4d..8fe9a8c93 100644 --- a/src/lcals/FIRST_MIN.cpp +++ b/src/lcals/FIRST_MIN.cpp @@ -63,7 +63,7 @@ FIRST_MIN::~FIRST_MIN() { } -void FIRST_MIN::setUp(VariantID vid, size_t /*tune_idx*/) +void FIRST_MIN::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_x, m_N, 0.0, vid); m_x[ m_N / 2 ] = -1.0e+10; @@ -77,7 +77,7 @@ void FIRST_MIN::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += static_cast(m_minloc); } -void FIRST_MIN::tearDown(VariantID vid, size_t 
/*tune_idx*/) +void FIRST_MIN::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_x); diff --git a/src/lcals/FIRST_SUM-OMP.cpp b/src/lcals/FIRST_SUM-OMP.cpp index a6810b4fb..58d1a1070 100644 --- a/src/lcals/FIRST_SUM-OMP.cpp +++ b/src/lcals/FIRST_SUM-OMP.cpp @@ -18,7 +18,7 @@ namespace lcals { -void FIRST_SUM::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void FIRST_SUM::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/lcals/FIRST_SUM-OMPTarget.cpp b/src/lcals/FIRST_SUM-OMPTarget.cpp index 5905847e6..afc53dd6c 100644 --- a/src/lcals/FIRST_SUM-OMPTarget.cpp +++ b/src/lcals/FIRST_SUM-OMPTarget.cpp @@ -39,7 +39,7 @@ namespace lcals deallocOpenMPDeviceData(y, did); -void FIRST_SUM::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void FIRST_SUM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; diff --git a/src/lcals/FIRST_SUM-Seq.cpp b/src/lcals/FIRST_SUM-Seq.cpp index 8d40d16c8..29417f4c1 100644 --- a/src/lcals/FIRST_SUM-Seq.cpp +++ b/src/lcals/FIRST_SUM-Seq.cpp @@ -18,7 +18,7 @@ namespace lcals { -void FIRST_SUM::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void FIRST_SUM::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; diff --git a/src/lcals/FIRST_SUM.cpp b/src/lcals/FIRST_SUM.cpp index 06d4e125c..a9d135446 100644 --- a/src/lcals/FIRST_SUM.cpp +++ b/src/lcals/FIRST_SUM.cpp @@ -58,7 +58,7 @@ FIRST_SUM::~FIRST_SUM() { } -void FIRST_SUM::setUp(VariantID vid, size_t /*tune_idx*/) +void FIRST_SUM::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_x, m_N, 0.0, vid); allocAndInitData(m_y, m_N, vid); @@ -69,7 +69,7 @@ void FIRST_SUM::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_x, getActualProblemSize()); } -void FIRST_SUM::tearDown(VariantID vid, size_t /*tune_idx*/) +void FIRST_SUM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_x); diff --git a/src/lcals/GEN_LIN_RECUR-OMP.cpp b/src/lcals/GEN_LIN_RECUR-OMP.cpp index 3ca9b7c09..3d40a9e47 100644 --- a/src/lcals/GEN_LIN_RECUR-OMP.cpp +++ b/src/lcals/GEN_LIN_RECUR-OMP.cpp @@ -18,7 +18,7 @@ namespace lcals { -void GEN_LIN_RECUR::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void GEN_LIN_RECUR::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/lcals/GEN_LIN_RECUR-OMPTarget.cpp b/src/lcals/GEN_LIN_RECUR-OMPTarget.cpp index e4653e190..1949698fd 100644 --- a/src/lcals/GEN_LIN_RECUR-OMPTarget.cpp +++ b/src/lcals/GEN_LIN_RECUR-OMPTarget.cpp @@ -43,7 +43,7 @@ namespace lcals deallocOpenMPDeviceData(sb, did); -void GEN_LIN_RECUR::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void GEN_LIN_RECUR::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); diff --git a/src/lcals/GEN_LIN_RECUR-Seq.cpp b/src/lcals/GEN_LIN_RECUR-Seq.cpp index a748932f4..efde12463 100644 --- a/src/lcals/GEN_LIN_RECUR-Seq.cpp +++ b/src/lcals/GEN_LIN_RECUR-Seq.cpp @@ -18,7 +18,7 @@ namespace lcals { -void GEN_LIN_RECUR::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void GEN_LIN_RECUR::runSeqVariant(VariantID vid, size_t 
RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); diff --git a/src/lcals/GEN_LIN_RECUR.cpp b/src/lcals/GEN_LIN_RECUR.cpp index 3fcb9cb39..b0598aa8e 100644 --- a/src/lcals/GEN_LIN_RECUR.cpp +++ b/src/lcals/GEN_LIN_RECUR.cpp @@ -63,7 +63,7 @@ GEN_LIN_RECUR::~GEN_LIN_RECUR() { } -void GEN_LIN_RECUR::setUp(VariantID vid, size_t /*tune_idx*/) +void GEN_LIN_RECUR::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { m_kb5i = 0; @@ -78,7 +78,7 @@ void GEN_LIN_RECUR::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_b5, getActualProblemSize(), checksum_scale_factor ); } -void GEN_LIN_RECUR::tearDown(VariantID vid, size_t /*tune_idx*/) +void GEN_LIN_RECUR::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_b5); diff --git a/src/lcals/HYDRO_1D-OMP.cpp b/src/lcals/HYDRO_1D-OMP.cpp index 166362713..29ea4db01 100644 --- a/src/lcals/HYDRO_1D-OMP.cpp +++ b/src/lcals/HYDRO_1D-OMP.cpp @@ -18,7 +18,7 @@ namespace lcals { -void HYDRO_1D::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void HYDRO_1D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/lcals/HYDRO_1D-OMPTarget.cpp b/src/lcals/HYDRO_1D-OMPTarget.cpp index 98742124b..d154b473f 100644 --- a/src/lcals/HYDRO_1D-OMPTarget.cpp +++ b/src/lcals/HYDRO_1D-OMPTarget.cpp @@ -41,7 +41,7 @@ namespace lcals deallocOpenMPDeviceData(z, did); \ -void HYDRO_1D::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void HYDRO_1D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/HYDRO_1D-Seq.cpp b/src/lcals/HYDRO_1D-Seq.cpp index 59a48e1db..2833cf6bc 100644 --- a/src/lcals/HYDRO_1D-Seq.cpp +++ b/src/lcals/HYDRO_1D-Seq.cpp @@ -18,7 +18,7 @@ namespace lcals { -void HYDRO_1D::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void HYDRO_1D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/HYDRO_1D.cpp b/src/lcals/HYDRO_1D.cpp index 88c650de7..5ce1d0700 100644 --- a/src/lcals/HYDRO_1D.cpp +++ b/src/lcals/HYDRO_1D.cpp @@ -62,7 +62,7 @@ HYDRO_1D::~HYDRO_1D() { } -void HYDRO_1D::setUp(VariantID vid, size_t /*tune_idx*/) +void HYDRO_1D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_x, m_array_length, 0.0, vid); allocAndInitData(m_y, m_array_length, vid); @@ -78,7 +78,7 @@ void HYDRO_1D::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_x, getActualProblemSize(), checksum_scale_factor ); } -void HYDRO_1D::tearDown(VariantID vid, size_t /*tune_idx*/) +void HYDRO_1D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_x); diff --git a/src/lcals/HYDRO_2D-OMP.cpp b/src/lcals/HYDRO_2D-OMP.cpp index 1557d0015..532ee258c 100644 --- a/src/lcals/HYDRO_2D-OMP.cpp +++ b/src/lcals/HYDRO_2D-OMP.cpp @@ -18,7 +18,7 @@ namespace lcals { -void HYDRO_2D::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void HYDRO_2D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/lcals/HYDRO_2D-OMPTarget.cpp b/src/lcals/HYDRO_2D-OMPTarget.cpp index a0e46a55d..18e6ff004 100644 --- a/src/lcals/HYDRO_2D-OMPTarget.cpp +++ 
b/src/lcals/HYDRO_2D-OMPTarget.cpp @@ -54,7 +54,7 @@ namespace lcals -void HYDRO_2D::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void HYDRO_2D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type kbeg = 1; diff --git a/src/lcals/HYDRO_2D-Seq.cpp b/src/lcals/HYDRO_2D-Seq.cpp index 5f669eea3..3db534ffc 100644 --- a/src/lcals/HYDRO_2D-Seq.cpp +++ b/src/lcals/HYDRO_2D-Seq.cpp @@ -18,7 +18,7 @@ namespace lcals { -void HYDRO_2D::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void HYDRO_2D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type kbeg = 1; diff --git a/src/lcals/HYDRO_2D.cpp b/src/lcals/HYDRO_2D.cpp index 46c98729c..9b6c2a643 100644 --- a/src/lcals/HYDRO_2D.cpp +++ b/src/lcals/HYDRO_2D.cpp @@ -77,7 +77,7 @@ HYDRO_2D::~HYDRO_2D() { } -void HYDRO_2D::setUp(VariantID vid, size_t /*tune_idx*/) +void HYDRO_2D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_zrout, m_array_length, 0.0, vid); allocAndInitDataConst(m_zzout, m_array_length, 0.0, vid); @@ -98,7 +98,7 @@ void HYDRO_2D::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_zrout, m_array_length, checksum_scale_factor ); } -void HYDRO_2D::tearDown(VariantID vid, size_t /*tune_idx*/) +void HYDRO_2D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_zrout); diff --git a/src/lcals/INT_PREDICT-OMP.cpp b/src/lcals/INT_PREDICT-OMP.cpp index cc06147ff..4e67db9c4 100644 --- a/src/lcals/INT_PREDICT-OMP.cpp +++ b/src/lcals/INT_PREDICT-OMP.cpp @@ -18,7 +18,7 @@ namespace lcals { -void INT_PREDICT::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void INT_PREDICT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/lcals/INT_PREDICT-OMPTarget.cpp b/src/lcals/INT_PREDICT-OMPTarget.cpp index 95e91d772..86b8b8169 100644 --- a/src/lcals/INT_PREDICT-OMPTarget.cpp +++ b/src/lcals/INT_PREDICT-OMPTarget.cpp @@ -37,7 +37,7 @@ namespace lcals deallocOpenMPDeviceData(px, did); -void INT_PREDICT::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void INT_PREDICT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/INT_PREDICT-Seq.cpp b/src/lcals/INT_PREDICT-Seq.cpp index e5b648fd6..83a41071b 100644 --- a/src/lcals/INT_PREDICT-Seq.cpp +++ b/src/lcals/INT_PREDICT-Seq.cpp @@ -18,7 +18,7 @@ namespace lcals { -void INT_PREDICT::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void INT_PREDICT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/INT_PREDICT.cpp b/src/lcals/INT_PREDICT.cpp index d15013b49..c2062fffa 100644 --- a/src/lcals/INT_PREDICT.cpp +++ b/src/lcals/INT_PREDICT.cpp @@ -55,7 +55,7 @@ INT_PREDICT::~INT_PREDICT() { } -void INT_PREDICT::setUp(VariantID vid, size_t /*tune_idx*/) +void INT_PREDICT::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { m_array_length = getActualProblemSize() * 13; m_offset = getActualProblemSize(); @@ -82,7 +82,7 @@ void INT_PREDICT::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_px, getActualProblemSize()); } -void 
INT_PREDICT::tearDown(VariantID vid, size_t /*tune_idx*/) +void INT_PREDICT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_px); diff --git a/src/lcals/PLANCKIAN-OMP.cpp b/src/lcals/PLANCKIAN-OMP.cpp index 98ca0b135..e802a96fd 100644 --- a/src/lcals/PLANCKIAN-OMP.cpp +++ b/src/lcals/PLANCKIAN-OMP.cpp @@ -19,7 +19,7 @@ namespace lcals { -void PLANCKIAN::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void PLANCKIAN::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/lcals/PLANCKIAN-OMPTarget.cpp b/src/lcals/PLANCKIAN-OMPTarget.cpp index cafbbc708..02858604f 100644 --- a/src/lcals/PLANCKIAN-OMPTarget.cpp +++ b/src/lcals/PLANCKIAN-OMPTarget.cpp @@ -46,7 +46,7 @@ namespace lcals deallocOpenMPDeviceData(w, did); -void PLANCKIAN::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void PLANCKIAN::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/PLANCKIAN-Seq.cpp b/src/lcals/PLANCKIAN-Seq.cpp index 61db64ef6..efd372444 100644 --- a/src/lcals/PLANCKIAN-Seq.cpp +++ b/src/lcals/PLANCKIAN-Seq.cpp @@ -19,7 +19,7 @@ namespace lcals { -void PLANCKIAN::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void PLANCKIAN::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/lcals/PLANCKIAN.cpp b/src/lcals/PLANCKIAN.cpp index 439eeaa52..59de57231 100644 --- a/src/lcals/PLANCKIAN.cpp +++ b/src/lcals/PLANCKIAN.cpp @@ -55,7 +55,7 @@ PLANCKIAN::~PLANCKIAN() { } -void PLANCKIAN::setUp(VariantID vid, size_t /*tune_idx*/) +void PLANCKIAN::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitData(m_x, getActualProblemSize(), vid); allocAndInitData(m_y, getActualProblemSize(), vid); @@ -69,7 +69,7 @@ void PLANCKIAN::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_w, getActualProblemSize()); } -void PLANCKIAN::tearDown(VariantID vid, size_t /*tune_idx*/) +void PLANCKIAN::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_x); diff --git a/src/lcals/TRIDIAG_ELIM-OMP.cpp b/src/lcals/TRIDIAG_ELIM-OMP.cpp index 0d2671ddc..a78c4a210 100644 --- a/src/lcals/TRIDIAG_ELIM-OMP.cpp +++ b/src/lcals/TRIDIAG_ELIM-OMP.cpp @@ -18,7 +18,7 @@ namespace lcals { -void TRIDIAG_ELIM::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void TRIDIAG_ELIM::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp b/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp index 0d4190b26..ff21303da 100644 --- a/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp +++ b/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp @@ -43,7 +43,7 @@ namespace lcals deallocOpenMPDeviceData(z, did); -void TRIDIAG_ELIM::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void TRIDIAG_ELIM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; diff --git a/src/lcals/TRIDIAG_ELIM-Seq.cpp b/src/lcals/TRIDIAG_ELIM-Seq.cpp index dc32f6356..8aa6dc451 100644 --- a/src/lcals/TRIDIAG_ELIM-Seq.cpp +++ b/src/lcals/TRIDIAG_ELIM-Seq.cpp @@ -18,7 +18,7 @@ namespace lcals { -void TRIDIAG_ELIM::runSeqVariant(VariantID vid, size_t 
/*tune_idx*/) +void TRIDIAG_ELIM::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; diff --git a/src/lcals/TRIDIAG_ELIM.cpp b/src/lcals/TRIDIAG_ELIM.cpp index 8288351d1..05d0100a8 100644 --- a/src/lcals/TRIDIAG_ELIM.cpp +++ b/src/lcals/TRIDIAG_ELIM.cpp @@ -57,7 +57,7 @@ TRIDIAG_ELIM::~TRIDIAG_ELIM() { } -void TRIDIAG_ELIM::setUp(VariantID vid, size_t /*tune_idx*/) +void TRIDIAG_ELIM::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_xout, m_N, 0.0, vid); allocAndInitData(m_xin, m_N, vid); @@ -70,7 +70,7 @@ void TRIDIAG_ELIM::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_xout, getActualProblemSize()); } -void TRIDIAG_ELIM::tearDown(VariantID vid, size_t /*tune_idx*/) +void TRIDIAG_ELIM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_xout); diff --git a/src/polybench/POLYBENCH_2MM-OMP.cpp b/src/polybench/POLYBENCH_2MM-OMP.cpp index 2d4066a8a..687f93c45 100644 --- a/src/polybench/POLYBENCH_2MM-OMP.cpp +++ b/src/polybench/POLYBENCH_2MM-OMP.cpp @@ -26,7 +26,7 @@ namespace polybench { -void POLYBENCH_2MM::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_2MM::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_2MM-OMPTarget.cpp b/src/polybench/POLYBENCH_2MM-OMPTarget.cpp index e932fb737..ab7860935 100644 --- a/src/polybench/POLYBENCH_2MM-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_2MM-OMPTarget.cpp @@ -41,7 +41,7 @@ namespace polybench deallocOpenMPDeviceData(D, did); -void POLYBENCH_2MM::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_2MM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_2MM-Seq.cpp b/src/polybench/POLYBENCH_2MM-Seq.cpp index 90e167b82..6e59576b1 100644 --- a/src/polybench/POLYBENCH_2MM-Seq.cpp +++ b/src/polybench/POLYBENCH_2MM-Seq.cpp @@ -18,7 +18,7 @@ namespace rajaperf namespace polybench { -void POLYBENCH_2MM::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_2MM::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps= getRunReps(); diff --git a/src/polybench/POLYBENCH_2MM.cpp b/src/polybench/POLYBENCH_2MM.cpp index 3d794dce7..03119a863 100644 --- a/src/polybench/POLYBENCH_2MM.cpp +++ b/src/polybench/POLYBENCH_2MM.cpp @@ -84,7 +84,7 @@ POLYBENCH_2MM::~POLYBENCH_2MM() { } -void POLYBENCH_2MM::setUp(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_2MM::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; allocAndInitData(m_tmp, m_ni * m_nj, vid); @@ -99,7 +99,7 @@ void POLYBENCH_2MM::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_D, m_ni * m_nl, checksum_scale_factor ); } -void POLYBENCH_2MM::tearDown(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_2MM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_tmp); diff --git a/src/polybench/POLYBENCH_3MM-OMP.cpp b/src/polybench/POLYBENCH_3MM-OMP.cpp index edc21e2cd..a45f4dd28 100644 --- a/src/polybench/POLYBENCH_3MM-OMP.cpp +++ b/src/polybench/POLYBENCH_3MM-OMP.cpp @@ -27,7 +27,7 @@ namespace polybench { -void POLYBENCH_3MM::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void 
POLYBENCH_3MM::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_3MM-OMPTarget.cpp b/src/polybench/POLYBENCH_3MM-OMPTarget.cpp index 6326ef629..21c1ce7fa 100644 --- a/src/polybench/POLYBENCH_3MM-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_3MM-OMPTarget.cpp @@ -44,7 +44,7 @@ namespace polybench deallocOpenMPDeviceData(F, did); \ deallocOpenMPDeviceData(G, did); -void POLYBENCH_3MM::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_3MM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_3MM-Seq.cpp b/src/polybench/POLYBENCH_3MM-Seq.cpp index 0857ac6b4..c1ca8c56d 100644 --- a/src/polybench/POLYBENCH_3MM-Seq.cpp +++ b/src/polybench/POLYBENCH_3MM-Seq.cpp @@ -20,7 +20,7 @@ namespace polybench { -void POLYBENCH_3MM::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_3MM::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_3MM.cpp b/src/polybench/POLYBENCH_3MM.cpp index 08d2179fb..75990394c 100644 --- a/src/polybench/POLYBENCH_3MM.cpp +++ b/src/polybench/POLYBENCH_3MM.cpp @@ -92,7 +92,7 @@ POLYBENCH_3MM::~POLYBENCH_3MM() { } -void POLYBENCH_3MM::setUp(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_3MM::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; allocAndInitData(m_A, m_ni * m_nk, vid); @@ -109,7 +109,7 @@ void POLYBENCH_3MM::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_G, m_ni * m_nl, checksum_scale_factor ); } -void POLYBENCH_3MM::tearDown(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_3MM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_A); diff --git a/src/polybench/POLYBENCH_ADI-OMP.cpp b/src/polybench/POLYBENCH_ADI-OMP.cpp index 71e9218d6..a9409b182 100644 --- a/src/polybench/POLYBENCH_ADI-OMP.cpp +++ b/src/polybench/POLYBENCH_ADI-OMP.cpp @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_ADI::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_ADI::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_ADI-OMPTarget.cpp b/src/polybench/POLYBENCH_ADI-OMPTarget.cpp index e2dd834fc..a3cc71346 100644 --- a/src/polybench/POLYBENCH_ADI-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_ADI-OMPTarget.cpp @@ -43,7 +43,7 @@ namespace polybench deallocOpenMPDeviceData(Q, did); -void POLYBENCH_ADI::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_ADI::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_ADI-Seq.cpp b/src/polybench/POLYBENCH_ADI-Seq.cpp index 69ab50df1..854a0fdf9 100644 --- a/src/polybench/POLYBENCH_ADI-Seq.cpp +++ b/src/polybench/POLYBENCH_ADI-Seq.cpp @@ -18,7 +18,7 @@ namespace rajaperf namespace polybench { -void POLYBENCH_ADI::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_ADI::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_ADI.cpp b/src/polybench/POLYBENCH_ADI.cpp index 3c3fd6a05..7d0844e69 100644 --- a/src/polybench/POLYBENCH_ADI.cpp +++ 
b/src/polybench/POLYBENCH_ADI.cpp @@ -69,7 +69,7 @@ POLYBENCH_ADI::~POLYBENCH_ADI() { } -void POLYBENCH_ADI::setUp(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_ADI::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_U, m_n * m_n, 0.0, vid); allocAndInitData(m_V, m_n * m_n, vid); @@ -82,7 +82,7 @@ void POLYBENCH_ADI::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_U, m_n * m_n, checksum_scale_factor ); } -void POLYBENCH_ADI::tearDown(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_ADI::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_U); diff --git a/src/polybench/POLYBENCH_ATAX-OMP.cpp b/src/polybench/POLYBENCH_ATAX-OMP.cpp index 5c9f84400..504a293a3 100644 --- a/src/polybench/POLYBENCH_ATAX-OMP.cpp +++ b/src/polybench/POLYBENCH_ATAX-OMP.cpp @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_ATAX::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_ATAX::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp b/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp index 154cb6a46..1f9c23844 100644 --- a/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp @@ -44,7 +44,7 @@ namespace polybench deallocOpenMPDeviceData(A, did); -void POLYBENCH_ATAX::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_ATAX::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_ATAX-Seq.cpp b/src/polybench/POLYBENCH_ATAX-Seq.cpp index 5a52cac03..ecb98f3e8 100644 --- a/src/polybench/POLYBENCH_ATAX-Seq.cpp +++ b/src/polybench/POLYBENCH_ATAX-Seq.cpp @@ -18,7 +18,7 @@ namespace rajaperf namespace polybench { -void POLYBENCH_ATAX::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_ATAX::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps= getRunReps(); diff --git a/src/polybench/POLYBENCH_ATAX.cpp b/src/polybench/POLYBENCH_ATAX.cpp index 8a25c1cd7..44a805518 100644 --- a/src/polybench/POLYBENCH_ATAX.cpp +++ b/src/polybench/POLYBENCH_ATAX.cpp @@ -71,7 +71,7 @@ POLYBENCH_ATAX::~POLYBENCH_ATAX() { } -void POLYBENCH_ATAX::setUp(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_ATAX::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; allocAndInitData(m_tmp, m_N, vid); @@ -85,7 +85,7 @@ void POLYBENCH_ATAX::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_y, m_N, checksum_scale_factor ); } -void POLYBENCH_ATAX::tearDown(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_ATAX::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_tmp); diff --git a/src/polybench/POLYBENCH_FDTD_2D-OMP.cpp b/src/polybench/POLYBENCH_FDTD_2D-OMP.cpp index 57eeddf33..dba8a872a 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-OMP.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-OMP.cpp @@ -18,7 +18,7 @@ namespace polybench { -void POLYBENCH_FDTD_2D::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_FDTD_2D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp b/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp index 
775451df7..5bb0d03b3 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp @@ -43,7 +43,7 @@ namespace polybench deallocOpenMPDeviceData(fict, did); -void POLYBENCH_FDTD_2D::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_FDTD_2D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_FDTD_2D-Seq.cpp b/src/polybench/POLYBENCH_FDTD_2D-Seq.cpp index d1759d8a5..6ab94557d 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Seq.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Seq.cpp @@ -18,7 +18,7 @@ namespace polybench { -void POLYBENCH_FDTD_2D::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_FDTD_2D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_FDTD_2D.cpp b/src/polybench/POLYBENCH_FDTD_2D.cpp index 3ace1b816..dce05e76a 100644 --- a/src/polybench/POLYBENCH_FDTD_2D.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D.cpp @@ -90,7 +90,7 @@ POLYBENCH_FDTD_2D::~POLYBENCH_FDTD_2D() { } -void POLYBENCH_FDTD_2D::setUp(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_FDTD_2D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_hz, m_nx * m_ny, 0.0, vid); allocAndInitData(m_ex, m_nx * m_ny, vid); @@ -103,7 +103,7 @@ void POLYBENCH_FDTD_2D::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_hz, m_nx * m_ny, checksum_scale_factor); } -void POLYBENCH_FDTD_2D::tearDown(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_FDTD_2D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_fict); diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp index 730a55911..edb2074f1 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp @@ -24,7 +24,7 @@ namespace polybench { -void POLYBENCH_FLOYD_WARSHALL::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_FLOYD_WARSHALL::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp index ff70686a5..6c8a9d5fa 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp @@ -35,7 +35,7 @@ namespace polybench deallocOpenMPDeviceData(pout, did); -void POLYBENCH_FLOYD_WARSHALL::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_FLOYD_WARSHALL::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp index 0c298b9b7..b9f42b0ed 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp @@ -18,7 +18,7 @@ namespace polybench { -void POLYBENCH_FLOYD_WARSHALL::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_FLOYD_WARSHALL::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps= getRunReps(); diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp index c10cc9427..1022ffe4f 
100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp @@ -66,7 +66,7 @@ POLYBENCH_FLOYD_WARSHALL::~POLYBENCH_FLOYD_WARSHALL() { } -void POLYBENCH_FLOYD_WARSHALL::setUp(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_FLOYD_WARSHALL::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; allocAndInitDataRandSign(m_pin, m_N*m_N, vid); @@ -78,7 +78,7 @@ void POLYBENCH_FLOYD_WARSHALL::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_pout, m_N*m_N, checksum_scale_factor ); } -void POLYBENCH_FLOYD_WARSHALL::tearDown(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_FLOYD_WARSHALL::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_pin); diff --git a/src/polybench/POLYBENCH_GEMM-OMP.cpp b/src/polybench/POLYBENCH_GEMM-OMP.cpp index 171e60082..53bddc30c 100644 --- a/src/polybench/POLYBENCH_GEMM-OMP.cpp +++ b/src/polybench/POLYBENCH_GEMM-OMP.cpp @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_GEMM::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_GEMM::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp b/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp index 37ef4fbb4..7bbf5132b 100644 --- a/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp @@ -37,7 +37,7 @@ namespace polybench deallocOpenMPDeviceData(C, did); -void POLYBENCH_GEMM::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_GEMM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_GEMM-Seq.cpp b/src/polybench/POLYBENCH_GEMM-Seq.cpp index f45741471..51a1f1127 100644 --- a/src/polybench/POLYBENCH_GEMM-Seq.cpp +++ b/src/polybench/POLYBENCH_GEMM-Seq.cpp @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_GEMM::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_GEMM::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps= getRunReps(); diff --git a/src/polybench/POLYBENCH_GEMM.cpp b/src/polybench/POLYBENCH_GEMM.cpp index 8d52c7002..0ee1f41be 100644 --- a/src/polybench/POLYBENCH_GEMM.cpp +++ b/src/polybench/POLYBENCH_GEMM.cpp @@ -76,7 +76,7 @@ POLYBENCH_GEMM::~POLYBENCH_GEMM() { } -void POLYBENCH_GEMM::setUp(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_GEMM::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; allocAndInitData(m_A, m_ni * m_nk, vid); @@ -89,7 +89,7 @@ void POLYBENCH_GEMM::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_C, m_ni * m_nj, checksum_scale_factor ); } -void POLYBENCH_GEMM::tearDown(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_GEMM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_A); diff --git a/src/polybench/POLYBENCH_GEMVER-OMP.cpp b/src/polybench/POLYBENCH_GEMVER-OMP.cpp index d351b26e4..18013e3f7 100644 --- a/src/polybench/POLYBENCH_GEMVER-OMP.cpp +++ b/src/polybench/POLYBENCH_GEMVER-OMP.cpp @@ -20,7 +20,7 @@ namespace polybench { -void POLYBENCH_GEMVER::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_GEMVER::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git 
a/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp b/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp index 78c94db76..c031bdf04 100644 --- a/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp @@ -54,7 +54,7 @@ namespace polybench -void POLYBENCH_GEMVER::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_GEMVER::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_GEMVER-Seq.cpp b/src/polybench/POLYBENCH_GEMVER-Seq.cpp index 48bd8dba8..eeee6f0ec 100644 --- a/src/polybench/POLYBENCH_GEMVER-Seq.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Seq.cpp @@ -20,7 +20,7 @@ namespace polybench { -void POLYBENCH_GEMVER::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_GEMVER::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_GEMVER.cpp b/src/polybench/POLYBENCH_GEMVER.cpp index 917801777..24a3f3d1b 100644 --- a/src/polybench/POLYBENCH_GEMVER.cpp +++ b/src/polybench/POLYBENCH_GEMVER.cpp @@ -85,7 +85,7 @@ POLYBENCH_GEMVER::~POLYBENCH_GEMVER() { } -void POLYBENCH_GEMVER::setUp(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_GEMVER::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; @@ -105,7 +105,7 @@ void POLYBENCH_GEMVER::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_w, m_n, checksum_scale_factor ); } -void POLYBENCH_GEMVER::tearDown(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_GEMVER::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_A); diff --git a/src/polybench/POLYBENCH_GESUMMV-OMP.cpp b/src/polybench/POLYBENCH_GESUMMV-OMP.cpp index 2e111fb4a..830bb73bf 100644 --- a/src/polybench/POLYBENCH_GESUMMV-OMP.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-OMP.cpp @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_GESUMMV::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_GESUMMV::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp b/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp index b5d459d25..c4c535bf6 100644 --- a/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp @@ -44,7 +44,7 @@ namespace polybench deallocOpenMPDeviceData(B, did); -void POLYBENCH_GESUMMV::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_GESUMMV::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_GESUMMV-Seq.cpp b/src/polybench/POLYBENCH_GESUMMV-Seq.cpp index bccb3fa90..c65897e5d 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Seq.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Seq.cpp @@ -18,7 +18,7 @@ namespace rajaperf namespace polybench { -void POLYBENCH_GESUMMV::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_GESUMMV::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps= getRunReps(); diff --git a/src/polybench/POLYBENCH_GESUMMV.cpp b/src/polybench/POLYBENCH_GESUMMV.cpp index 9fc65f0bd..eb527af27 100644 --- a/src/polybench/POLYBENCH_GESUMMV.cpp +++ b/src/polybench/POLYBENCH_GESUMMV.cpp @@ -65,7 +65,7 @@ POLYBENCH_GESUMMV::~POLYBENCH_GESUMMV() { } -void 
POLYBENCH_GESUMMV::setUp(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_GESUMMV::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; allocAndInitData(m_x, m_N, vid); @@ -79,7 +79,7 @@ void POLYBENCH_GESUMMV::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_y, m_N); } -void POLYBENCH_GESUMMV::tearDown(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_GESUMMV::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_x); diff --git a/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp b/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp index 0ea5b192f..50ca323de 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_HEAT_3D::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_HEAT_3D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp b/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp index 434b9cdeb..692689d85 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp @@ -36,7 +36,7 @@ namespace polybench deallocOpenMPDeviceData(B, did); -void POLYBENCH_HEAT_3D::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_HEAT_3D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp b/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp index a9bdaa256..4afb06d21 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_HEAT_3D::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_HEAT_3D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps= getRunReps(); diff --git a/src/polybench/POLYBENCH_HEAT_3D.cpp b/src/polybench/POLYBENCH_HEAT_3D.cpp index 8bcc69f20..567192b9a 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D.cpp @@ -76,7 +76,7 @@ POLYBENCH_HEAT_3D::~POLYBENCH_HEAT_3D() { } -void POLYBENCH_HEAT_3D::setUp(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_HEAT_3D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; allocAndInitData(m_Ainit, m_N*m_N*m_N, vid); @@ -91,7 +91,7 @@ void POLYBENCH_HEAT_3D::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_B, m_N*m_N*m_N, checksum_scale_factor ); } -void POLYBENCH_HEAT_3D::tearDown(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_HEAT_3D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_A); diff --git a/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp b/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp index 066466a27..d813f9d17 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_JACOBI_1D::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_JACOBI_1D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp b/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp index ada3047ad..1ca122ebb 100644 --- 
a/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp @@ -41,7 +41,7 @@ namespace polybench deallocOpenMPDeviceData(B, did); -void POLYBENCH_JACOBI_1D::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_JACOBI_1D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp index 65f2c8036..f23ccdf06 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_JACOBI_1D::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_JACOBI_1D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps= getRunReps(); diff --git a/src/polybench/POLYBENCH_JACOBI_1D.cpp b/src/polybench/POLYBENCH_JACOBI_1D.cpp index fd3456866..f86bb5956 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.cpp @@ -73,7 +73,7 @@ POLYBENCH_JACOBI_1D::~POLYBENCH_JACOBI_1D() { } -void POLYBENCH_JACOBI_1D::setUp(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_JACOBI_1D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; allocAndInitData(m_Ainit, m_N, vid); @@ -88,7 +88,7 @@ void POLYBENCH_JACOBI_1D::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_B, m_N, checksum_scale_factor ); } -void POLYBENCH_JACOBI_1D::tearDown(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_JACOBI_1D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_A); diff --git a/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp b/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp index 193ed398c..4acf70b25 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_JACOBI_2D::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_JACOBI_2D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp b/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp index 50af0d9f7..9538d50f7 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp @@ -36,7 +36,7 @@ namespace polybench deallocOpenMPDeviceData(B, did); -void POLYBENCH_JACOBI_2D::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_JACOBI_2D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp index addb6d3ad..856404f92 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_JACOBI_2D::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_JACOBI_2D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps= getRunReps(); diff --git a/src/polybench/POLYBENCH_JACOBI_2D.cpp b/src/polybench/POLYBENCH_JACOBI_2D.cpp index fe8972e9b..1b4f9378a 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.cpp @@ -75,7 +75,7 @@ POLYBENCH_JACOBI_2D::~POLYBENCH_JACOBI_2D() 
{ } -void POLYBENCH_JACOBI_2D::setUp(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_JACOBI_2D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; allocAndInitData(m_Ainit, m_N*m_N, vid); @@ -90,7 +90,7 @@ void POLYBENCH_JACOBI_2D::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_B, m_N*m_N, checksum_scale_factor ); } -void POLYBENCH_JACOBI_2D::tearDown(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_JACOBI_2D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_A); diff --git a/src/polybench/POLYBENCH_MVT-OMP.cpp b/src/polybench/POLYBENCH_MVT-OMP.cpp index 7d1b0454d..f5dad16b9 100644 --- a/src/polybench/POLYBENCH_MVT-OMP.cpp +++ b/src/polybench/POLYBENCH_MVT-OMP.cpp @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_MVT::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_MVT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/polybench/POLYBENCH_MVT-OMPTarget.cpp b/src/polybench/POLYBENCH_MVT-OMPTarget.cpp index 4487b0d52..acd7ad56a 100644 --- a/src/polybench/POLYBENCH_MVT-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_MVT-OMPTarget.cpp @@ -47,7 +47,7 @@ namespace polybench deallocOpenMPDeviceData(A, did); -void POLYBENCH_MVT::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_MVT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); diff --git a/src/polybench/POLYBENCH_MVT-Seq.cpp b/src/polybench/POLYBENCH_MVT-Seq.cpp index 3283dfa37..8d115b94e 100644 --- a/src/polybench/POLYBENCH_MVT-Seq.cpp +++ b/src/polybench/POLYBENCH_MVT-Seq.cpp @@ -19,7 +19,7 @@ namespace polybench { -void POLYBENCH_MVT::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_MVT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps= getRunReps(); diff --git a/src/polybench/POLYBENCH_MVT.cpp b/src/polybench/POLYBENCH_MVT.cpp index 60a3aca47..3354ca97d 100644 --- a/src/polybench/POLYBENCH_MVT.cpp +++ b/src/polybench/POLYBENCH_MVT.cpp @@ -68,7 +68,7 @@ POLYBENCH_MVT::~POLYBENCH_MVT() { } -void POLYBENCH_MVT::setUp(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_MVT::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; allocAndInitData(m_y1, m_N, vid); @@ -84,7 +84,7 @@ void POLYBENCH_MVT::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_x2, m_N, checksum_scale_factor ); } -void POLYBENCH_MVT::tearDown(VariantID vid, size_t /*tune_idx*/) +void POLYBENCH_MVT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_x1); diff --git a/src/stream/ADD-OMP.cpp b/src/stream/ADD-OMP.cpp index 3a0bae912..ae425a93f 100644 --- a/src/stream/ADD-OMP.cpp +++ b/src/stream/ADD-OMP.cpp @@ -18,7 +18,7 @@ namespace stream { -void ADD::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/stream/ADD-OMPTarget.cpp b/src/stream/ADD-OMPTarget.cpp index a97a2185a..2089472fa 100644 --- a/src/stream/ADD-OMPTarget.cpp +++ b/src/stream/ADD-OMPTarget.cpp @@ -40,7 +40,7 @@ namespace stream deallocOpenMPDeviceData(b, did); \ deallocOpenMPDeviceData(c, did); -void ADD::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void 
ADD::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/stream/ADD-Seq.cpp b/src/stream/ADD-Seq.cpp index 5d7daf46b..f421d44c2 100644 --- a/src/stream/ADD-Seq.cpp +++ b/src/stream/ADD-Seq.cpp @@ -18,7 +18,7 @@ namespace stream { -void ADD::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void ADD::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/stream/ADD.cpp b/src/stream/ADD.cpp index 6f194964e..904c0804b 100644 --- a/src/stream/ADD.cpp +++ b/src/stream/ADD.cpp @@ -58,7 +58,7 @@ ADD::~ADD() { } -void ADD::setUp(VariantID vid, size_t /*tune_idx*/) +void ADD::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitData(m_a, getActualProblemSize(), vid); allocAndInitData(m_b, getActualProblemSize(), vid); @@ -70,7 +70,7 @@ void ADD::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_c, getActualProblemSize()); } -void ADD::tearDown(VariantID vid, size_t /*tune_idx*/) +void ADD::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_a); diff --git a/src/stream/COPY-OMP.cpp b/src/stream/COPY-OMP.cpp index ba9ea5b50..c1b38e25f 100644 --- a/src/stream/COPY-OMP.cpp +++ b/src/stream/COPY-OMP.cpp @@ -18,7 +18,7 @@ namespace stream { -void COPY::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void COPY::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/stream/COPY-OMPTarget.cpp b/src/stream/COPY-OMPTarget.cpp index b06760a58..823a32b13 100644 --- a/src/stream/COPY-OMPTarget.cpp +++ b/src/stream/COPY-OMPTarget.cpp @@ -39,7 +39,7 @@ namespace stream deallocOpenMPDeviceData(c, did); -void COPY::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void COPY::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/stream/COPY-Seq.cpp b/src/stream/COPY-Seq.cpp index 371921c03..a807c0bee 100644 --- a/src/stream/COPY-Seq.cpp +++ b/src/stream/COPY-Seq.cpp @@ -18,7 +18,7 @@ namespace stream { -void COPY::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void COPY::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/stream/COPY.cpp b/src/stream/COPY.cpp index bef39fabe..251208a4d 100644 --- a/src/stream/COPY.cpp +++ b/src/stream/COPY.cpp @@ -58,7 +58,7 @@ COPY::~COPY() { } -void COPY::setUp(VariantID vid, size_t /*tune_idx*/) +void COPY::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitData(m_a, getActualProblemSize(), vid); allocAndInitDataConst(m_c, getActualProblemSize(), 0.0, vid); @@ -69,7 +69,7 @@ void COPY::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_c, getActualProblemSize()); } -void COPY::tearDown(VariantID vid, size_t /*tune_idx*/) +void COPY::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_a); diff --git a/src/stream/DOT-OMP.cpp b/src/stream/DOT-OMP.cpp index dee61554f..efd8e9ffa 100644 --- a/src/stream/DOT-OMP.cpp +++ b/src/stream/DOT-OMP.cpp @@ -18,7 +18,7 @@ namespace stream { -void DOT::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void 
DOT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/stream/DOT-OMPTarget.cpp b/src/stream/DOT-OMPTarget.cpp index f579e1901..27f96a5d1 100644 --- a/src/stream/DOT-OMPTarget.cpp +++ b/src/stream/DOT-OMPTarget.cpp @@ -37,7 +37,7 @@ namespace stream deallocOpenMPDeviceData(a, did); \ deallocOpenMPDeviceData(b, did); -void DOT::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void DOT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/stream/DOT-Seq.cpp b/src/stream/DOT-Seq.cpp index 7c84bcbe9..cde2263de 100644 --- a/src/stream/DOT-Seq.cpp +++ b/src/stream/DOT-Seq.cpp @@ -18,7 +18,7 @@ namespace stream { -void DOT::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void DOT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/stream/DOT.cpp b/src/stream/DOT.cpp index 4534463a5..0d9657a8a 100644 --- a/src/stream/DOT.cpp +++ b/src/stream/DOT.cpp @@ -58,7 +58,7 @@ DOT::~DOT() { } -void DOT::setUp(VariantID vid, size_t /*tune_idx*/) +void DOT::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitData(m_a, getActualProblemSize(), vid); allocAndInitData(m_b, getActualProblemSize(), vid); @@ -72,7 +72,7 @@ void DOT::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += m_dot; } -void DOT::tearDown(VariantID vid, size_t /*tune_idx*/) +void DOT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_a); diff --git a/src/stream/MUL-OMP.cpp b/src/stream/MUL-OMP.cpp index a82a1bf64..7b78bf819 100644 --- a/src/stream/MUL-OMP.cpp +++ b/src/stream/MUL-OMP.cpp @@ -18,7 +18,7 @@ namespace stream { -void MUL::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void MUL::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/stream/MUL-OMPTarget.cpp b/src/stream/MUL-OMPTarget.cpp index 55f4d7f8b..7e3141c78 100644 --- a/src/stream/MUL-OMPTarget.cpp +++ b/src/stream/MUL-OMPTarget.cpp @@ -38,7 +38,7 @@ namespace stream deallocOpenMPDeviceData(b, did); \ deallocOpenMPDeviceData(c, did); -void MUL::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void MUL::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/stream/MUL-Seq.cpp b/src/stream/MUL-Seq.cpp index 8e7569a76..837d26147 100644 --- a/src/stream/MUL-Seq.cpp +++ b/src/stream/MUL-Seq.cpp @@ -18,7 +18,7 @@ namespace stream { -void MUL::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void MUL::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/stream/MUL.cpp b/src/stream/MUL.cpp index eb3d917be..55eced2b0 100644 --- a/src/stream/MUL.cpp +++ b/src/stream/MUL.cpp @@ -58,7 +58,7 @@ MUL::~MUL() { } -void MUL::setUp(VariantID vid, size_t /*tune_idx*/) +void MUL::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_b, getActualProblemSize(), 0.0, vid); allocAndInitData(m_c, getActualProblemSize(), vid); @@ -70,7 +70,7 @@ void MUL::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_b, 
getActualProblemSize()); } -void MUL::tearDown(VariantID vid, size_t /*tune_idx*/) +void MUL::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_b); diff --git a/src/stream/TRIAD-OMP.cpp b/src/stream/TRIAD-OMP.cpp index ca86faf6b..f1c5c435d 100644 --- a/src/stream/TRIAD-OMP.cpp +++ b/src/stream/TRIAD-OMP.cpp @@ -18,7 +18,7 @@ namespace stream { -void TRIAD::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void TRIAD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/stream/TRIAD-OMPTarget.cpp b/src/stream/TRIAD-OMPTarget.cpp index d38adede3..c69e6cdbb 100644 --- a/src/stream/TRIAD-OMPTarget.cpp +++ b/src/stream/TRIAD-OMPTarget.cpp @@ -40,7 +40,7 @@ namespace stream deallocOpenMPDeviceData(b, did); \ deallocOpenMPDeviceData(c, did); -void TRIAD::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void TRIAD::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/stream/TRIAD-Seq.cpp b/src/stream/TRIAD-Seq.cpp index e2d759959..0477202c0 100644 --- a/src/stream/TRIAD-Seq.cpp +++ b/src/stream/TRIAD-Seq.cpp @@ -18,7 +18,7 @@ namespace stream { -void TRIAD::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void TRIAD::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/stream/TRIAD.cpp b/src/stream/TRIAD.cpp index c0b143872..543b19642 100644 --- a/src/stream/TRIAD.cpp +++ b/src/stream/TRIAD.cpp @@ -62,7 +62,7 @@ TRIAD::~TRIAD() { } -void TRIAD::setUp(VariantID vid, size_t /*tune_idx*/) +void TRIAD::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_a, getActualProblemSize(), 0.0, vid); allocAndInitData(m_b, getActualProblemSize(), vid); @@ -75,7 +75,7 @@ void TRIAD::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_a, getActualProblemSize(), checksum_scale_factor ); } -void TRIAD::tearDown(VariantID vid, size_t /*tune_idx*/) +void TRIAD::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_a); From 47b84350adaa089a4f1a442cc85ab1c8047cd419 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Mon, 4 Apr 2022 09:19:36 -0700 Subject: [PATCH 299/392] update RAJA to slightly ahead of develop - include RAJA macro rework --- tpl/RAJA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpl/RAJA b/tpl/RAJA index 4351fe6a5..bd6f2fbe8 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit 4351fe6a50bd579511a625b017c9e054885e7fd2 +Subproject commit bd6f2fbe8d94a8893d98ee0602c9e154969ed1f2 From 0b5f3858c80b8e58374ec6ce75412e224f0963c6 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 4 Apr 2022 13:24:33 -0700 Subject: [PATCH 300/392] Use default gpu block size in sort --- src/algorithm/SORT-Cuda.cpp | 8 +------- src/algorithm/SORT-Hip.cpp | 8 +------- src/algorithm/SORTPAIRS-Cuda.cpp | 8 +------- src/algorithm/SORTPAIRS-Hip.cpp | 8 +------- 4 files changed, 4 insertions(+), 28 deletions(-) diff --git a/src/algorithm/SORT-Cuda.cpp b/src/algorithm/SORT-Cuda.cpp index 1c7a67381..599a9f246 100644 --- a/src/algorithm/SORT-Cuda.cpp +++ b/src/algorithm/SORT-Cuda.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace algorithm { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - 
#define SORT_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(x, m_x, iend*run_reps); @@ -50,7 +44,7 @@ void SORT::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::sort< RAJA::cuda_exec >(RAJA_SORT_ARGS); + RAJA::sort< RAJA::cuda_exec >(RAJA_SORT_ARGS); } stopTimer(); diff --git a/src/algorithm/SORT-Hip.cpp b/src/algorithm/SORT-Hip.cpp index 80f173a00..6c80fbc97 100644 --- a/src/algorithm/SORT-Hip.cpp +++ b/src/algorithm/SORT-Hip.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace algorithm { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define SORT_DATA_SETUP_HIP \ allocAndInitHipDeviceData(x, m_x, iend*run_reps); @@ -50,7 +44,7 @@ void SORT::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::sort< RAJA::hip_exec >(RAJA_SORT_ARGS); + RAJA::sort< RAJA::hip_exec >(RAJA_SORT_ARGS); } stopTimer(); diff --git a/src/algorithm/SORTPAIRS-Cuda.cpp b/src/algorithm/SORTPAIRS-Cuda.cpp index 9e2c1ec93..0c09bfe1f 100644 --- a/src/algorithm/SORTPAIRS-Cuda.cpp +++ b/src/algorithm/SORTPAIRS-Cuda.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace algorithm { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define SORTPAIRS_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(x, m_x, iend*run_reps); \ allocAndInitCudaDeviceData(i, m_i, iend*run_reps); @@ -53,7 +47,7 @@ void SORTPAIRS::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_id startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::sort_pairs< RAJA::cuda_exec >(RAJA_SORTPAIRS_ARGS); + RAJA::sort_pairs< RAJA::cuda_exec >(RAJA_SORTPAIRS_ARGS); } stopTimer(); diff --git a/src/algorithm/SORTPAIRS-Hip.cpp b/src/algorithm/SORTPAIRS-Hip.cpp index f927a15d9..fbdbc660d 100644 --- a/src/algorithm/SORTPAIRS-Hip.cpp +++ b/src/algorithm/SORTPAIRS-Hip.cpp @@ -21,12 +21,6 @@ namespace rajaperf namespace algorithm { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define SORTPAIRS_DATA_SETUP_HIP \ allocAndInitHipDeviceData(x, m_x, iend*run_reps); \ allocAndInitHipDeviceData(i, m_i, iend*run_reps); @@ -53,7 +47,7 @@ void SORTPAIRS::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::sort_pairs< RAJA::hip_exec >(RAJA_SORTPAIRS_ARGS); + RAJA::sort_pairs< RAJA::hip_exec >(RAJA_SORTPAIRS_ARGS); } stopTimer(); From 01a1c67654cd4857ab5e9a77e85f0db52bbaa8a5 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 4 Apr 2022 13:26:31 -0700 Subject: [PATCH 301/392] Rename gpu block size list_type This allows me to use the list type directly and makes it less confusing what is happening when you make the list. 
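For orientation, a minimal sketch of what this rename means for a kernel header (this is an illustrative sketch, not text from the patch: the EXAMPLE class and the explicit size list are hypothetical, and it assumes the camp::int_seq-based list_type/make_list_type aliases added to src/common/GPUUtils.hpp in this commit):

#include "common/GPUUtils.hpp"

class EXAMPLE : public KernelBase
{
private:
  static const size_t default_gpu_block_size = 256;

  // What kernel headers previously spelled as
  //   gpu_block_size::list_type<default_gpu_block_size>
  // is now written with the make_* alias, which still falls back to the
  // default size when no gpu_block_sizes configuration list is provided:
  using gpu_block_sizes_type =
      gpu_block_size::make_list_type<default_gpu_block_size>;

  // list_type itself is now just the raw variadic sequence alias, so it can
  // be used directly to name an explicit set of candidate block sizes
  // (the sizes below are made up for illustration):
  using explicit_block_sizes = gpu_block_size::list_type<128, 256, 512>;
};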
--- src/apps/DEL_DOT_VEC_2D.hpp | 2 +- src/apps/ENERGY.hpp | 2 +- src/apps/FIR.hpp | 2 +- src/apps/HALOEXCHANGE.hpp | 2 +- src/apps/HALOEXCHANGE_FUSED.hpp | 2 +- src/apps/LTIMES.hpp | 2 +- src/apps/LTIMES_NOVIEW.hpp | 2 +- src/apps/NODAL_ACCUMULATION_3D.hpp | 2 +- src/apps/PRESSURE.hpp | 2 +- src/apps/VOL3D.hpp | 2 +- src/basic/DAXPY.hpp | 2 +- src/basic/DAXPY_ATOMIC.hpp | 2 +- src/basic/IF_QUAD.hpp | 2 +- src/basic/INIT3.hpp | 2 +- src/basic/INIT_VIEW1D.hpp | 2 +- src/basic/INIT_VIEW1D_OFFSET.hpp | 2 +- src/basic/MAT_MAT_SHARED.hpp | 2 +- src/basic/MULADDSUB.hpp | 2 +- src/basic/NESTED_INIT.hpp | 2 +- src/basic/PI_ATOMIC.hpp | 2 +- src/basic/PI_REDUCE.hpp | 2 +- src/basic/REDUCE3_INT.hpp | 2 +- src/basic/TRAP_INT.hpp | 2 +- src/common/GPUUtils.hpp | 15 +++++++++------ src/lcals/DIFF_PREDICT.hpp | 2 +- src/lcals/EOS.hpp | 2 +- src/lcals/FIRST_DIFF.hpp | 2 +- src/lcals/FIRST_MIN.hpp | 2 +- src/lcals/FIRST_SUM.hpp | 2 +- src/lcals/GEN_LIN_RECUR.hpp | 2 +- src/lcals/HYDRO_1D.hpp | 2 +- src/lcals/HYDRO_2D.hpp | 2 +- src/lcals/INT_PREDICT.hpp | 2 +- src/lcals/PLANCKIAN.hpp | 2 +- src/lcals/TRIDIAG_ELIM.hpp | 2 +- src/polybench/POLYBENCH_2MM.hpp | 2 +- src/polybench/POLYBENCH_3MM.hpp | 2 +- src/polybench/POLYBENCH_ADI.hpp | 2 +- src/polybench/POLYBENCH_ATAX.hpp | 2 +- src/polybench/POLYBENCH_FDTD_2D.hpp | 2 +- src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp | 2 +- src/polybench/POLYBENCH_GEMM.hpp | 2 +- src/polybench/POLYBENCH_GEMVER.hpp | 2 +- src/polybench/POLYBENCH_GESUMMV.hpp | 2 +- src/polybench/POLYBENCH_HEAT_3D.hpp | 2 +- src/polybench/POLYBENCH_JACOBI_1D.hpp | 2 +- src/polybench/POLYBENCH_JACOBI_2D.hpp | 2 +- src/polybench/POLYBENCH_MVT.hpp | 2 +- src/stream/ADD.hpp | 2 +- src/stream/COPY.hpp | 2 +- src/stream/DOT.hpp | 2 +- src/stream/MUL.hpp | 2 +- src/stream/TRIAD.hpp | 2 +- 53 files changed, 61 insertions(+), 58 deletions(-) diff --git a/src/apps/DEL_DOT_VEC_2D.hpp b/src/apps/DEL_DOT_VEC_2D.hpp index f75491d63..60d577a05 100644 --- a/src/apps/DEL_DOT_VEC_2D.hpp +++ b/src/apps/DEL_DOT_VEC_2D.hpp @@ -123,7 +123,7 @@ class DEL_DOT_VEC_2D : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/apps/ENERGY.hpp b/src/apps/ENERGY.hpp index d2fcc9d88..6461fdd5f 100644 --- a/src/apps/ENERGY.hpp +++ b/src/apps/ENERGY.hpp @@ -213,7 +213,7 @@ class ENERGY : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Real_ptr m_e_new; Real_ptr m_e_old; diff --git a/src/apps/FIR.hpp b/src/apps/FIR.hpp index 0d643582b..dd46d9934 100644 --- a/src/apps/FIR.hpp +++ b/src/apps/FIR.hpp @@ -88,7 +88,7 @@ class FIR : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Real_ptr m_in; Real_ptr m_out; diff --git a/src/apps/HALOEXCHANGE.hpp b/src/apps/HALOEXCHANGE.hpp index 705eed88c..5d653762a 100644 --- a/src/apps/HALOEXCHANGE.hpp +++ b/src/apps/HALOEXCHANGE.hpp @@ -103,7 +103,7 @@ class HALOEXCHANGE : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; static const int s_num_neighbors = 26; diff --git 
a/src/apps/HALOEXCHANGE_FUSED.hpp b/src/apps/HALOEXCHANGE_FUSED.hpp index dcb8a701a..e47c1e14e 100644 --- a/src/apps/HALOEXCHANGE_FUSED.hpp +++ b/src/apps/HALOEXCHANGE_FUSED.hpp @@ -147,7 +147,7 @@ class HALOEXCHANGE_FUSED : public KernelBase private: static const size_t default_gpu_block_size = 1024; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; static const int s_num_neighbors = 26; diff --git a/src/apps/LTIMES.hpp b/src/apps/LTIMES.hpp index a8d488a93..31eae0f83 100644 --- a/src/apps/LTIMES.hpp +++ b/src/apps/LTIMES.hpp @@ -126,7 +126,7 @@ class LTIMES : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type>; Real_ptr m_phidat; diff --git a/src/apps/LTIMES_NOVIEW.hpp b/src/apps/LTIMES_NOVIEW.hpp index 98f5733f6..1385864fb 100644 --- a/src/apps/LTIMES_NOVIEW.hpp +++ b/src/apps/LTIMES_NOVIEW.hpp @@ -76,7 +76,7 @@ class LTIMES_NOVIEW : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type>; Real_ptr m_phidat; diff --git a/src/apps/NODAL_ACCUMULATION_3D.hpp b/src/apps/NODAL_ACCUMULATION_3D.hpp index 049ca1e78..a574f331a 100644 --- a/src/apps/NODAL_ACCUMULATION_3D.hpp +++ b/src/apps/NODAL_ACCUMULATION_3D.hpp @@ -105,7 +105,7 @@ class NODAL_ACCUMULATION_3D : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Real_ptr m_x; Real_ptr m_vol; diff --git a/src/apps/PRESSURE.hpp b/src/apps/PRESSURE.hpp index 129972d46..6421ce6b0 100644 --- a/src/apps/PRESSURE.hpp +++ b/src/apps/PRESSURE.hpp @@ -82,7 +82,7 @@ class PRESSURE : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Real_ptr m_compression; Real_ptr m_bvc; diff --git a/src/apps/VOL3D.hpp b/src/apps/VOL3D.hpp index e11b3caf1..9ddedbd19 100644 --- a/src/apps/VOL3D.hpp +++ b/src/apps/VOL3D.hpp @@ -179,7 +179,7 @@ class VOL3D : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/basic/DAXPY.hpp b/src/basic/DAXPY.hpp index 1d6e3b61b..db8501e9f 100644 --- a/src/basic/DAXPY.hpp +++ b/src/basic/DAXPY.hpp @@ -62,7 +62,7 @@ class DAXPY : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/basic/DAXPY_ATOMIC.hpp b/src/basic/DAXPY_ATOMIC.hpp index 4ea51cb48..909939a45 100644 --- a/src/basic/DAXPY_ATOMIC.hpp +++ b/src/basic/DAXPY_ATOMIC.hpp @@ -65,7 +65,7 @@ class DAXPY_ATOMIC : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/basic/IF_QUAD.hpp b/src/basic/IF_QUAD.hpp index eb949b510..4d2a22c22 100644 --- a/src/basic/IF_QUAD.hpp +++ b/src/basic/IF_QUAD.hpp @@ -79,7 +79,7 @@ class IF_QUAD : public KernelBase private: static const size_t default_gpu_block_size = 256; - 
using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Real_ptr m_a; Real_ptr m_b; diff --git a/src/basic/INIT3.hpp b/src/basic/INIT3.hpp index a53f105a9..44f3622de 100644 --- a/src/basic/INIT3.hpp +++ b/src/basic/INIT3.hpp @@ -65,7 +65,7 @@ class INIT3 : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Real_ptr m_out1; Real_ptr m_out2; diff --git a/src/basic/INIT_VIEW1D.hpp b/src/basic/INIT_VIEW1D.hpp index 00b9653ab..b51d38b79 100644 --- a/src/basic/INIT_VIEW1D.hpp +++ b/src/basic/INIT_VIEW1D.hpp @@ -76,7 +76,7 @@ class INIT_VIEW1D : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Real_ptr m_a; Real_type m_val; diff --git a/src/basic/INIT_VIEW1D_OFFSET.hpp b/src/basic/INIT_VIEW1D_OFFSET.hpp index e7af48895..be597496d 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.hpp +++ b/src/basic/INIT_VIEW1D_OFFSET.hpp @@ -75,7 +75,7 @@ class INIT_VIEW1D_OFFSET : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Real_ptr m_a; Real_type m_val; diff --git a/src/basic/MAT_MAT_SHARED.hpp b/src/basic/MAT_MAT_SHARED.hpp index a4bf9c443..095721c27 100644 --- a/src/basic/MAT_MAT_SHARED.hpp +++ b/src/basic/MAT_MAT_SHARED.hpp @@ -149,7 +149,7 @@ class MAT_MAT_SHARED : public KernelBase { private: static const size_t default_gpu_block_size = TL_SZ * TL_SZ; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Real_ptr m_A; Real_ptr m_B; diff --git a/src/basic/MULADDSUB.hpp b/src/basic/MULADDSUB.hpp index 5cf36f996..30ad11a54 100644 --- a/src/basic/MULADDSUB.hpp +++ b/src/basic/MULADDSUB.hpp @@ -68,7 +68,7 @@ class MULADDSUB : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Real_ptr m_out1; Real_ptr m_out2; diff --git a/src/basic/NESTED_INIT.hpp b/src/basic/NESTED_INIT.hpp index a2213abcc..13da52cf2 100644 --- a/src/basic/NESTED_INIT.hpp +++ b/src/basic/NESTED_INIT.hpp @@ -68,7 +68,7 @@ class NESTED_INIT : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type>; Index_type m_array_length; diff --git a/src/basic/PI_ATOMIC.hpp b/src/basic/PI_ATOMIC.hpp index 67c6a29c2..10c674dda 100644 --- a/src/basic/PI_ATOMIC.hpp +++ b/src/basic/PI_ATOMIC.hpp @@ -64,7 +64,7 @@ class PI_ATOMIC : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Real_type m_dx; Real_ptr m_pi; diff --git a/src/basic/PI_REDUCE.hpp b/src/basic/PI_REDUCE.hpp index 901b9959d..c7cc3258a 100644 --- a/src/basic/PI_REDUCE.hpp +++ b/src/basic/PI_REDUCE.hpp @@ -66,7 +66,7 @@ class PI_REDUCE : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Real_type m_dx; Real_type m_pi; diff 
--git a/src/basic/REDUCE3_INT.hpp b/src/basic/REDUCE3_INT.hpp index 1f13e457b..93ad766c2 100644 --- a/src/basic/REDUCE3_INT.hpp +++ b/src/basic/REDUCE3_INT.hpp @@ -80,7 +80,7 @@ class REDUCE3_INT : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Int_ptr m_vec; Int_type m_vsum; diff --git a/src/basic/TRAP_INT.hpp b/src/basic/TRAP_INT.hpp index f2e714f09..50acfeb79 100644 --- a/src/basic/TRAP_INT.hpp +++ b/src/basic/TRAP_INT.hpp @@ -77,7 +77,7 @@ class TRAP_INT : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Real_type m_x0; Real_type m_xp; diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp index 97acbba0e..76362ee1c 100644 --- a/src/common/GPUUtils.hpp +++ b/src/common/GPUUtils.hpp @@ -138,18 +138,21 @@ struct ExactSqrt static constexpr bool valid() { return sqrt(I)*sqrt(I) == I; } }; +template < size_t... block_sizes > +using list_type = camp::int_seq; + // A camp::int_seq of size_t's that is rajaperf::configuration::gpu_block_sizes // if rajaperf::configuration::gpu_block_sizes is not empty // and a camp::int_seq of default_block_size otherwise // with invalid entries removed according to validity_checker template < size_t default_block_size, typename validity_checker = AllowAny > -using list_type = +using make_list_type = typename detail::remove_invalid::size > 0), - rajaperf::configuration::gpu_block_sizes, - camp::int_seq - >::type - >::type; + typename std::conditional< (detail::SizeOfIntSeq::size > 0), + rajaperf::configuration::gpu_block_sizes, + list_type + >::type + >::type; } // closing brace for gpu_block_size namespace diff --git a/src/lcals/DIFF_PREDICT.hpp b/src/lcals/DIFF_PREDICT.hpp index f28054f40..130071412 100644 --- a/src/lcals/DIFF_PREDICT.hpp +++ b/src/lcals/DIFF_PREDICT.hpp @@ -103,7 +103,7 @@ class DIFF_PREDICT : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Real_ptr m_px; Real_ptr m_cx; diff --git a/src/lcals/EOS.hpp b/src/lcals/EOS.hpp index efe36163a..f2d38b5e9 100644 --- a/src/lcals/EOS.hpp +++ b/src/lcals/EOS.hpp @@ -72,7 +72,7 @@ class EOS : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/lcals/FIRST_DIFF.hpp b/src/lcals/FIRST_DIFF.hpp index 9f31800d7..51de73049 100644 --- a/src/lcals/FIRST_DIFF.hpp +++ b/src/lcals/FIRST_DIFF.hpp @@ -62,7 +62,7 @@ class FIRST_DIFF : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/lcals/FIRST_MIN.hpp b/src/lcals/FIRST_MIN.hpp index 9cd7dd25f..c10839ec7 100644 --- a/src/lcals/FIRST_MIN.hpp +++ b/src/lcals/FIRST_MIN.hpp @@ -91,7 +91,7 @@ class FIRST_MIN : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Real_ptr m_x; Real_type m_xmin_init; diff 
--git a/src/lcals/FIRST_SUM.hpp b/src/lcals/FIRST_SUM.hpp index 4889148c7..5f019c08c 100644 --- a/src/lcals/FIRST_SUM.hpp +++ b/src/lcals/FIRST_SUM.hpp @@ -65,7 +65,7 @@ class FIRST_SUM : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/lcals/GEN_LIN_RECUR.hpp b/src/lcals/GEN_LIN_RECUR.hpp index 8ed1d3073..d6d20b43b 100644 --- a/src/lcals/GEN_LIN_RECUR.hpp +++ b/src/lcals/GEN_LIN_RECUR.hpp @@ -86,7 +86,7 @@ class GEN_LIN_RECUR : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Real_ptr m_b5; Real_ptr m_sa; diff --git a/src/lcals/HYDRO_1D.hpp b/src/lcals/HYDRO_1D.hpp index db5d8f8cd..692e40a8e 100644 --- a/src/lcals/HYDRO_1D.hpp +++ b/src/lcals/HYDRO_1D.hpp @@ -67,7 +67,7 @@ class HYDRO_1D : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/lcals/HYDRO_2D.hpp b/src/lcals/HYDRO_2D.hpp index 46892698c..4363ea633 100644 --- a/src/lcals/HYDRO_2D.hpp +++ b/src/lcals/HYDRO_2D.hpp @@ -163,7 +163,7 @@ class HYDRO_2D : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type>; Real_ptr m_za; diff --git a/src/lcals/INT_PREDICT.hpp b/src/lcals/INT_PREDICT.hpp index 0adfaac47..7a3c6fda6 100644 --- a/src/lcals/INT_PREDICT.hpp +++ b/src/lcals/INT_PREDICT.hpp @@ -82,7 +82,7 @@ class INT_PREDICT : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Index_type m_array_length; Index_type m_offset; diff --git a/src/lcals/PLANCKIAN.hpp b/src/lcals/PLANCKIAN.hpp index 632d11b86..46fba63db 100644 --- a/src/lcals/PLANCKIAN.hpp +++ b/src/lcals/PLANCKIAN.hpp @@ -67,7 +67,7 @@ class PLANCKIAN : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/lcals/TRIDIAG_ELIM.hpp b/src/lcals/TRIDIAG_ELIM.hpp index fe44df691..f593985a5 100644 --- a/src/lcals/TRIDIAG_ELIM.hpp +++ b/src/lcals/TRIDIAG_ELIM.hpp @@ -67,7 +67,7 @@ class TRIDIAG_ELIM : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Real_ptr m_xout; Real_ptr m_xin; diff --git a/src/polybench/POLYBENCH_2MM.hpp b/src/polybench/POLYBENCH_2MM.hpp index 27993c3f7..0624257f7 100644 --- a/src/polybench/POLYBENCH_2MM.hpp +++ b/src/polybench/POLYBENCH_2MM.hpp @@ -137,7 +137,7 @@ class POLYBENCH_2MM : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type>; Index_type m_ni; diff --git a/src/polybench/POLYBENCH_3MM.hpp b/src/polybench/POLYBENCH_3MM.hpp index df4734df0..0cf9aabff 100644 --- a/src/polybench/POLYBENCH_3MM.hpp +++ b/src/polybench/POLYBENCH_3MM.hpp @@ -163,7 +163,7 @@ class POLYBENCH_3MM : public KernelBase 
private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type>; Index_type m_ni; diff --git a/src/polybench/POLYBENCH_ADI.hpp b/src/polybench/POLYBENCH_ADI.hpp index aafc643f9..7cd579964 100644 --- a/src/polybench/POLYBENCH_ADI.hpp +++ b/src/polybench/POLYBENCH_ADI.hpp @@ -205,7 +205,7 @@ class POLYBENCH_ADI : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Index_type m_n; Index_type m_tsteps; diff --git a/src/polybench/POLYBENCH_ATAX.hpp b/src/polybench/POLYBENCH_ATAX.hpp index eeb641b28..8f28a1470 100644 --- a/src/polybench/POLYBENCH_ATAX.hpp +++ b/src/polybench/POLYBENCH_ATAX.hpp @@ -125,7 +125,7 @@ class POLYBENCH_ATAX : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Index_type m_N; Real_ptr m_tmp; diff --git a/src/polybench/POLYBENCH_FDTD_2D.hpp b/src/polybench/POLYBENCH_FDTD_2D.hpp index c9133c4a6..7d3696293 100644 --- a/src/polybench/POLYBENCH_FDTD_2D.hpp +++ b/src/polybench/POLYBENCH_FDTD_2D.hpp @@ -123,7 +123,7 @@ class POLYBENCH_FDTD_2D : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type>; Index_type m_nx; diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp index 261d7818b..283231d29 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp @@ -86,7 +86,7 @@ class POLYBENCH_FLOYD_WARSHALL : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type>; Index_type m_N; diff --git a/src/polybench/POLYBENCH_GEMM.hpp b/src/polybench/POLYBENCH_GEMM.hpp index 92c400583..ae218397d 100644 --- a/src/polybench/POLYBENCH_GEMM.hpp +++ b/src/polybench/POLYBENCH_GEMM.hpp @@ -109,7 +109,7 @@ class POLYBENCH_GEMM : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type>; Index_type m_ni; diff --git a/src/polybench/POLYBENCH_GEMVER.hpp b/src/polybench/POLYBENCH_GEMVER.hpp index 97a6c6d2c..80c96fa94 100644 --- a/src/polybench/POLYBENCH_GEMVER.hpp +++ b/src/polybench/POLYBENCH_GEMVER.hpp @@ -162,7 +162,7 @@ class POLYBENCH_GEMVER : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type>; Index_type m_n; diff --git a/src/polybench/POLYBENCH_GESUMMV.hpp b/src/polybench/POLYBENCH_GESUMMV.hpp index 5139393ed..c8f71ee84 100644 --- a/src/polybench/POLYBENCH_GESUMMV.hpp +++ b/src/polybench/POLYBENCH_GESUMMV.hpp @@ -108,7 +108,7 @@ class POLYBENCH_GESUMMV : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Index_type m_N; diff --git a/src/polybench/POLYBENCH_HEAT_3D.hpp b/src/polybench/POLYBENCH_HEAT_3D.hpp index 04d8e7b38..81ab06e0e 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.hpp +++ b/src/polybench/POLYBENCH_HEAT_3D.hpp @@ -134,7 +134,7 @@ class POLYBENCH_HEAT_3D : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type>; 
Index_type m_N; diff --git a/src/polybench/POLYBENCH_JACOBI_1D.hpp b/src/polybench/POLYBENCH_JACOBI_1D.hpp index b26a8248f..cb3131490 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.hpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.hpp @@ -80,7 +80,7 @@ class POLYBENCH_JACOBI_1D : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Index_type m_N; Index_type m_tsteps; diff --git a/src/polybench/POLYBENCH_JACOBI_2D.hpp b/src/polybench/POLYBENCH_JACOBI_2D.hpp index 3c8611e81..a2ba63181 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.hpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.hpp @@ -100,7 +100,7 @@ class POLYBENCH_JACOBI_2D : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type>; Index_type m_N; diff --git a/src/polybench/POLYBENCH_MVT.hpp b/src/polybench/POLYBENCH_MVT.hpp index 8a397f8dd..dce40baf2 100644 --- a/src/polybench/POLYBENCH_MVT.hpp +++ b/src/polybench/POLYBENCH_MVT.hpp @@ -122,7 +122,7 @@ class POLYBENCH_MVT : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Index_type m_N; Real_ptr m_x1; diff --git a/src/stream/ADD.hpp b/src/stream/ADD.hpp index 3f64ff855..07d0dea79 100644 --- a/src/stream/ADD.hpp +++ b/src/stream/ADD.hpp @@ -62,7 +62,7 @@ class ADD : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Real_ptr m_a; Real_ptr m_b; diff --git a/src/stream/COPY.hpp b/src/stream/COPY.hpp index 3e9f3569f..0f23bfa68 100644 --- a/src/stream/COPY.hpp +++ b/src/stream/COPY.hpp @@ -61,7 +61,7 @@ class COPY : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Real_ptr m_a; Real_ptr m_c; diff --git a/src/stream/DOT.hpp b/src/stream/DOT.hpp index 383d1e07e..64d70c630 100644 --- a/src/stream/DOT.hpp +++ b/src/stream/DOT.hpp @@ -61,7 +61,7 @@ class DOT : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Real_ptr m_a; Real_ptr m_b; diff --git a/src/stream/MUL.hpp b/src/stream/MUL.hpp index 5cb9075a6..1e79e17f9 100644 --- a/src/stream/MUL.hpp +++ b/src/stream/MUL.hpp @@ -62,7 +62,7 @@ class MUL : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Real_ptr m_b; Real_ptr m_c; diff --git a/src/stream/TRIAD.hpp b/src/stream/TRIAD.hpp index 6a067f708..80685ce3c 100644 --- a/src/stream/TRIAD.hpp +++ b/src/stream/TRIAD.hpp @@ -63,7 +63,7 @@ class TRIAD : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Real_ptr m_a; Real_ptr m_b; From ff410acaa7ef130adab23b33c04e6102e36781aa Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 4 Apr 2022 13:46:42 -0700 Subject: [PATCH 302/392] Use gpu block size infrastructure in 
more kernels This makes it so the gpu block size is always in the tuning name if it is known and default is only used when the block size is determined by a library. --- src/apps/DIFFUSION3DPA-Cuda.cpp | 7 +++++-- src/apps/DIFFUSION3DPA-Hip.cpp | 13 ++++++++----- src/apps/DIFFUSION3DPA.hpp | 8 ++++++++ src/apps/MASS3DPA-Cuda.cpp | 7 +++++-- src/apps/MASS3DPA-Hip.cpp | 11 +++++++---- src/apps/MASS3DPA.hpp | 8 ++++++++ 6 files changed, 41 insertions(+), 13 deletions(-) diff --git a/src/apps/DIFFUSION3DPA-Cuda.cpp b/src/apps/DIFFUSION3DPA-Cuda.cpp index 47ddf66f9..9ceafb94c 100644 --- a/src/apps/DIFFUSION3DPA-Cuda.cpp +++ b/src/apps/DIFFUSION3DPA-Cuda.cpp @@ -120,7 +120,8 @@ __global__ void Diffusion3DPA(const Real_ptr Basis, } } -void DIFFUSION3DPA::runCudaVariant(VariantID vid, size_t tune_idx) { +template < size_t block_size > +void DIFFUSION3DPA::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); DIFFUSION3DPA_DATA_SETUP; @@ -136,7 +137,7 @@ void DIFFUSION3DPA::runCudaVariant(VariantID vid, size_t tune_idx) { dim3 nthreads_per_block(DPA_Q1D, DPA_Q1D, DPA_Q1D); - Diffusion3DPA<<>>( + Diffusion3DPA<<>>( Basis, dBasis, D, X, Y, symmetric); cudaErrchk(cudaGetLastError()); @@ -365,6 +366,8 @@ void DIFFUSION3DPA::runCudaVariant(VariantID vid, size_t tune_idx) { } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(DIFFUSION3DPA, Cuda) + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/DIFFUSION3DPA-Hip.cpp b/src/apps/DIFFUSION3DPA-Hip.cpp index 8dd03a3ae..58a40a77e 100644 --- a/src/apps/DIFFUSION3DPA-Hip.cpp +++ b/src/apps/DIFFUSION3DPA-Hip.cpp @@ -120,7 +120,8 @@ __global__ void Diffusion3DPA(const Real_ptr Basis, } } -void DIFFUSION3DPA::runHipVariant(VariantID vid, size_t tune_idx) { +template < size_t block_size > +void DIFFUSION3DPA::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); DIFFUSION3DPA_DATA_SETUP; @@ -131,14 +132,14 @@ void DIFFUSION3DPA::runHipVariant(VariantID vid, size_t tune_idx) { DIFFUSION3DPA_DATA_SETUP_HIP; - dim3 grid_size(NE); - dim3 block_size(DPA_Q1D, DPA_Q1D, DPA_Q1D); + dim3 nblocks(NE); + dim3 nthreads_per_block(DPA_Q1D, DPA_Q1D, DPA_Q1D); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipLaunchKernelGGL((Diffusion3DPA), - dim3(grid_size), dim3(block_size), 0, 0, + hipLaunchKernelGGL((Diffusion3DPA), + dim3(nblocks), dim3(nthreads_per_block), 0, 0, Basis, dBasis, D, X, Y, symmetric); hipErrchk(hipGetLastError()); @@ -367,6 +368,8 @@ void DIFFUSION3DPA::runHipVariant(VariantID vid, size_t tune_idx) { } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(DIFFUSION3DPA, Hip) + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/DIFFUSION3DPA.hpp b/src/apps/DIFFUSION3DPA.hpp index b5dfc7cec..b0ba7c977 100644 --- a/src/apps/DIFFUSION3DPA.hpp +++ b/src/apps/DIFFUSION3DPA.hpp @@ -482,8 +482,16 @@ class DIFFUSION3DPA : public KernelBase void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: static const size_t default_gpu_block_size = DPA_Q1D * DPA_Q1D * DPA_Q1D; + using gpu_block_sizes_type = gpu_block_size::list_type; Real_ptr m_B; Real_ptr m_Bt; diff --git a/src/apps/MASS3DPA-Cuda.cpp b/src/apps/MASS3DPA-Cuda.cpp index 
3f3011fd3..b872a2a3c 100644 --- a/src/apps/MASS3DPA-Cuda.cpp +++ b/src/apps/MASS3DPA-Cuda.cpp @@ -102,7 +102,8 @@ __global__ void Mass3DPA(const Real_ptr B, const Real_ptr Bt, } } -void MASS3DPA::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { +template < size_t block_size > +void MASS3DPA::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); MASS3DPA_DATA_SETUP; @@ -118,7 +119,7 @@ void MASS3DPA::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx dim3 nthreads_per_block(MPA_Q1D, MPA_Q1D, 1); - Mass3DPA<<>>(B, Bt, D, X, Y); + Mass3DPA<<>>(B, Bt, D, X, Y); cudaErrchk( cudaGetLastError() ); } @@ -278,6 +279,8 @@ void MASS3DPA::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MASS3DPA, Cuda) + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/MASS3DPA-Hip.cpp b/src/apps/MASS3DPA-Hip.cpp index 01ffead73..804a858fa 100644 --- a/src/apps/MASS3DPA-Hip.cpp +++ b/src/apps/MASS3DPA-Hip.cpp @@ -102,7 +102,8 @@ __global__ void Mass3DPA(const Real_ptr B, const Real_ptr Bt, } } -void MASS3DPA::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { +template < size_t block_size > +void MASS3DPA::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); MASS3DPA_DATA_SETUP; @@ -113,13 +114,13 @@ void MASS3DPA::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) MASS3DPA_DATA_SETUP_HIP; - dim3 grid_size(NE); - dim3 block_size(MPA_Q1D, MPA_Q1D, 1); + dim3 nblocks(NE); + dim3 nthreads_per_block(MPA_Q1D, MPA_Q1D, 1); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipLaunchKernelGGL((Mass3DPA), dim3(grid_size), dim3(block_size), 0, 0, + hipLaunchKernelGGL((Mass3DPA), dim3(nblocks), dim3(nthreads_per_block), 0, 0, B, Bt, D, X, Y); hipErrchk( hipGetLastError() ); @@ -280,6 +281,8 @@ void MASS3DPA::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MASS3DPA, Hip) + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/MASS3DPA.hpp b/src/apps/MASS3DPA.hpp index 24a3c5baf..0d1c3a42d 100644 --- a/src/apps/MASS3DPA.hpp +++ b/src/apps/MASS3DPA.hpp @@ -364,8 +364,16 @@ class MASS3DPA : public KernelBase void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + private: static const size_t default_gpu_block_size = MPA_Q1D * MPA_Q1D; + using gpu_block_sizes_type = gpu_block_size::list_type; Real_ptr m_B; Real_ptr m_Bt; From 7d2bdbf0e169c70d9bd5fd5c1617910e6a8bace6 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 4 Apr 2022 14:17:56 -0700 Subject: [PATCH 303/392] Use UNUSED_ARG for tune_idx --- src/algorithm/SCAN-Cuda.cpp | 2 +- src/algorithm/SCAN-Hip.cpp | 2 +- src/algorithm/SCAN-OMP.cpp | 2 +- src/algorithm/SCAN-OMPTarget.cpp | 2 +- src/algorithm/SCAN-Seq.cpp | 2 +- src/basic/INDEXLIST-OMP.cpp | 2 +- src/basic/INDEXLIST-OMPTarget.cpp | 2 +- src/basic/INDEXLIST-Seq.cpp | 2 +- src/basic/INDEXLIST_3LOOP-OMP.cpp | 2 +- src/basic/INDEXLIST_3LOOP-OMPTarget.cpp | 2 +- src/basic/INDEXLIST_3LOOP-Seq.cpp | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/algorithm/SCAN-Cuda.cpp 
b/src/algorithm/SCAN-Cuda.cpp index e6819884f..b19e5a617 100644 --- a/src/algorithm/SCAN-Cuda.cpp +++ b/src/algorithm/SCAN-Cuda.cpp @@ -40,7 +40,7 @@ namespace algorithm deallocCudaDeviceData(y); -void SCAN::runCudaVariant(VariantID vid, size_t /*tune_idx*/) +void SCAN::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/algorithm/SCAN-Hip.cpp b/src/algorithm/SCAN-Hip.cpp index 217f60a86..57fe49bf5 100644 --- a/src/algorithm/SCAN-Hip.cpp +++ b/src/algorithm/SCAN-Hip.cpp @@ -45,7 +45,7 @@ namespace algorithm deallocHipDeviceData(y); -void SCAN::runHipVariant(VariantID vid, size_t /*tune_idx*/) +void SCAN::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/algorithm/SCAN-OMP.cpp b/src/algorithm/SCAN-OMP.cpp index 82eec6006..59cccd851 100644 --- a/src/algorithm/SCAN-OMP.cpp +++ b/src/algorithm/SCAN-OMP.cpp @@ -18,7 +18,7 @@ namespace rajaperf namespace algorithm { -void SCAN::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void SCAN::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/algorithm/SCAN-OMPTarget.cpp b/src/algorithm/SCAN-OMPTarget.cpp index 0f453cac0..67cec46bc 100644 --- a/src/algorithm/SCAN-OMPTarget.cpp +++ b/src/algorithm/SCAN-OMPTarget.cpp @@ -36,7 +36,7 @@ namespace algorithm deallocOpenMPDeviceData(y, did); -void SCAN::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void SCAN::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) \ && _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) diff --git a/src/algorithm/SCAN-Seq.cpp b/src/algorithm/SCAN-Seq.cpp index bc7769360..f1ebfd69c 100644 --- a/src/algorithm/SCAN-Seq.cpp +++ b/src/algorithm/SCAN-Seq.cpp @@ -18,7 +18,7 @@ namespace algorithm { -void SCAN::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void SCAN::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/INDEXLIST-OMP.cpp b/src/basic/INDEXLIST-OMP.cpp index a1fe5888a..e1f0fab1c 100644 --- a/src/basic/INDEXLIST-OMP.cpp +++ b/src/basic/INDEXLIST-OMP.cpp @@ -17,7 +17,7 @@ namespace rajaperf namespace basic { -void INDEXLIST::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void INDEXLIST::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/INDEXLIST-OMPTarget.cpp b/src/basic/INDEXLIST-OMPTarget.cpp index b15c46602..1ba024caf 100644 --- a/src/basic/INDEXLIST-OMPTarget.cpp +++ b/src/basic/INDEXLIST-OMPTarget.cpp @@ -38,7 +38,7 @@ namespace basic #endif -void INDEXLIST::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void INDEXLIST::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) \ && _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) diff --git a/src/basic/INDEXLIST-Seq.cpp b/src/basic/INDEXLIST-Seq.cpp index c9d9e6369..f34dd3e23 100644 --- a/src/basic/INDEXLIST-Seq.cpp +++ b/src/basic/INDEXLIST-Seq.cpp @@ -18,7 +18,7 @@ namespace basic { -void INDEXLIST::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void 
INDEXLIST::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/INDEXLIST_3LOOP-OMP.cpp b/src/basic/INDEXLIST_3LOOP-OMP.cpp index b8f223b0d..59f1a6b84 100644 --- a/src/basic/INDEXLIST_3LOOP-OMP.cpp +++ b/src/basic/INDEXLIST_3LOOP-OMP.cpp @@ -24,7 +24,7 @@ namespace basic delete[] counts; counts = nullptr; -void INDEXLIST_3LOOP::runOpenMPVariant(VariantID vid, size_t /*tune_idx*/) +void INDEXLIST_3LOOP::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/INDEXLIST_3LOOP-OMPTarget.cpp b/src/basic/INDEXLIST_3LOOP-OMPTarget.cpp index 7c6ff734a..5520ab21e 100644 --- a/src/basic/INDEXLIST_3LOOP-OMPTarget.cpp +++ b/src/basic/INDEXLIST_3LOOP-OMPTarget.cpp @@ -38,7 +38,7 @@ namespace basic deallocOpenMPDeviceData(list, did); -void INDEXLIST_3LOOP::runOpenMPTargetVariant(VariantID vid, size_t /*tune_idx*/) +void INDEXLIST_3LOOP::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) \ && _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) diff --git a/src/basic/INDEXLIST_3LOOP-Seq.cpp b/src/basic/INDEXLIST_3LOOP-Seq.cpp index f6505bf89..264597e02 100644 --- a/src/basic/INDEXLIST_3LOOP-Seq.cpp +++ b/src/basic/INDEXLIST_3LOOP-Seq.cpp @@ -25,7 +25,7 @@ namespace basic -void INDEXLIST_3LOOP::runSeqVariant(VariantID vid, size_t /*tune_idx*/) +void INDEXLIST_3LOOP::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; From 226430a4276f0409f10e05ab475e59d40d9f0234 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 4 Apr 2022 14:18:50 -0700 Subject: [PATCH 304/392] Use default_gpu_block_size in scan --- src/algorithm/SCAN-Cuda.cpp | 8 +------- src/algorithm/SCAN-Hip.cpp | 8 +------- src/algorithm/SCAN.hpp | 2 ++ 3 files changed, 4 insertions(+), 14 deletions(-) diff --git a/src/algorithm/SCAN-Cuda.cpp b/src/algorithm/SCAN-Cuda.cpp index b19e5a617..8911793c0 100644 --- a/src/algorithm/SCAN-Cuda.cpp +++ b/src/algorithm/SCAN-Cuda.cpp @@ -24,12 +24,6 @@ namespace rajaperf namespace algorithm { - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - #define SCAN_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(x, m_x, iend); \ allocAndInitCudaDeviceData(y, m_y, iend); @@ -104,7 +98,7 @@ void SCAN::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::exclusive_scan< RAJA::cuda_exec >(RAJA_SCAN_ARGS); + RAJA::exclusive_scan< RAJA::cuda_exec >(RAJA_SCAN_ARGS); } stopTimer(); diff --git a/src/algorithm/SCAN-Hip.cpp b/src/algorithm/SCAN-Hip.cpp index 57fe49bf5..a105d9dd1 100644 --- a/src/algorithm/SCAN-Hip.cpp +++ b/src/algorithm/SCAN-Hip.cpp @@ -29,12 +29,6 @@ namespace rajaperf namespace algorithm { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - #define SCAN_DATA_SETUP_HIP \ allocAndInitHipDeviceData(x, m_x, iend); \ allocAndInitHipDeviceData(y, m_y, iend); @@ -131,7 +125,7 @@ void SCAN::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::exclusive_scan< RAJA::hip_exec >(RAJA_SCAN_ARGS); + RAJA::exclusive_scan< RAJA::hip_exec >(RAJA_SCAN_ARGS); } stopTimer(); diff 
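Patch 304 above removes the file-scope block_size constants from the SCAN GPU variants and lets the kernel's default_gpu_block_size flow into the RAJA execution policy used by RAJA::exclusive_scan. For reference, the operation being timed is a plain exclusive prefix sum; a small host-only sketch of the same computation using the standard library (for illustration only, not the suite's code path):

    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main()
    {
      std::vector<double> x = {1.0, 2.0, 3.0, 4.0};
      std::vector<double> y(x.size(), 0.0);

      // y[i] = init + x[0] + ... + x[i-1]; the last input never contributes.
      std::exclusive_scan(x.begin(), x.end(), y.begin(), 0.0);

      for (double v : y) { std::printf("%g ", v); }  // prints: 0 1 3 6
      std::printf("\n");
      return 0;
    }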
--git a/src/algorithm/SCAN.hpp b/src/algorithm/SCAN.hpp index fe54e1673..edb148738 100644 --- a/src/algorithm/SCAN.hpp +++ b/src/algorithm/SCAN.hpp @@ -63,6 +63,8 @@ class SCAN : public KernelBase void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); private: + static const size_t default_gpu_block_size = 0; + Real_ptr m_x; Real_ptr m_y; }; From 810591cad73f42778d5cd937d0858b02c656f589 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 4 Apr 2022 14:22:10 -0700 Subject: [PATCH 305/392] Fixup UNUSED_ARG usage --- src/algorithm/SCAN.cpp | 4 ++-- src/basic/INDEXLIST.cpp | 4 ++-- src/basic/INDEXLIST_3LOOP.cpp | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/algorithm/SCAN.cpp b/src/algorithm/SCAN.cpp index cda919ec2..05b4df049 100644 --- a/src/algorithm/SCAN.cpp +++ b/src/algorithm/SCAN.cpp @@ -62,7 +62,7 @@ SCAN::~SCAN() { } -void SCAN::setUp(VariantID vid, size_t /*tune_idx*/) +void SCAN::setUp(VariantID vid, RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataRandValue(m_x, getActualProblemSize(), vid); allocAndInitDataConst(m_y, getActualProblemSize(), 0.0, vid); @@ -73,7 +73,7 @@ void SCAN::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_y, getActualProblemSize(), checksum_scale_factor); } -void SCAN::tearDown(VariantID vid, size_t /*tune_idx*/) +void SCAN::tearDown(VariantID vid, RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_x); diff --git a/src/basic/INDEXLIST.cpp b/src/basic/INDEXLIST.cpp index c5367487b..3d3e0de6b 100644 --- a/src/basic/INDEXLIST.cpp +++ b/src/basic/INDEXLIST.cpp @@ -55,7 +55,7 @@ INDEXLIST::~INDEXLIST() { } -void INDEXLIST::setUp(VariantID vid, size_t /*tune_idx*/) +void INDEXLIST::setUp(VariantID vid, RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataRandSign(m_x, getActualProblemSize(), vid); allocAndInitData(m_list, getActualProblemSize(), vid); @@ -68,7 +68,7 @@ void INDEXLIST::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += Checksum_type(m_len); } -void INDEXLIST::tearDown(VariantID vid, size_t /*tune_idx*/) +void INDEXLIST::tearDown(VariantID vid, RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_x); diff --git a/src/basic/INDEXLIST_3LOOP.cpp b/src/basic/INDEXLIST_3LOOP.cpp index 213c936aa..622bf2463 100644 --- a/src/basic/INDEXLIST_3LOOP.cpp +++ b/src/basic/INDEXLIST_3LOOP.cpp @@ -64,7 +64,7 @@ INDEXLIST_3LOOP::~INDEXLIST_3LOOP() { } -void INDEXLIST_3LOOP::setUp(VariantID vid, size_t tune_idx) +void INDEXLIST_3LOOP::setUp(VariantID vid, RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataRandSign(m_x, getActualProblemSize(), vid); allocAndInitData(m_list, getActualProblemSize(), vid); @@ -77,7 +77,7 @@ void INDEXLIST_3LOOP::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += Checksum_type(m_len); } -void INDEXLIST_3LOOP::tearDown(VariantID vid, size_t tune_idx) +void INDEXLIST_3LOOP::tearDown(VariantID vid, RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_x); From 446e564d4b0a5690472847e42309a9ab8e8dc8a9 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 4 Apr 2022 14:29:01 -0700 Subject: [PATCH 306/392] Avoid warnings with openmp target --- src/algorithm/SCAN-OMPTarget.cpp | 5 +++++ src/basic/INDEXLIST-OMPTarget.cpp | 2 ++ src/basic/INDEXLIST_3LOOP-OMPTarget.cpp | 5 +++++ 3 files changed, 12 insertions(+) diff --git a/src/algorithm/SCAN-OMPTarget.cpp b/src/algorithm/SCAN-OMPTarget.cpp index 67cec46bc..8d452bb12 100644 --- a/src/algorithm/SCAN-OMPTarget.cpp +++ b/src/algorithm/SCAN-OMPTarget.cpp @@ -18,6 
+18,9 @@ namespace rajaperf namespace algorithm { +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) \ + && _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) + // // Define threads per team for target execution // @@ -35,6 +38,8 @@ namespace algorithm deallocOpenMPDeviceData(x, did); \ deallocOpenMPDeviceData(y, did); +#endif + void SCAN::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { diff --git a/src/basic/INDEXLIST-OMPTarget.cpp b/src/basic/INDEXLIST-OMPTarget.cpp index 1ba024caf..7b99c1bb9 100644 --- a/src/basic/INDEXLIST-OMPTarget.cpp +++ b/src/basic/INDEXLIST-OMPTarget.cpp @@ -19,6 +19,7 @@ namespace basic #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) \ && _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) + // // Define threads per team for target execution // @@ -35,6 +36,7 @@ namespace basic getOpenMPDeviceData(m_list, list, iend, hid, did); \ deallocOpenMPDeviceData(x, did); \ deallocOpenMPDeviceData(list, did); + #endif diff --git a/src/basic/INDEXLIST_3LOOP-OMPTarget.cpp b/src/basic/INDEXLIST_3LOOP-OMPTarget.cpp index 5520ab21e..04696a50d 100644 --- a/src/basic/INDEXLIST_3LOOP-OMPTarget.cpp +++ b/src/basic/INDEXLIST_3LOOP-OMPTarget.cpp @@ -17,6 +17,9 @@ namespace rajaperf namespace basic { +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) \ + && _OPENMP >= 201811 && defined(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN) + // // Define threads per team for target execution // @@ -37,6 +40,8 @@ namespace basic deallocOpenMPDeviceData(x, did); \ deallocOpenMPDeviceData(list, did); +#endif + void INDEXLIST_3LOOP::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { From c8a537d2b1825e87478b23d965b0ed4bf7c4ee84 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 4 Apr 2022 16:29:59 -0700 Subject: [PATCH 307/392] Fix unused args in scan kernels --- src/algorithm/SCAN.cpp | 4 ++-- src/basic/INDEXLIST.cpp | 4 ++-- src/basic/INDEXLIST_3LOOP.cpp | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/algorithm/SCAN.cpp b/src/algorithm/SCAN.cpp index 05b4df049..f849d68c5 100644 --- a/src/algorithm/SCAN.cpp +++ b/src/algorithm/SCAN.cpp @@ -62,7 +62,7 @@ SCAN::~SCAN() { } -void SCAN::setUp(VariantID vid, RAJAPERF_UNUSED_ARG(tune_idx)) +void SCAN::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataRandValue(m_x, getActualProblemSize(), vid); allocAndInitDataConst(m_y, getActualProblemSize(), 0.0, vid); @@ -73,7 +73,7 @@ void SCAN::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += calcChecksum(m_y, getActualProblemSize(), checksum_scale_factor); } -void SCAN::tearDown(VariantID vid, RAJAPERF_UNUSED_ARG(tune_idx)) +void SCAN::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_x); diff --git a/src/basic/INDEXLIST.cpp b/src/basic/INDEXLIST.cpp index 3d3e0de6b..de8bb73fd 100644 --- a/src/basic/INDEXLIST.cpp +++ b/src/basic/INDEXLIST.cpp @@ -55,7 +55,7 @@ INDEXLIST::~INDEXLIST() { } -void INDEXLIST::setUp(VariantID vid, RAJAPERF_UNUSED_ARG(tune_idx)) +void INDEXLIST::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataRandSign(m_x, getActualProblemSize(), vid); allocAndInitData(m_list, getActualProblemSize(), vid); @@ -68,7 +68,7 @@ void INDEXLIST::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += Checksum_type(m_len); } -void INDEXLIST::tearDown(VariantID vid, RAJAPERF_UNUSED_ARG(tune_idx)) +void INDEXLIST::tearDown(VariantID vid, size_t 
RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_x); diff --git a/src/basic/INDEXLIST_3LOOP.cpp b/src/basic/INDEXLIST_3LOOP.cpp index 622bf2463..288dfc54e 100644 --- a/src/basic/INDEXLIST_3LOOP.cpp +++ b/src/basic/INDEXLIST_3LOOP.cpp @@ -64,7 +64,7 @@ INDEXLIST_3LOOP::~INDEXLIST_3LOOP() { } -void INDEXLIST_3LOOP::setUp(VariantID vid, RAJAPERF_UNUSED_ARG(tune_idx)) +void INDEXLIST_3LOOP::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataRandSign(m_x, getActualProblemSize(), vid); allocAndInitData(m_list, getActualProblemSize(), vid); @@ -77,7 +77,7 @@ void INDEXLIST_3LOOP::updateChecksum(VariantID vid, size_t tune_idx) checksum[vid][tune_idx] += Checksum_type(m_len); } -void INDEXLIST_3LOOP::tearDown(VariantID vid, RAJAPERF_UNUSED_ARG(tune_idx)) +void INDEXLIST_3LOOP::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_x); From 717ca2f93393155651dbd892aa0e9ceefcce22b6 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 4 Apr 2022 16:34:17 -0700 Subject: [PATCH 308/392] Fix warnings in INDEXLIST_3LOOP --- src/basic/INDEXLIST_3LOOP-Cuda.cpp | 3 +-- src/basic/INDEXLIST_3LOOP-Hip.cpp | 3 +-- src/basic/INDEXLIST_3LOOP-OMP.cpp | 2 ++ 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/basic/INDEXLIST_3LOOP-Cuda.cpp b/src/basic/INDEXLIST_3LOOP-Cuda.cpp index 40b6db128..142209727 100644 --- a/src/basic/INDEXLIST_3LOOP-Cuda.cpp +++ b/src/basic/INDEXLIST_3LOOP-Cuda.cpp @@ -37,7 +37,6 @@ namespace basic template < size_t block_size > __launch_bounds__(block_size) __global__ void indexlist_conditional(Real_ptr x, - Int_ptr list, Index_type* counts, Index_type iend) { @@ -105,7 +104,7 @@ void INDEXLIST_3LOOP::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); indexlist_conditional<<>>( - x, list, counts, iend ); + x, counts, iend ); cudaErrchk( cudaGetLastError() ); cudaErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, diff --git a/src/basic/INDEXLIST_3LOOP-Hip.cpp b/src/basic/INDEXLIST_3LOOP-Hip.cpp index 42e3211fd..3fd4e8f37 100644 --- a/src/basic/INDEXLIST_3LOOP-Hip.cpp +++ b/src/basic/INDEXLIST_3LOOP-Hip.cpp @@ -37,7 +37,6 @@ namespace basic template < size_t block_size > __launch_bounds__(block_size) __global__ void indexlist_conditional(Real_ptr x, - Int_ptr list, Index_type* counts, Index_type iend) { @@ -116,7 +115,7 @@ void INDEXLIST_3LOOP::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); hipLaunchKernelGGL((indexlist_conditional), grid_size, block_size, 0, stream, - x, list, counts, iend ); + x, counts, iend ); hipErrchk( hipGetLastError() ); #if defined(__HIPCC__) diff --git a/src/basic/INDEXLIST_3LOOP-OMP.cpp b/src/basic/INDEXLIST_3LOOP-OMP.cpp index 59f1a6b84..75ad07465 100644 --- a/src/basic/INDEXLIST_3LOOP-OMP.cpp +++ b/src/basic/INDEXLIST_3LOOP-OMP.cpp @@ -239,6 +239,8 @@ void INDEXLIST_3LOOP::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG } +#else + RAJA_UNUSED_VAR(vid); #endif } From 89d3f9b12bdb1718019fbb1a1d5f2ebd55264c1a Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Mon, 4 Apr 2022 22:20:13 -0500 Subject: [PATCH 309/392] adding blocksize tuning infrastructure to REDUCE_STRUCT test --- src/basic/REDUCE_STRUCT-Cuda.cpp | 18 +++++++----------- src/basic/REDUCE_STRUCT-Hip.cpp | 15 ++++++--------- src/basic/REDUCE_STRUCT.cpp | 18 +++++++++--------- src/basic/REDUCE_STRUCT.hpp | 19 +++++++++++++------ 4 files changed, 35 insertions(+), 35 deletions(-) diff --git 
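Patch 308 above fixes two kinds of unused-entity warnings: the device kernel drops an Int_ptr list parameter it never reads, and the OpenMP variant marks vid as used when OpenMP is disabled and the whole body compiles away. The standard trick for the latter is a cast to void behind a macro; a small sketch of the idiom (the helper name is illustrative; RAJA ships its own RAJA_UNUSED_VAR for this purpose):

    #include <cstdio>

    // Illustrative helper with the usual void-cast definition.
    #define MARK_UNUSED(x) do { (void)(x); } while (0)

    void runOpenMPVariant(int vid)
    {
    #if defined(RUN_OPENMP)
      std::printf("running OpenMP variant %d\n", vid);   // vid genuinely used
    #else
      // Body compiled out: touch the argument so -Wunused-parameter stays quiet.
      MARK_UNUSED(vid);
    #endif
    }

    int main()
    {
      runOpenMPVariant(0);
      return 0;
    }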
a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index acb2b302d..a460c4a0c 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -21,12 +21,7 @@ namespace rajaperf namespace basic { - // - // Define thread block size for Cuda execution - // - const size_t block_size = 256; - - + #define REDUCE_STRUCT_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(particles.x, m_x, particles.N); \ allocAndInitCudaDeviceData(particles.y, m_y, particles.N); \ @@ -35,7 +30,8 @@ namespace basic #define REDUCE_STRUCT_DATA_TEARDOWN_CUDA \ deallocCudaDeviceData(particles.x); \ deallocCudaDeviceData(particles.y); \ - +template < size_t block_size > +__launch_bounds__(block_size) __global__ void reduce_struct(Real_ptr x, Real_ptr y, Real_ptr xsum, Real_ptr xmin, Real_ptr xmax, Real_ptr ysum, Real_ptr ymin, Real_ptr ymax, @@ -104,7 +100,7 @@ __global__ void reduce_struct(Real_ptr x, Real_ptr y, } } - +template < size_t block_size > void REDUCE_STRUCT::runCudaVariant(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -123,7 +119,7 @@ void REDUCE_STRUCT::runCudaVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { cudaErrchk(cudaMemsetAsync(mem, 0.0, 6*sizeof(Real_type))); - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(particles.N, block_size); + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); reduce_struct<<>> (particles.x, particles.y, @@ -175,10 +171,10 @@ void REDUCE_STRUCT::runCudaVariant(VariantID vid) REDUCE_STRUCT_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n REDUCE_STRUCT : Unknown CUDA variant id = " << vid << std::endl; + getCout() << "\n REDUCE_STRUCT : Unknown CUDA variant id = " << vid << std::endl; } } - +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(REDUCE_STRUCT, Cuda) } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index 15d890b1b..8afb201ff 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -21,11 +21,6 @@ namespace rajaperf namespace basic { - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - #define REDUCE_STRUCT_DATA_SETUP_HIP \ allocAndInitHipDeviceData(particles.x, m_x, particles.N); \ @@ -34,7 +29,8 @@ namespace basic #define REDUCE_STRUCT_DATA_TEARDOWN_HIP \ deallocHipDeviceData(particles.x); \ deallocHipDeviceData(particles.y); \ - +template < size_t block_size > +__launch_bounds__(block_size) __global__ void reduce_struct(Real_ptr x, Real_ptr y, Real_ptr xsum, Real_ptr xmin, Real_ptr xmax, Real_ptr ysum, Real_ptr ymin, Real_ptr ymax, @@ -104,6 +100,7 @@ __global__ void reduce_struct(Real_ptr x, Real_ptr y, } +template < size_t block_size > void REDUCE_STRUCT::runHipVariant(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -123,7 +120,7 @@ void REDUCE_STRUCT::runHipVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { hipErrchk(hipMemsetAsync(mem, 0.0, 6*sizeof(Real_type))); - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(particles.N, block_size); + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); hipLaunchKernelGGL((reduce_struct), dim3(grid_size), dim3(block_size), 6*sizeof(Real_type)*block_size, 0, particles.x, particles.y, mem, mem+1,mem+2, //xcenter,xmin,xmax @@ -176,10 +173,10 @@ void REDUCE_STRUCT::runHipVariant(VariantID vid) REDUCE_STRUCT_DATA_TEARDOWN_HIP; } else { - std::cout << "\n REDUCE_STRUCT : Unknown Hip variant id = " << vid << 
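The REDUCE_STRUCT change above replaces the file-scope block-size constant with a template parameter on both the kernel and the variant and adds __launch_bounds__, so the compiler knows the exact block size when allocating registers, and the grid size is computed from the loop extent rather than a fixed constant. A stripped-down CUDA sketch of that shape, with the body reduced to a trivial copy so only the structure remains (an illustrative example, not the suite's kernel):

    #include <cstdio>
    #include <cuda_runtime.h>

    // Block size is a compile-time constant, so __launch_bounds__ can use it too.
    template < size_t block_size >
    __launch_bounds__(block_size)
    __global__ void copy_kernel(const double* x, double* y, size_t n)
    {
      size_t i = blockIdx.x * block_size + threadIdx.x;
      if (i < n) { y[i] = x[i]; }
    }

    template < size_t block_size >
    void run_variant(const double* x, double* y, size_t n)
    {
      const unsigned int grid_size =
          static_cast<unsigned int>((n + block_size - 1) / block_size);
      copy_kernel<block_size><<<grid_size, block_size>>>(x, y, n);
      cudaDeviceSynchronize();
    }

    int main()
    {
      const size_t n = 1 << 20;
      double *x, *y;
      cudaMalloc(&x, n * sizeof(double));
      cudaMalloc(&y, n * sizeof(double));
      run_variant<256>(x, y, n);   // one instantiation per tuned block size
      std::printf("done\n");
      cudaFree(x);
      cudaFree(y);
      return 0;
    }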
std::endl; + getCout() << "\n REDUCE_STRUCT : Unknown Hip variant id = " << vid << std::endl; } } - + RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(REDUCE_STRUCT, Hip) } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE_STRUCT.cpp b/src/basic/REDUCE_STRUCT.cpp index f38458c92..b4c3906e5 100644 --- a/src/basic/REDUCE_STRUCT.cpp +++ b/src/basic/REDUCE_STRUCT.cpp @@ -62,7 +62,7 @@ REDUCE_STRUCT::~REDUCE_STRUCT() { } -void REDUCE_STRUCT::setUp(VariantID vid) +void REDUCE_STRUCT::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitData(m_x, getActualProblemSize(), vid); allocAndInitData(m_y, getActualProblemSize(), vid); @@ -74,19 +74,19 @@ void REDUCE_STRUCT::setUp(VariantID vid) } } -void REDUCE_STRUCT::updateChecksum(VariantID vid) +void REDUCE_STRUCT::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid] += m_particles.GetCenter()[0]; - checksum[vid] += m_particles.GetXMin(); - checksum[vid] += m_particles.GetXMax(); - checksum[vid] += m_particles.GetCenter()[1]; - checksum[vid] += m_particles.GetYMin(); - checksum[vid] += m_particles.GetYMax(); + checksum[vid][tune_idx] += m_particles.GetCenter()[0]; + checksum[vid][tune_idx] += m_particles.GetXMin(); + checksum[vid][tune_idx] += m_particles.GetXMax(); + checksum[vid][tune_idx] += m_particles.GetCenter()[1]; + checksum[vid][tune_idx] += m_particles.GetYMin(); + checksum[vid][tune_idx] += m_particles.GetYMax(); return; } -void REDUCE_STRUCT::tearDown(VariantID vid) +void REDUCE_STRUCT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; deallocData(m_x); diff --git a/src/basic/REDUCE_STRUCT.hpp b/src/basic/REDUCE_STRUCT.hpp index df065bf36..d230e8e2c 100644 --- a/src/basic/REDUCE_STRUCT.hpp +++ b/src/basic/REDUCE_STRUCT.hpp @@ -76,18 +76,25 @@ class REDUCE_STRUCT : public KernelBase ~REDUCE_STRUCT(); - void setUp(VariantID vid); - void updateChecksum(VariantID vid); - void tearDown(VariantID vid); + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid), size_t tune_idx; - void runSeqVariant(VariantID vid); - void runOpenMPVariant(VariantID vid); - void runCudaVariant(VariantID vid); + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + + template < size_t block_size > void runHipVariant(VariantID vid); + template < size_t block_size > void runOpenMPTargetVariant(VariantID vid); private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; struct particles_t{ Int_type N; Real_ptr x, y; From e3251da63e7b2d1027cf490e21c911a3697e91e7 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Tue, 5 Apr 2022 09:32:54 -0700 Subject: [PATCH 310/392] commit update Raja to avoid clang+cuda warning fix --- tpl/RAJA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpl/RAJA b/tpl/RAJA index bd6f2fbe8..dd9395777 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit bd6f2fbe8d94a8893d98ee0602c9e154969ed1f2 +Subproject commit dd93957771d34a8bfad09db5e1a856d40f0c7fcf From 1331bf4d841d996be3008c3ab8452b6ae16687d8 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 5 Apr 2022 12:26:15 -0700 Subject: [PATCH 311/392] Add REDUCE_SUM kernel This gives a single definitive reduction sum kernel. 
We can then compare the platform specific libraries like cub/rocprim, std library implementations, the perf suite redution pattern, and RAJA reducers. It also acts as an example of how to implement more kernel tunings than the gpu block size tunings. --- src/CMakeLists.txt | 3 + src/algorithm/CMakeLists.txt | 6 + src/algorithm/REDUCE_SUM-Cuda.cpp | 167 +++++++++++++++++++++ src/algorithm/REDUCE_SUM-Hip.cpp | 199 +++++++++++++++++++++++++ src/algorithm/REDUCE_SUM-OMP.cpp | 110 ++++++++++++++ src/algorithm/REDUCE_SUM-OMPTarget.cpp | 101 +++++++++++++ src/algorithm/REDUCE_SUM-Seq.cpp | 104 +++++++++++++ src/algorithm/REDUCE_SUM.cpp | 79 ++++++++++ src/algorithm/REDUCE_SUM.hpp | 81 ++++++++++ src/common/RAJAPerfSuite.cpp | 6 + src/common/RAJAPerfSuite.hpp | 1 + 11 files changed, 857 insertions(+) create mode 100644 src/algorithm/REDUCE_SUM-Cuda.cpp create mode 100644 src/algorithm/REDUCE_SUM-Hip.cpp create mode 100644 src/algorithm/REDUCE_SUM-OMP.cpp create mode 100644 src/algorithm/REDUCE_SUM-OMPTarget.cpp create mode 100644 src/algorithm/REDUCE_SUM-Seq.cpp create mode 100644 src/algorithm/REDUCE_SUM.cpp create mode 100644 src/algorithm/REDUCE_SUM.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 25306e9fe..941da75fa 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -207,6 +207,9 @@ blt_add_executable( algorithm/SORT-Seq.cpp algorithm/SORTPAIRS.cpp algorithm/SORTPAIRS-Seq.cpp + algorithm/REDUCE_SUM.cpp + algorithm/REDUCE_SUM-Seq.cpp + algorithm/REDUCE_SUM-OMPTarget.cpp DEPENDS_ON ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/algorithm/CMakeLists.txt b/src/algorithm/CMakeLists.txt index 07848730c..441e9c0ea 100644 --- a/src/algorithm/CMakeLists.txt +++ b/src/algorithm/CMakeLists.txt @@ -18,5 +18,11 @@ blt_add_library( SORTPAIRS-Hip.cpp SORTPAIRS-Cuda.cpp SORTPAIRS-OMP.cpp + REDUCE_SUM.cpp + REDUCE_SUM-Seq.cpp + REDUCE_SUM-Hip.cpp + REDUCE_SUM-Cuda.cpp + REDUCE_SUM-OMP.cpp + REDUCE_SUM-OMPTarget.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/algorithm/REDUCE_SUM-Cuda.cpp b/src/algorithm/REDUCE_SUM-Cuda.cpp new file mode 100644 index 000000000..ac1758814 --- /dev/null +++ b/src/algorithm/REDUCE_SUM-Cuda.cpp @@ -0,0 +1,167 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "REDUCE_SUM.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include "cub/device/device_reduce.cuh" +#include "cub/util_allocator.cuh" + +#include + +namespace rajaperf +{ +namespace algorithm +{ + +#define REDUCE_SUM_DATA_SETUP_CUDA \ + allocAndInitCudaDeviceData(x, m_x, iend); + +#define REDUCE_SUM_DATA_TEARDOWN_CUDA \ + deallocCudaDeviceData(x); + + +template < size_t block_size > +void REDUCE_SUM::runCudaVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + REDUCE_SUM_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + + REDUCE_SUM_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum sum(m_sum_init); + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + REDUCE_SUM_BODY; + }); + + m_sum = sum.get(); + + } + stopTimer(); + + REDUCE_SUM_DATA_TEARDOWN_CUDA; + + } else { + getCout() << "\n REDUCE_SUM : Unknown Cuda variant id = " << vid << std::endl; + } +} + +void REDUCE_SUM::runCudaVariant(VariantID vid, size_t tune_idx) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + REDUCE_SUM_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + REDUCE_SUM_DATA_SETUP_CUDA; + + cudaStream_t stream = 0; + + int len = iend - ibegin; + + Real_type* sum_storage; + allocCudaPinnedData(sum_storage, 1); + + // Determine temporary device storage requirements + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cudaErrchk(::cub::DeviceReduce::Reduce(d_temp_storage, + temp_storage_bytes, + x+ibegin, + sum_storage, + len, + ::cub::Sum(), + m_sum_init, + stream)); + + // Allocate temporary storage + unsigned char* temp_storage; + allocCudaDeviceData(temp_storage, temp_storage_bytes); + d_temp_storage = temp_storage; + + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + // Run + cudaErrchk(::cub::DeviceReduce::Reduce(d_temp_storage, + temp_storage_bytes, + x+ibegin, + sum_storage, + len, + ::cub::Sum(), + m_sum_init, + stream)); + + cudaErrchk(cudaStreamSynchronize(stream)); + m_sum = *sum_storage; + + } + stopTimer(); + + // Free temporary storage + deallocCudaDeviceData(temp_storage); + deallocCudaPinnedData(sum_storage); + + REDUCE_SUM_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tune_idx == t) { + runCudaVariantImpl(vid); + } + t += 1; + } + }); + + } else { + getCout() << "\n REDUCE_SUM : Unknown Cuda variant id = " << vid << std::endl; + } + +} + +void REDUCE_SUM::setCudaTuningDefinitions(VariantID vid) +{ + if ( vid == Base_CUDA ) { + addVariantTuningName(vid, "cub"); + } else if ( vid == RAJA_CUDA ) { + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); + } +} + +} // end namespace algorithm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/algorithm/REDUCE_SUM-Hip.cpp b/src/algorithm/REDUCE_SUM-Hip.cpp 
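The Base_CUDA variant of REDUCE_SUM above uses CUB's standard two-call protocol: the first call to cub::DeviceReduce::Reduce passes a null temporary buffer so CUB only reports how many scratch bytes it needs, the caller allocates that much device memory, and a second call with identical arguments performs the reduction. A compact self-contained sketch of the same protocol using cub::DeviceReduce::Sum (input initialization and error checking omitted for brevity):

    #include <cstdio>
    #include <cuda_runtime.h>
    #include <cub/device/device_reduce.cuh>

    int main()
    {
      const int n = 1 << 20;
      double *d_in, *d_out;
      cudaMalloc(&d_in, n * sizeof(double));
      cudaMalloc(&d_out, sizeof(double));
      // (a real benchmark would initialize d_in here)

      // Call 1: d_temp_storage == nullptr, so CUB only reports the bytes needed.
      void*  d_temp_storage = nullptr;
      size_t temp_storage_bytes = 0;
      cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, n);

      cudaMalloc(&d_temp_storage, temp_storage_bytes);

      // Call 2: same arguments, now with real scratch space; this one reduces.
      cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, n);

      double sum = 0.0;
      cudaMemcpy(&sum, d_out, sizeof(double), cudaMemcpyDeviceToHost);
      std::printf("sum = %f\n", sum);

      cudaFree(d_temp_storage);
      cudaFree(d_in);
      cudaFree(d_out);
      return 0;
    }

Keeping the result in pinned host memory, as the patch does, lets the benchmark read the sum immediately after a stream synchronize without an extra device-to-host allocation each repetition.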
new file mode 100644 index 000000000..83c205c1c --- /dev/null +++ b/src/algorithm/REDUCE_SUM-Hip.cpp @@ -0,0 +1,199 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "REDUCE_SUM.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#if defined(__HIPCC__) +#define ROCPRIM_HIP_API 1 +#include "rocprim/device/device_reduce.hpp" +#elif defined(__CUDACC__) +#include "cub/device/device_reduce.cuh" +#include "cub/util_allocator.cuh" +#endif + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace algorithm +{ + +#define REDUCE_SUM_DATA_SETUP_HIP \ + allocAndInitHipDeviceData(x, m_x, iend); + +#define REDUCE_SUM_DATA_TEARDOWN_HIP \ + deallocHipDeviceData(x); + + +template < size_t block_size > +void REDUCE_SUM::runHipVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + REDUCE_SUM_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + REDUCE_SUM_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum sum(m_sum_init); + + RAJA::forall< RAJA::hip_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + REDUCE_SUM_BODY; + }); + + m_sum = sum.get(); + + } + stopTimer(); + + REDUCE_SUM_DATA_TEARDOWN_HIP; + + } else { + getCout() << "\n REDUCE_SUM : Unknown Hip variant id = " << vid << std::endl; + } +} + +void REDUCE_SUM::runHipVariant(VariantID vid, size_t tune_idx) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + REDUCE_SUM_DATA_SETUP; + + if ( vid == Base_HIP ) { + + REDUCE_SUM_DATA_SETUP_HIP; + + hipStream_t stream = 0; + + int len = iend - ibegin; + + Real_type* sum_storage; + allocHipPinnedData(sum_storage, 1); + + // Determine temporary device storage requirements + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; +#if defined(__HIPCC__) + hipErrchk(::rocprim::reduce(d_temp_storage, + temp_storage_bytes, + x+ibegin, + sum_storage, + m_sum_init, + len, + rocprim::plus(), + stream)); +#elif defined(__CUDACC__) + hipErrchk(::cub::DeviceReduce::Reduce(d_temp_storage, + temp_storage_bytes, + x+ibegin, + sum_storage, + len, + ::cub::Sum(), + m_sum_init, + stream)); +#endif + + // Allocate temporary storage + unsigned char* temp_storage; + allocHipDeviceData(temp_storage, temp_storage_bytes); + d_temp_storage = temp_storage; + + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + // Run +#if defined(__HIPCC__) + hipErrchk(::rocprim::reduce(d_temp_storage, + temp_storage_bytes, + x+ibegin, + sum_storage, + m_sum_init, + len, + rocprim::plus(), + stream)); +#elif defined(__CUDACC__) + hipErrchk(::cub::DeviceReduce::Reduce(d_temp_storage, + temp_storage_bytes, + x+ibegin, + sum_storage, + len, + ::cub::Sum(), + m_sum_init, + stream)); +#endif + + hipErrchk(hipStreamSynchronize(stream)); + m_sum = *sum_storage; + + } + stopTimer(); + + // Free temporary storage + deallocHipDeviceData(temp_storage); + deallocHipPinnedData(sum_storage); + + REDUCE_SUM_DATA_TEARDOWN_HIP; + + + } else if ( vid == RAJA_HIP ) { + + size_t t = 0; + 
seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tune_idx == t) { + runHipVariantImpl(vid); + } + t += 1; + } + }); + + } else { + getCout() << "\n REDUCE_SUM : Unknown Hip variant id = " << vid << std::endl; + } + +} + +void REDUCE_SUM::setHipTuningDefinitions(VariantID vid) +{ + if ( vid == Base_HIP ) { +#if defined(__HIPCC__) + addVariantTuningName(vid, "rocprim"); +#elif defined(__CUDACC__) + addVariantTuningName(vid, "cub"); +#endif + } else if ( vid == RAJA_HIP ) { + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); + } +} + +} // end namespace algorithm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/algorithm/REDUCE_SUM-OMP.cpp b/src/algorithm/REDUCE_SUM-OMP.cpp new file mode 100644 index 000000000..8f80b5633 --- /dev/null +++ b/src/algorithm/REDUCE_SUM-OMP.cpp @@ -0,0 +1,110 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "REDUCE_SUM.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace algorithm +{ + + +void REDUCE_SUM::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + REDUCE_SUM_DATA_SETUP; + + switch ( vid ) { + + case Base_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type sum = m_sum_init; + + #pragma omp parallel for reduction(+:sum) + for (Index_type i = ibegin; i < iend; ++i ) { + REDUCE_SUM_BODY; + } + + m_sum = sum; + + } + stopTimer(); + + break; + } + + case Lambda_OpenMP : { + + auto sumreduce_base_lam = [=](Index_type i) { + return x[i]; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type sum = m_sum_init; + + #pragma omp parallel for reduction(+:sum) + for (Index_type i = ibegin; i < iend; ++i ) { + sum += sumreduce_base_lam(i); + } + + m_sum = sum; + + } + stopTimer(); + + break; + } + + case RAJA_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum sum(m_sum_init); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + REDUCE_SUM_BODY; + }); + + m_sum = sum.get(); + + } + stopTimer(); + + break; + } + + default : { + getCout() << "\n REDUCE_SUM : Unknown variant id = " << vid << std::endl; + } + + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace algorithm +} // end namespace rajaperf diff --git a/src/algorithm/REDUCE_SUM-OMPTarget.cpp b/src/algorithm/REDUCE_SUM-OMPTarget.cpp new file mode 100644 index 000000000..b3bf8ac05 --- /dev/null +++ b/src/algorithm/REDUCE_SUM-OMPTarget.cpp @@ -0,0 +1,101 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. 
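The Base_OpenMP variant above leans entirely on the reduction clause: every thread accumulates into a private copy of sum, and OpenMP combines the private copies into the shared variable when the loop ends. A self-contained sketch of that pattern (it compiles and gives the same answer with or without OpenMP enabled):

    #include <cstdio>
    #include <vector>

    int main()
    {
      const long n = 1000000;
      std::vector<double> x(n, 1.0);

      double sum = 0.0;   // plays the role of m_sum_init

      // Each thread gets a private "sum" initialized to 0 for the + operator;
      // the private copies are added into the shared variable at the end.
      #pragma omp parallel for reduction(+:sum)
      for (long i = 0; i < n; ++i) {
        sum += x[i];
      }

      std::printf("sum = %f\n", sum);   // 1000000 either way
      return 0;
    }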
+// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "REDUCE_SUM.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace algorithm +{ + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + +#define REDUCE_SUM_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + allocAndInitOpenMPDeviceData(x, m_x, iend, did, hid); + +#define REDUCE_SUM_DATA_TEARDOWN_OMP_TARGET \ + deallocOpenMPDeviceData(x, did); \ + + +void REDUCE_SUM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + REDUCE_SUM_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + REDUCE_SUM_DATA_SETUP_OMP_TARGET + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type sum = m_sum_init; + + #pragma omp target is_device_ptr(x) device( did ) map(tofrom:sum) + #pragma omp teams distribute parallel for reduction(+:sum) \ + thread_limit(threads_per_team) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + REDUCE_SUM_BODY; + } + + m_sum = sum; + + } + stopTimer(); + + REDUCE_SUM_DATA_TEARDOWN_OMP_TARGET + + } else if ( vid == RAJA_OpenMPTarget ) { + + REDUCE_SUM_DATA_SETUP_OMP_TARGET + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum sum(m_sum_init); + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + REDUCE_SUM_BODY; + }); + + m_sum = sum.get(); + + } + stopTimer(); + + REDUCE_SUM_DATA_TEARDOWN_OMP_TARGET + + } else { + getCout() << "\n REDUCE_SUM : Unknown OMP Target variant id = " << vid << std::endl; + } + +} + +} // end namespace algorithm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/algorithm/REDUCE_SUM-Seq.cpp b/src/algorithm/REDUCE_SUM-Seq.cpp new file mode 100644 index 000000000..d4fc7cddf --- /dev/null +++ b/src/algorithm/REDUCE_SUM-Seq.cpp @@ -0,0 +1,104 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
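The Base_OpenMPTarget variant above performs the same reduction on an offload device: the data is made visible to the device (the patch uses is_device_ptr on pointers it has already mapped), the loop is spread across teams and threads, and reduction(+:sum) folds the partial sums back through the tofrom mapping. A minimal sketch using explicit map clauses instead of the suite's device-data helpers (an assumption made only to keep the example self-contained; it falls back to the host if no device is available):

    #include <cstdio>

    int main()
    {
      const long n = 1000000;
      static double x[1000000];
      for (long i = 0; i < n; ++i) { x[i] = 1.0; }

      double sum = 0.0;

      // map(to:) copies x to the device; map(tofrom:sum) brings the result back.
      #pragma omp target teams distribute parallel for reduction(+:sum) \
                  map(to: x[0:n]) map(tofrom: sum)
      for (long i = 0; i < n; ++i) {
        sum += x[i];
      }

      std::printf("sum = %f\n", sum);
      return 0;
    }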
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "REDUCE_SUM.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace algorithm +{ + + +void REDUCE_SUM::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + REDUCE_SUM_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type sum = m_sum_init; + + for (Index_type i = ibegin; i < iend; ++i ) { + REDUCE_SUM_BODY; + } + + m_sum = sum; + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + auto reduce_sum_base_lam = [=](Index_type i) { + return x[i]; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type sum = m_sum_init; + + for (Index_type i = ibegin; i < iend; ++i ) { + sum += reduce_sum_base_lam(i); + } + + m_sum = sum; + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum sum(m_sum_init); + + RAJA::forall( RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + REDUCE_SUM_BODY; + }); + + m_sum = sum.get(); + + } + stopTimer(); + + break; + } +#endif + + default : { + getCout() << "\n REDUCE_SUM : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace algorithm +} // end namespace rajaperf diff --git a/src/algorithm/REDUCE_SUM.cpp b/src/algorithm/REDUCE_SUM.cpp new file mode 100644 index 000000000..f85b982f6 --- /dev/null +++ b/src/algorithm/REDUCE_SUM.cpp @@ -0,0 +1,79 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "REDUCE_SUM.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +namespace rajaperf +{ +namespace algorithm +{ + + +REDUCE_SUM::REDUCE_SUM(const RunParams& params) + : KernelBase(rajaperf::Algorithm_REDUCE_SUM, params) +{ + setDefaultProblemSize(1000000); + setDefaultReps(50); + + setActualProblemSize( getTargetProblemSize() ); + + setItsPerRep( getActualProblemSize() ); + setKernelsPerRep(1); + setBytesPerRep( (1*sizeof(Real_type) + 0*sizeof(Real_type)) + + (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() ); + setFLOPsPerRep(getActualProblemSize()); + + setUsesFeature(Forall); + setUsesFeature(Reduction); + + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); + + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); + setVariantDefined( RAJA_OpenMP ); + + setVariantDefined( Base_OpenMPTarget ); + setVariantDefined( RAJA_OpenMPTarget ); + + setVariantDefined( Base_CUDA ); + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( Base_HIP ); + setVariantDefined( RAJA_HIP ); +} + +REDUCE_SUM::~REDUCE_SUM() +{ +} + +void REDUCE_SUM::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + allocAndInitData(m_x, getActualProblemSize(), vid); + m_sum_init = 0.0; + m_sum = 0.0; +} + +void REDUCE_SUM::updateChecksum(VariantID vid, size_t tune_idx) +{ + checksum[vid].at(tune_idx) += calcChecksum(&m_sum, 1); +} + +void REDUCE_SUM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + (void) vid; + deallocData(m_x); +} + +} // end namespace algorithm +} // end namespace rajaperf diff --git a/src/algorithm/REDUCE_SUM.hpp b/src/algorithm/REDUCE_SUM.hpp new file mode 100644 index 000000000..8d395201e --- /dev/null +++ b/src/algorithm/REDUCE_SUM.hpp @@ -0,0 +1,81 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
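The byte and FLOP counts set in the REDUCE_SUM constructor above follow the suite's accounting convention of bytes written plus bytes read: the kernel writes the single scalar result and reads each of the N input elements once, so bytes per rep is sizeof(Real_type) + N*sizeof(Real_type) and FLOPs per rep is N additions. With the default problem size of 1,000,000 elements, and assuming Real_type is an 8-byte double, that is roughly 8 MB of memory traffic against only a million additions per repetition, so the kernel is expected to be memory-bandwidth bound rather than compute bound.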
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// REDUCE_SUM kernel reference implementation: +/// +/// Real_type sum = std::reduce(x+ibegin, x+iend); +/// // or +/// Real_type sum = std::accumulate(x+ibegin, x+iend, 0.0); +/// // or +/// Real_type sum = 0.0; +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// sum += x[i] ; +/// } +/// + +#ifndef RAJAPerf_Algorithm_REDUCE_SUM_HPP +#define RAJAPerf_Algorithm_REDUCE_SUM_HPP + +#define REDUCE_SUM_DATA_SETUP \ + Real_ptr x = m_x; + +#define REDUCE_SUM_STD_ARGS \ + x + ibegin, x + iend + +#define REDUCE_SUM_BODY \ + sum += x[i]; + + +#include "common/KernelBase.hpp" + +namespace rajaperf +{ +class RunParams; + +namespace algorithm +{ + +class REDUCE_SUM : public KernelBase +{ +public: + + REDUCE_SUM(const RunParams& params); + + ~REDUCE_SUM(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + +private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + + Real_ptr m_x; + Real_type m_sum_init; + Real_type m_sum; +}; + +} // end namespace algorithm +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 7d1f53eaf..1e8a0112b 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -94,6 +94,7 @@ // #include "algorithm/SORT.hpp" #include "algorithm/SORTPAIRS.hpp" +#include "algorithm/REDUCE_SUM.hpp" #include @@ -222,6 +223,7 @@ static const std::string KernelNames [] = // std::string("Algorithm_SORT"), std::string("Algorithm_SORTPAIRS"), + std::string("Algorithm_REDUCE_SUM"), std::string("Unknown Kernel") // Keep this at the end and DO NOT remove.... @@ -741,6 +743,10 @@ KernelBase* getKernelObject(KernelID kid, kernel = new algorithm::SORTPAIRS(run_params); break; } + case Algorithm_REDUCE_SUM: { + kernel = new algorithm::REDUCE_SUM(run_params); + break; + } default: { getCout() << "\n Unknown Kernel ID = " << kid << std::endl; diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index fb0bd1765..71a250406 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -147,6 +147,7 @@ enum KernelID { // Algorithm_SORT, Algorithm_SORTPAIRS, + Algorithm_REDUCE_SUM, NumKernels // Keep this one last and NEVER comment out (!!) 
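Registering the new kernel above touches the three places the suite keeps in step: the KernelID enum, the parallel table of kernel-name strings, and the factory switch in getKernelObject. The shape of that pattern, reduced to a free-standing sketch (the types and class bodies here are illustrative placeholders, not the suite's KernelBase):

    #include <iostream>
    #include <memory>
    #include <string>

    struct KernelBase { virtual ~KernelBase() = default; virtual std::string name() const = 0; };
    struct SORT       : KernelBase { std::string name() const override { return "Algorithm_SORT"; } };
    struct REDUCE_SUM : KernelBase { std::string name() const override { return "Algorithm_REDUCE_SUM"; } };

    // 1) the enum, 2) the name table, and 3) the factory must stay in step.
    enum KernelID { Algorithm_SORT, Algorithm_REDUCE_SUM, NumKernels };

    static const std::string KernelNames[] = { "Algorithm_SORT", "Algorithm_REDUCE_SUM" };

    std::unique_ptr<KernelBase> getKernelObject(KernelID kid)
    {
      switch (kid) {
        case Algorithm_SORT:       return std::make_unique<SORT>();
        case Algorithm_REDUCE_SUM: return std::make_unique<REDUCE_SUM>();
        default:                   return nullptr;
      }
    }

    int main()
    {
      auto kernel = getKernelObject(Algorithm_REDUCE_SUM);
      std::cout << KernelNames[Algorithm_REDUCE_SUM] << " -> " << kernel->name() << "\n";
      return 0;
    }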
From 3756f1ce2019cfa7c6ffa3df61e867c9f5122bc7 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 5 Apr 2022 12:26:34 -0700 Subject: [PATCH 312/392] Improve speed_size script documentation --- scripts/sweep_size.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/sweep_size.sh b/scripts/sweep_size.sh index 770b4539d..65c3a94e5 100755 --- a/scripts/sweep_size.sh +++ b/scripts/sweep_size.sh @@ -8,18 +8,18 @@ SIZE_RATIO=2 ################################################################################ # # Usage: -# srun -n1 --exclusive sweep.sh -x raja-perf.exe [-- raja perf args] +# srun -n1 --exclusive sweep.sh -x raja-perf.exe [-- ] # # Parse any args for this script and consume them using shift # leave the raja perf arguments if any for later use # # Examples: -# lalloc 1 lrun -n1 sweep.sh -x raja-perf.exe -- args +# lalloc 1 lrun -n1 sweep.sh -x raja-perf.exe -- # # run a sweep of default problem sizes with executable `raja-perf.exe` # # with args `args` # # srun -n1 --exclusive sweep.sh -x raja-perf.exe --size-min 1000 -# --size-max 10000 --size-ratio 2 -- args +# --size-max 10000 --size-ratio 2 -- # # run a sweep of problem sizes 1K to 10K with ratio 2 (1K, 2K, 4K, 8K) # # with executable `raja-perf.exe` with args `args` # From d9ef90a46c35f73ec356b504c898cdc05822de7f Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 5 Apr 2022 13:53:47 -0700 Subject: [PATCH 313/392] Simplify REDUCE_SUN gpu implementations --- src/algorithm/REDUCE_SUM-Cuda.cpp | 86 +++++++++++++++++-------------- src/algorithm/REDUCE_SUM-Hip.cpp | 85 ++++++++++++++++-------------- src/algorithm/REDUCE_SUM.hpp | 6 ++- 3 files changed, 95 insertions(+), 82 deletions(-) diff --git a/src/algorithm/REDUCE_SUM-Cuda.cpp b/src/algorithm/REDUCE_SUM-Cuda.cpp index ac1758814..1e4b932e8 100644 --- a/src/algorithm/REDUCE_SUM-Cuda.cpp +++ b/src/algorithm/REDUCE_SUM-Cuda.cpp @@ -31,42 +31,7 @@ namespace algorithm deallocCudaDeviceData(x); -template < size_t block_size > -void REDUCE_SUM::runCudaVariantImpl(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize(); - - REDUCE_SUM_DATA_SETUP; - - if ( vid == RAJA_CUDA ) { - - REDUCE_SUM_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum sum(m_sum_init); - - RAJA::forall< RAJA::cuda_exec >( - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - REDUCE_SUM_BODY; - }); - - m_sum = sum.get(); - - } - stopTimer(); - - REDUCE_SUM_DATA_TEARDOWN_CUDA; - - } else { - getCout() << "\n REDUCE_SUM : Unknown Cuda variant id = " << vid << std::endl; - } -} - -void REDUCE_SUM::runCudaVariant(VariantID vid, size_t tune_idx) +void REDUCE_SUM::runCudaVariantCub(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -128,23 +93,64 @@ void REDUCE_SUM::runCudaVariant(VariantID vid, size_t tune_idx) REDUCE_SUM_DATA_TEARDOWN_CUDA; - } else if ( vid == RAJA_CUDA ) { + } else { + getCout() << "\n REDUCE_SUM : Unknown Cuda variant id = " << vid << std::endl; + } +} +template < size_t block_size > +void REDUCE_SUM::runCudaVariantBlock(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + REDUCE_SUM_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + + REDUCE_SUM_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum sum(m_sum_init); + + 
RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + REDUCE_SUM_BODY; + }); + + m_sum = sum.get(); + + } + stopTimer(); + + REDUCE_SUM_DATA_TEARDOWN_CUDA; + + } else { + getCout() << "\n REDUCE_SUM : Unknown Cuda variant id = " << vid << std::endl; + } +} + +void REDUCE_SUM::runCudaVariant(VariantID vid, size_t tune_idx) +{ + if ( vid == Base_CUDA ) { + runCudaVariantCub(vid); + } else if ( vid == RAJA_CUDA ) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { if (tune_idx == t) { - runCudaVariantImpl(vid); + runCudaVariantBlock(vid); } t += 1; } }); - } else { getCout() << "\n REDUCE_SUM : Unknown Cuda variant id = " << vid << std::endl; } - } void REDUCE_SUM::setCudaTuningDefinitions(VariantID vid) diff --git a/src/algorithm/REDUCE_SUM-Hip.cpp b/src/algorithm/REDUCE_SUM-Hip.cpp index 83c205c1c..4315a3b9a 100644 --- a/src/algorithm/REDUCE_SUM-Hip.cpp +++ b/src/algorithm/REDUCE_SUM-Hip.cpp @@ -36,42 +36,7 @@ namespace algorithm deallocHipDeviceData(x); -template < size_t block_size > -void REDUCE_SUM::runHipVariantImpl(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize(); - - REDUCE_SUM_DATA_SETUP; - - if ( vid == RAJA_HIP ) { - - REDUCE_SUM_DATA_SETUP_HIP; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum sum(m_sum_init); - - RAJA::forall< RAJA::hip_exec >( - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - REDUCE_SUM_BODY; - }); - - m_sum = sum.get(); - - } - stopTimer(); - - REDUCE_SUM_DATA_TEARDOWN_HIP; - - } else { - getCout() << "\n REDUCE_SUM : Unknown Hip variant id = " << vid << std::endl; - } -} - -void REDUCE_SUM::runHipVariant(VariantID vid, size_t tune_idx) +void REDUCE_SUM::runHipVariantRocprim(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -155,24 +120,64 @@ void REDUCE_SUM::runHipVariant(VariantID vid, size_t tune_idx) REDUCE_SUM_DATA_TEARDOWN_HIP; + } else { + getCout() << "\n REDUCE_SUM : Unknown Hip variant id = " << vid << std::endl; + } +} - } else if ( vid == RAJA_HIP ) { +template < size_t block_size > +void REDUCE_SUM::runHipVariantBlock(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + REDUCE_SUM_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + REDUCE_SUM_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum sum(m_sum_init); + RAJA::forall< RAJA::hip_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + REDUCE_SUM_BODY; + }); + + m_sum = sum.get(); + + } + stopTimer(); + + REDUCE_SUM_DATA_TEARDOWN_HIP; + + } else { + getCout() << "\n REDUCE_SUM : Unknown Hip variant id = " << vid << std::endl; + } +} + +void REDUCE_SUM::runHipVariant(VariantID vid, size_t tune_idx) +{ + if ( vid == Base_HIP ) { + runHipVariantRocprim(vid); + } else if ( vid == RAJA_HIP ) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { if (tune_idx == t) { - runHipVariantImpl(vid); + runHipVariantBlock(vid); } t += 1; } }); - } else { getCout() << "\n REDUCE_SUM : Unknown Hip variant id = " << vid << std::endl; } - } void 
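The restructuring above splits each GPU variant into one function per tuning (runCudaVariantCub for the base variant, runCudaVariantBlock for each RAJA block size) and keeps runCudaVariant and setCudaTuningDefinitions walking the same list in the same order, so a given tune_idx always executes the implementation whose name ("cub", or "block_" plus the block size) was registered at that index. A small sketch of that index/name correspondence (container and variable names are illustrative, not the suite's):

    #include <functional>
    #include <iostream>
    #include <string>
    #include <vector>

    int main()
    {
      std::vector<std::string> tuning_names;
      std::vector<std::function<void()>> tuning_impls;

      // Registration and dispatch must push in the same order, or tune_idx
      // will run a different implementation than the one it reports.
      for (size_t block_size : {64u, 128u, 256u, 512u}) {
        tuning_names.push_back("block_" + std::to_string(block_size));
        tuning_impls.push_back([block_size] {
          std::cout << "would launch kernel with block size " << block_size << "\n";
        });
      }

      size_t tune_idx = 2;                       // selected on the command line
      std::cout << tuning_names[tune_idx] << ": ";
      tuning_impls[tune_idx]();                  // runs the block_256 implementation
      return 0;
    }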
REDUCE_SUM::setHipTuningDefinitions(VariantID vid) diff --git a/src/algorithm/REDUCE_SUM.hpp b/src/algorithm/REDUCE_SUM.hpp index 8d395201e..f6dba52db 100644 --- a/src/algorithm/REDUCE_SUM.hpp +++ b/src/algorithm/REDUCE_SUM.hpp @@ -61,10 +61,12 @@ class REDUCE_SUM : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void runCudaVariantCub(VariantID vid); + void runHipVariantRocprim(VariantID vid); template < size_t block_size > - void runCudaVariantImpl(VariantID vid); + void runCudaVariantBlock(VariantID vid); template < size_t block_size > - void runHipVariantImpl(VariantID vid); + void runHipVariantBlock(VariantID vid); private: static const size_t default_gpu_block_size = 256; From 5b6bea4a11bd97d799c9a42ecbe9dd4b8a1aef06 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 5 Apr 2022 13:57:07 -0700 Subject: [PATCH 314/392] update copyright --- src/algorithm/SCAN-Cuda.cpp | 2 +- src/algorithm/SCAN-Hip.cpp | 2 +- src/algorithm/SCAN-OMP.cpp | 2 +- src/algorithm/SCAN-OMPTarget.cpp | 2 +- src/algorithm/SCAN-Seq.cpp | 2 +- src/algorithm/SCAN.cpp | 2 +- src/algorithm/SCAN.hpp | 2 +- src/basic/INDEXLIST-Cuda.cpp | 2 +- src/basic/INDEXLIST-Hip.cpp | 2 +- src/basic/INDEXLIST-OMP.cpp | 2 +- src/basic/INDEXLIST-OMPTarget.cpp | 2 +- src/basic/INDEXLIST-Seq.cpp | 2 +- src/basic/INDEXLIST.cpp | 2 +- src/basic/INDEXLIST.hpp | 2 +- src/basic/INDEXLIST_3LOOP-Cuda.cpp | 2 +- src/basic/INDEXLIST_3LOOP-Hip.cpp | 2 +- src/basic/INDEXLIST_3LOOP-OMP.cpp | 2 +- src/basic/INDEXLIST_3LOOP-OMPTarget.cpp | 2 +- src/basic/INDEXLIST_3LOOP-Seq.cpp | 2 +- src/basic/INDEXLIST_3LOOP.cpp | 2 +- src/basic/INDEXLIST_3LOOP.hpp | 2 +- 21 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/algorithm/SCAN-Cuda.cpp b/src/algorithm/SCAN-Cuda.cpp index 8911793c0..0f9612c23 100644 --- a/src/algorithm/SCAN-Cuda.cpp +++ b/src/algorithm/SCAN-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/COPYRIGHT file for details. // diff --git a/src/algorithm/SCAN-Hip.cpp b/src/algorithm/SCAN-Hip.cpp index a105d9dd1..6ddccb115 100644 --- a/src/algorithm/SCAN-Hip.cpp +++ b/src/algorithm/SCAN-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/COPYRIGHT file for details. // diff --git a/src/algorithm/SCAN-OMP.cpp b/src/algorithm/SCAN-OMP.cpp index 59cccd851..3d21e1e0b 100644 --- a/src/algorithm/SCAN-OMP.cpp +++ b/src/algorithm/SCAN-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/COPYRIGHT file for details. 
// diff --git a/src/algorithm/SCAN-OMPTarget.cpp b/src/algorithm/SCAN-OMPTarget.cpp index 8d452bb12..16a1fc1fc 100644 --- a/src/algorithm/SCAN-OMPTarget.cpp +++ b/src/algorithm/SCAN-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/COPYRIGHT file for details. // diff --git a/src/algorithm/SCAN-Seq.cpp b/src/algorithm/SCAN-Seq.cpp index f1ebfd69c..b658ca41d 100644 --- a/src/algorithm/SCAN-Seq.cpp +++ b/src/algorithm/SCAN-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/COPYRIGHT file for details. // diff --git a/src/algorithm/SCAN.cpp b/src/algorithm/SCAN.cpp index f849d68c5..07734ceb4 100644 --- a/src/algorithm/SCAN.cpp +++ b/src/algorithm/SCAN.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/COPYRIGHT file for details. // diff --git a/src/algorithm/SCAN.hpp b/src/algorithm/SCAN.hpp index edb148738..519789a55 100644 --- a/src/algorithm/SCAN.hpp +++ b/src/algorithm/SCAN.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/COPYRIGHT file for details. // diff --git a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp index f0eb60e96..22e5fdaaf 100644 --- a/src/basic/INDEXLIST-Cuda.cpp +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/COPYRIGHT file for details. // diff --git a/src/basic/INDEXLIST-Hip.cpp b/src/basic/INDEXLIST-Hip.cpp index 6875dc5c1..1450244e8 100644 --- a/src/basic/INDEXLIST-Hip.cpp +++ b/src/basic/INDEXLIST-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/COPYRIGHT file for details. // diff --git a/src/basic/INDEXLIST-OMP.cpp b/src/basic/INDEXLIST-OMP.cpp index e1f0fab1c..7f6ddbadb 100644 --- a/src/basic/INDEXLIST-OMP.cpp +++ b/src/basic/INDEXLIST-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/COPYRIGHT file for details. 
// diff --git a/src/basic/INDEXLIST-OMPTarget.cpp b/src/basic/INDEXLIST-OMPTarget.cpp index 7b99c1bb9..99f875b27 100644 --- a/src/basic/INDEXLIST-OMPTarget.cpp +++ b/src/basic/INDEXLIST-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/COPYRIGHT file for details. // diff --git a/src/basic/INDEXLIST-Seq.cpp b/src/basic/INDEXLIST-Seq.cpp index f34dd3e23..e7bb7139b 100644 --- a/src/basic/INDEXLIST-Seq.cpp +++ b/src/basic/INDEXLIST-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/COPYRIGHT file for details. // diff --git a/src/basic/INDEXLIST.cpp b/src/basic/INDEXLIST.cpp index de8bb73fd..df523fbf6 100644 --- a/src/basic/INDEXLIST.cpp +++ b/src/basic/INDEXLIST.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/COPYRIGHT file for details. // diff --git a/src/basic/INDEXLIST.hpp b/src/basic/INDEXLIST.hpp index 30a60be97..0836d8197 100644 --- a/src/basic/INDEXLIST.hpp +++ b/src/basic/INDEXLIST.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/COPYRIGHT file for details. // diff --git a/src/basic/INDEXLIST_3LOOP-Cuda.cpp b/src/basic/INDEXLIST_3LOOP-Cuda.cpp index 142209727..22e263b4f 100644 --- a/src/basic/INDEXLIST_3LOOP-Cuda.cpp +++ b/src/basic/INDEXLIST_3LOOP-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/COPYRIGHT file for details. // diff --git a/src/basic/INDEXLIST_3LOOP-Hip.cpp b/src/basic/INDEXLIST_3LOOP-Hip.cpp index 3fd4e8f37..205b662dd 100644 --- a/src/basic/INDEXLIST_3LOOP-Hip.cpp +++ b/src/basic/INDEXLIST_3LOOP-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/COPYRIGHT file for details. 
// diff --git a/src/basic/INDEXLIST_3LOOP-OMP.cpp b/src/basic/INDEXLIST_3LOOP-OMP.cpp index 75ad07465..3ba12ea0a 100644 --- a/src/basic/INDEXLIST_3LOOP-OMP.cpp +++ b/src/basic/INDEXLIST_3LOOP-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/COPYRIGHT file for details. // diff --git a/src/basic/INDEXLIST_3LOOP-OMPTarget.cpp b/src/basic/INDEXLIST_3LOOP-OMPTarget.cpp index 04696a50d..d58dbe9e6 100644 --- a/src/basic/INDEXLIST_3LOOP-OMPTarget.cpp +++ b/src/basic/INDEXLIST_3LOOP-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/COPYRIGHT file for details. // diff --git a/src/basic/INDEXLIST_3LOOP-Seq.cpp b/src/basic/INDEXLIST_3LOOP-Seq.cpp index 264597e02..14f62a8a7 100644 --- a/src/basic/INDEXLIST_3LOOP-Seq.cpp +++ b/src/basic/INDEXLIST_3LOOP-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/COPYRIGHT file for details. // diff --git a/src/basic/INDEXLIST_3LOOP.cpp b/src/basic/INDEXLIST_3LOOP.cpp index 288dfc54e..e7d4215fa 100644 --- a/src/basic/INDEXLIST_3LOOP.cpp +++ b/src/basic/INDEXLIST_3LOOP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/COPYRIGHT file for details. // diff --git a/src/basic/INDEXLIST_3LOOP.hpp b/src/basic/INDEXLIST_3LOOP.hpp index e81783778..e19ee5508 100644 --- a/src/basic/INDEXLIST_3LOOP.hpp +++ b/src/basic/INDEXLIST_3LOOP.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/COPYRIGHT file for details. 
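The REDUCE_SUM patch that begins just below (PATCH 315) adds base CUDA and HIP tunings built on a standard three-step reduction: each thread accumulates a grid-stride partial sum into dynamic shared memory, the block folds the partials with a halving tree, and thread 0 publishes the block result with a single atomic add. A compact standalone CUDA sketch of the same pattern follows; block_sum and the small driver are illustrative only, and double-precision atomicAdd assumes a device of compute capability 6.0 or newer.

#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

template < size_t block_size >
__global__ void block_sum(const double* x, double* dsum, long n)
{
  extern __shared__ double psum[];                 // one slot per thread
  long i = blockIdx.x * block_size + threadIdx.x;

  psum[threadIdx.x] = 0.0;
  for ( ; i < n; i += gridDim.x * block_size ) {   // grid-stride accumulation
    psum[threadIdx.x] += x[i];
  }
  __syncthreads();

  for ( unsigned s = block_size / 2; s > 0; s /= 2 ) {  // in-block tree reduction
    if ( threadIdx.x < s ) { psum[threadIdx.x] += psum[threadIdx.x + s]; }
    __syncthreads();
  }

  if ( threadIdx.x == 0 ) { atomicAdd(dsum, psum[0]); } // one atomic per block
}

int main()
{
  const long n = 1 << 20;
  const size_t block = 256;
  const size_t grid = (n + block - 1) / block;

  std::vector<double> hx(n, 1.0);
  double *x, *dsum, h_sum = 0.0;
  cudaMalloc(&x, n * sizeof(double));
  cudaMalloc(&dsum, sizeof(double));
  cudaMemcpy(x, hx.data(), n * sizeof(double), cudaMemcpyHostToDevice);
  cudaMemset(dsum, 0, sizeof(double));

  block_sum<block><<<grid, block, block * sizeof(double)>>>(x, dsum, n);
  cudaMemcpy(&h_sum, dsum, sizeof(double), cudaMemcpyDeviceToHost);
  std::printf("sum = %f\n", h_sum);   // expect 1048576

  cudaFree(x); cudaFree(dsum);
  return 0;
}

The HIP kernel in the same patch is structurally identical; it obtains the shared buffer through HIP_DYNAMIC_SHARED and is launched with hipLaunchKernelGGL, which takes the dynamic shared-memory size as its fourth argument.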
// From 90934dee19058c1f09646ba6af7139a3321171e5 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 5 Apr 2022 14:20:03 -0700 Subject: [PATCH 315/392] Add REDUCE_SUM base block implementations --- src/algorithm/REDUCE_SUM-Cuda.cpp | 88 ++++++++++++++++++++++++++++++- src/algorithm/REDUCE_SUM-Hip.cpp | 87 +++++++++++++++++++++++++++++- 2 files changed, 171 insertions(+), 4 deletions(-) diff --git a/src/algorithm/REDUCE_SUM-Cuda.cpp b/src/algorithm/REDUCE_SUM-Cuda.cpp index 1e4b932e8..a40bec0f1 100644 --- a/src/algorithm/REDUCE_SUM-Cuda.cpp +++ b/src/algorithm/REDUCE_SUM-Cuda.cpp @@ -30,6 +30,39 @@ namespace algorithm #define REDUCE_SUM_DATA_TEARDOWN_CUDA \ deallocCudaDeviceData(x); +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void reduce_sum(Real_ptr x, Real_ptr dsum, Real_type sum_init, + Index_type iend) +{ + extern __shared__ Real_type psum[ ]; + + Index_type i = blockIdx.x * block_size + threadIdx.x; + + psum[ threadIdx.x ] = sum_init; + for ( ; i < iend ; i += gridDim.x * block_size ) { + psum[ threadIdx.x ] += x[i]; + } + __syncthreads(); + + for ( i = block_size / 2; i > 0; i /= 2 ) { + if ( threadIdx.x < i ) { + psum[ threadIdx.x ] += psum[ threadIdx.x + i ]; + } + __syncthreads(); + } + +#if 1 // serialized access to shared data; + if ( threadIdx.x == 0 ) { + RAJA::atomicAdd( dsum, psum[ 0 ] ); + } +#else // this doesn't work due to data races + if ( threadIdx.x == 0 ) { + *dsum += psum[ 0 ]; + } +#endif +} + void REDUCE_SUM::runCudaVariantCub(VariantID vid) { @@ -107,7 +140,39 @@ void REDUCE_SUM::runCudaVariantBlock(VariantID vid) REDUCE_SUM_DATA_SETUP; - if ( vid == RAJA_CUDA ) { + if ( vid == Base_CUDA ) { + + REDUCE_SUM_DATA_SETUP_CUDA; + + Real_ptr dsum; + allocCudaDeviceData(dsum, 1); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + initCudaDeviceData(dsum, &m_sum_init, 1); + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + reduce_sum<<>>( x, + dsum, m_sum_init, + iend ); + cudaErrchk( cudaGetLastError() ); + + Real_type lsum; + Real_ptr plsum = &lsum; + getCudaDeviceData(plsum, dsum, 1); + + m_sum = lsum; + + } + stopTimer(); + + deallocCudaDeviceData(dsum); + + REDUCE_SUM_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { REDUCE_SUM_DATA_SETUP_CUDA; @@ -136,7 +201,20 @@ void REDUCE_SUM::runCudaVariantBlock(VariantID vid) void REDUCE_SUM::runCudaVariant(VariantID vid, size_t tune_idx) { if ( vid == Base_CUDA ) { - runCudaVariantCub(vid); + size_t t = 0; + if (tune_idx == t) { + runCudaVariantCub(vid); + } + t += 1; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tune_idx == t) { + runCudaVariantBlock(vid); + } + t += 1; + } + }); } else if ( vid == RAJA_CUDA ) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { @@ -157,6 +235,12 @@ void REDUCE_SUM::setCudaTuningDefinitions(VariantID vid) { if ( vid == Base_CUDA ) { addVariantTuningName(vid, "cub"); + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } else if ( vid == RAJA_CUDA ) { seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || diff --git a/src/algorithm/REDUCE_SUM-Hip.cpp b/src/algorithm/REDUCE_SUM-Hip.cpp index 4315a3b9a..c230e4dc8 100644 --- a/src/algorithm/REDUCE_SUM-Hip.cpp 
+++ b/src/algorithm/REDUCE_SUM-Hip.cpp @@ -35,6 +35,39 @@ namespace algorithm #define REDUCE_SUM_DATA_TEARDOWN_HIP \ deallocHipDeviceData(x); +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void reduce_sum(Real_ptr x, Real_ptr dsum, Real_type sum_init, + Index_type iend) +{ + HIP_DYNAMIC_SHARED(Real_type, psum); + + Index_type i = blockIdx.x * block_size + threadIdx.x; + + psum[ threadIdx.x ] = sum_init; + for ( ; i < iend ; i += gridDim.x * block_size ) { + psum[ threadIdx.x ] += x[i]; + } + __syncthreads(); + + for ( i = block_size / 2; i > 0; i /= 2 ) { + if ( threadIdx.x < i ) { + psum[ threadIdx.x ] += psum[ threadIdx.x + i ]; + } + __syncthreads(); + } + +#if 1 // serialized access to shared data; + if ( threadIdx.x == 0 ) { + RAJA::atomicAdd( dsum, psum[ 0 ] ); + } +#else // this doesn't work due to data races + if ( threadIdx.x == 0 ) { + *dsum += psum[ 0 ]; + } +#endif +} + void REDUCE_SUM::runHipVariantRocprim(VariantID vid) { @@ -134,7 +167,38 @@ void REDUCE_SUM::runHipVariantBlock(VariantID vid) REDUCE_SUM_DATA_SETUP; - if ( vid == RAJA_HIP ) { + if ( vid == Base_HIP ) { + + REDUCE_SUM_DATA_SETUP_HIP; + + Real_ptr dsum; + allocHipDeviceData(dsum, 1); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + initHipDeviceData(dsum, &m_sum_init, 1); + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + hipLaunchKernelGGL( (reduce_sum), dim3(grid_size), dim3(block_size), + sizeof(Real_type)*block_size, 0, + x, dsum, m_sum_init, iend ); + hipErrchk( hipGetLastError() ); + + Real_type lsum; + Real_ptr plsum = &lsum; + getHipDeviceData(plsum, dsum, 1); + + m_sum = lsum; + + } + stopTimer(); + + deallocHipDeviceData(dsum); + + REDUCE_SUM_DATA_TEARDOWN_HIP; + + } else if ( vid == RAJA_HIP ) { REDUCE_SUM_DATA_SETUP_HIP; @@ -163,7 +227,20 @@ void REDUCE_SUM::runHipVariantBlock(VariantID vid) void REDUCE_SUM::runHipVariant(VariantID vid, size_t tune_idx) { if ( vid == Base_HIP ) { - runHipVariantRocprim(vid); + size_t t = 0; + if (tune_idx == t) { + runHipVariantRocprim(vid); + } + t += 1; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + if (tune_idx == t) { + runHipVariantBlock(vid); + } + t += 1; + } + }); } else if ( vid == RAJA_HIP ) { size_t t = 0; seq_for(gpu_block_sizes_type{}, [&](auto block_size) { @@ -188,6 +265,12 @@ void REDUCE_SUM::setHipTuningDefinitions(VariantID vid) #elif defined(__CUDACC__) addVariantTuningName(vid, "cub"); #endif + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); } else if ( vid == RAJA_HIP ) { seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || From bc6230d852af9dee300f376d16a72190f0bfe34e Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 6 Apr 2022 09:48:30 -0700 Subject: [PATCH 316/392] Improve whitespacing --- src/algorithm/REDUCE_SUM-Cuda.cpp | 52 ++++++++++++++++++++++++++++-- src/algorithm/REDUCE_SUM-Hip.cpp | 53 +++++++++++++++++++++++++++++-- 2 files changed, 99 insertions(+), 6 deletions(-) diff --git a/src/algorithm/REDUCE_SUM-Cuda.cpp b/src/algorithm/REDUCE_SUM-Cuda.cpp index a40bec0f1..c8b4bb8e4 100644 --- a/src/algorithm/REDUCE_SUM-Cuda.cpp +++ b/src/algorithm/REDUCE_SUM-Cuda.cpp @@ -127,8 +127,11 @@ void 
REDUCE_SUM::runCudaVariantCub(VariantID vid) REDUCE_SUM_DATA_TEARDOWN_CUDA; } else { - getCout() << "\n REDUCE_SUM : Unknown Cuda variant id = " << vid << std::endl; + + getCout() << "\n REDUCE_SUM : Unknown Cuda variant id = " << vid << std::endl; + } + } template < size_t block_size > @@ -194,60 +197,103 @@ void REDUCE_SUM::runCudaVariantBlock(VariantID vid) REDUCE_SUM_DATA_TEARDOWN_CUDA; } else { - getCout() << "\n REDUCE_SUM : Unknown Cuda variant id = " << vid << std::endl; + + getCout() << "\n REDUCE_SUM : Unknown Cuda variant id = " << vid << std::endl; + } + } void REDUCE_SUM::runCudaVariant(VariantID vid, size_t tune_idx) { if ( vid == Base_CUDA ) { + size_t t = 0; + if (tune_idx == t) { + runCudaVariantCub(vid); + } + t += 1; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { + if (tune_idx == t) { + runCudaVariantBlock(vid); + } + t += 1; + } + }); + } else if ( vid == RAJA_CUDA ) { + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { + if (tune_idx == t) { + runCudaVariantBlock(vid); + } + t += 1; + } + }); + } else { - getCout() << "\n REDUCE_SUM : Unknown Cuda variant id = " << vid << std::endl; + + getCout() << "\n REDUCE_SUM : Unknown Cuda variant id = " << vid << std::endl; + } + } void REDUCE_SUM::setCudaTuningDefinitions(VariantID vid) { if ( vid == Base_CUDA ) { + addVariantTuningName(vid, "cub"); + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); + } else if ( vid == RAJA_CUDA ) { + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); + } } diff --git a/src/algorithm/REDUCE_SUM-Hip.cpp b/src/algorithm/REDUCE_SUM-Hip.cpp index c230e4dc8..691db7fae 100644 --- a/src/algorithm/REDUCE_SUM-Hip.cpp +++ b/src/algorithm/REDUCE_SUM-Hip.cpp @@ -154,8 +154,11 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) REDUCE_SUM_DATA_TEARDOWN_HIP; } else { - getCout() << "\n REDUCE_SUM : Unknown Hip variant id = " << vid << std::endl; + + getCout() << "\n REDUCE_SUM : Unknown Hip variant id = " << vid << std::endl; + } + } template < size_t block_size > @@ -220,65 +223,109 @@ void REDUCE_SUM::runHipVariantBlock(VariantID vid) REDUCE_SUM_DATA_TEARDOWN_HIP; } else { - getCout() << "\n REDUCE_SUM : Unknown Hip variant id = " << vid << std::endl; + + getCout() << "\n REDUCE_SUM : Unknown Hip variant id = " << vid << std::endl; + } + } void REDUCE_SUM::runHipVariant(VariantID vid, size_t tune_idx) { if ( vid == Base_HIP ) { + size_t t = 0; + if (tune_idx == t) { + runHipVariantRocprim(vid); + } + t += 1; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { + if (tune_idx == t) { + runHipVariantBlock(vid); + } + t += 1; + } + }); + } else if ( vid == RAJA_HIP ) { + size_t t = 0; + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { + if (tune_idx == t) { + runHipVariantBlock(vid); + } + t += 1; + } + }); + } else { - getCout() << "\n REDUCE_SUM : Unknown 
Hip variant id = " << vid << std::endl; + + getCout() << "\n REDUCE_SUM : Unknown Hip variant id = " << vid << std::endl; + } + } void REDUCE_SUM::setHipTuningDefinitions(VariantID vid) { if ( vid == Base_HIP ) { + #if defined(__HIPCC__) addVariantTuningName(vid, "rocprim"); #elif defined(__CUDACC__) addVariantTuningName(vid, "cub"); #endif + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); + } else if ( vid == RAJA_HIP ) { + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + }); + } + } } // end namespace algorithm From eccd47c785393ecfeda9d189bd13abfe36a0fdd2 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 6 Apr 2022 13:15:08 -0700 Subject: [PATCH 317/392] fix warning in INDEXLIST-OMP --- src/basic/INDEXLIST-OMP.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/basic/INDEXLIST-OMP.cpp b/src/basic/INDEXLIST-OMP.cpp index 7f6ddbadb..681e62699 100644 --- a/src/basic/INDEXLIST-OMP.cpp +++ b/src/basic/INDEXLIST-OMP.cpp @@ -198,6 +198,8 @@ void INDEXLIST::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ } +#else + RAJA_UNUSED_VAR(vid); #endif } From 7c7c0edf99d8b56141eede1419fb64d8cae3b36c Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Thu, 7 Apr 2022 13:36:09 -0500 Subject: [PATCH 318/392] fixing typo in tearDown prototype in REDUCE_STRUCT --- src/basic/REDUCE_STRUCT.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic/REDUCE_STRUCT.hpp b/src/basic/REDUCE_STRUCT.hpp index d230e8e2c..0836d8c80 100644 --- a/src/basic/REDUCE_STRUCT.hpp +++ b/src/basic/REDUCE_STRUCT.hpp @@ -78,7 +78,7 @@ class REDUCE_STRUCT : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); - void tearDown(VariantID vid), size_t tune_idx; + void tearDown(VariantID vid, size_t tune_idx); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); From 0b52788f987c696193ff25aceae843a907809b0a Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Thu, 7 Apr 2022 13:53:34 -0500 Subject: [PATCH 319/392] implementing pure virtual method runHipVariant for REDUCE_STRUCT --- src/basic/REDUCE_STRUCT.hpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/basic/REDUCE_STRUCT.hpp b/src/basic/REDUCE_STRUCT.hpp index 0836d8c80..87722a4d2 100644 --- a/src/basic/REDUCE_STRUCT.hpp +++ b/src/basic/REDUCE_STRUCT.hpp @@ -83,13 +83,15 @@ class REDUCE_STRUCT : public KernelBase void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); - template < size_t block_size > - void runHipVariant(VariantID vid); + void runCudaVariantImpl(VariantID vid); template < size_t block_size > - void runOpenMPTargetVariant(VariantID vid); + void runHipVariantImpl(VariantID vid); private: From 710f9d4bfbd666b4ee0b118a6d720b2f4ec5453d Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Thu, 7 Apr 2022 14:55:47 -0500 Subject: [PATCH 
320/392] fixing REDUCE_STRUCT kernel blocksize errors --- src/basic/REDUCE_STRUCT-Cuda.cpp | 2 +- src/basic/REDUCE_STRUCT-Hip.cpp | 7 ++++--- src/basic/REDUCE_STRUCT-OMP.cpp | 2 +- src/basic/REDUCE_STRUCT-Seq.cpp | 2 +- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index a460c4a0c..f2da1288a 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -101,7 +101,7 @@ __global__ void reduce_struct(Real_ptr x, Real_ptr y, } template < size_t block_size > -void REDUCE_STRUCT::runCudaVariant(VariantID vid) +void REDUCE_STRUCT::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index 8afb201ff..79eefa523 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -28,7 +28,8 @@ namespace basic #define REDUCE_STRUCT_DATA_TEARDOWN_HIP \ deallocHipDeviceData(particles.x); \ - deallocHipDeviceData(particles.y); \ + deallocHipDeviceData(particles.y); + template < size_t block_size > __launch_bounds__(block_size) __global__ void reduce_struct(Real_ptr x, Real_ptr y, @@ -101,7 +102,7 @@ __global__ void reduce_struct(Real_ptr x, Real_ptr y, template < size_t block_size > -void REDUCE_STRUCT::runHipVariant(VariantID vid) +void REDUCE_STRUCT::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -121,7 +122,7 @@ void REDUCE_STRUCT::runHipVariant(VariantID vid) hipErrchk(hipMemsetAsync(mem, 0.0, 6*sizeof(Real_type))); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((reduce_struct), dim3(grid_size), dim3(block_size), 6*sizeof(Real_type)*block_size, 0, + hipLaunchKernelGGL((reduce_struct), dim3(grid_size), dim3(block_size), 6*sizeof(Real_type)*block_size, 0, particles.x, particles.y, mem, mem+1,mem+2, //xcenter,xmin,xmax mem+3,mem+4,mem+5, //ycenter,ymin,ymax diff --git a/src/basic/REDUCE_STRUCT-OMP.cpp b/src/basic/REDUCE_STRUCT-OMP.cpp index 27b6b52f3..0a881dba5 100644 --- a/src/basic/REDUCE_STRUCT-OMP.cpp +++ b/src/basic/REDUCE_STRUCT-OMP.cpp @@ -19,7 +19,7 @@ namespace basic { -void REDUCE_STRUCT::runOpenMPVariant(VariantID vid) +void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) diff --git a/src/basic/REDUCE_STRUCT-Seq.cpp b/src/basic/REDUCE_STRUCT-Seq.cpp index 23c72935f..44b9d881c 100644 --- a/src/basic/REDUCE_STRUCT-Seq.cpp +++ b/src/basic/REDUCE_STRUCT-Seq.cpp @@ -19,7 +19,7 @@ namespace basic { -void REDUCE_STRUCT::runSeqVariant(VariantID vid) +void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; From 472f4e9cc1b99e82c314415a2a144366f6f7845c Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Thu, 7 Apr 2022 16:09:12 -0400 Subject: [PATCH 321/392] Update REDUCE_STRUCT.hpp update code comments --- src/basic/REDUCE_STRUCT.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/basic/REDUCE_STRUCT.hpp b/src/basic/REDUCE_STRUCT.hpp index 87722a4d2..4b1acce71 100644 --- a/src/basic/REDUCE_STRUCT.hpp +++ b/src/basic/REDUCE_STRUCT.hpp @@ -9,9 +9,10 @@ /// /// REDUCE_STRUCT kernel reference implementation: /// -/// Real_type xsum = 0.0; -/// Real_type xmin = 0.0; -/// Real_type 
xmax = 0.0; +/// Real_type xsum; Real_type ysum; +/// Real_type xmin; Real_type ymin; +/// Real_type xmax; Real_type ymax; + /// /// for (Index_type i = ibegin; i < iend; ++i ) { /// xsum += x[i] ; ysum += y[i] ; From bdca311df0cb19518ab8536e5866e4cff59becfa Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Thu, 7 Apr 2022 16:30:28 -0400 Subject: [PATCH 322/392] Update src/basic/REDUCE_STRUCT-OMP.cpp Co-authored-by: Jason Burmark --- src/basic/REDUCE_STRUCT-OMP.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic/REDUCE_STRUCT-OMP.cpp b/src/basic/REDUCE_STRUCT-OMP.cpp index 0a881dba5..b83faaccd 100644 --- a/src/basic/REDUCE_STRUCT-OMP.cpp +++ b/src/basic/REDUCE_STRUCT-OMP.cpp @@ -84,7 +84,7 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t xsum += init_struct_x_base_lam(i); xmin = RAJA_MIN(xmin, init_struct_x_base_lam(i)); xmax = RAJA_MAX(xmax, init_struct_x_base_lam(i)); - ysum += init_struct_y_base_lam(i); + ysum += reduce_struct_y_base_lam(i); ymin = RAJA_MIN(ymin, init_struct_y_base_lam(i)); ymax = RAJA_MAX(ymax, init_struct_y_base_lam(i)); From d6476424eda1c175ec2766b25dde6915bdc59d85 Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Thu, 7 Apr 2022 16:30:35 -0400 Subject: [PATCH 323/392] Update src/basic/REDUCE_STRUCT-OMP.cpp Co-authored-by: Jason Burmark --- src/basic/REDUCE_STRUCT-OMP.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic/REDUCE_STRUCT-OMP.cpp b/src/basic/REDUCE_STRUCT-OMP.cpp index b83faaccd..6fc880810 100644 --- a/src/basic/REDUCE_STRUCT-OMP.cpp +++ b/src/basic/REDUCE_STRUCT-OMP.cpp @@ -81,7 +81,7 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t reduction(max:ymax) for (Index_type i = ibegin; i < iend; ++i ) { - xsum += init_struct_x_base_lam(i); + xsum += reduce_struct_x_base_lam(i); xmin = RAJA_MIN(xmin, init_struct_x_base_lam(i)); xmax = RAJA_MAX(xmax, init_struct_x_base_lam(i)); ysum += reduce_struct_y_base_lam(i); From 1ebf62d78afae128ed1a9e65b3c82d64cfadcd45 Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Thu, 7 Apr 2022 16:30:51 -0400 Subject: [PATCH 324/392] Update src/basic/REDUCE_STRUCT-Cuda.cpp Co-authored-by: Rich Hornung --- src/basic/REDUCE_STRUCT-Cuda.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index f2da1288a..0c8204a5c 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // From 8e12aea0e1a40a4dd8f47425dc6b8c7543562bc5 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 7 Apr 2022 14:45:38 -0700 Subject: [PATCH 325/392] Fix file header. --- .gitlab-ci.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index bd630d925..942920275 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,6 +1,7 @@ ############################################################################### -# Copyright (c) 2016-2020, Lawrence Livermore National Security, LLC -# and RAJA project contributors. 
See the RAJA/COPYRIGHT file for details. +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +# and RAJA Performance Suite project contributors. +# See the RAJAPerf/COPYRIGHT file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### From 57537228ce38802dac7ecb35afa5b2735029459a Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 8 Apr 2022 12:07:33 -0500 Subject: [PATCH 326/392] removing extra / in REDUCE_STRUCT-Cuda definitions --- src/basic/REDUCE_STRUCT-Cuda.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index 0c8204a5c..6cc4c3eb2 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -29,7 +29,8 @@ namespace basic #define REDUCE_STRUCT_DATA_TEARDOWN_CUDA \ deallocCudaDeviceData(particles.x); \ - deallocCudaDeviceData(particles.y); \ + deallocCudaDeviceData(particles.y); + template < size_t block_size > __launch_bounds__(block_size) __global__ void reduce_struct(Real_ptr x, Real_ptr y, From 87ed7932eda31c0eff0a3b59cf4fa045ac87052a Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 8 Apr 2022 12:36:20 -0500 Subject: [PATCH 327/392] fix incorrect init variables in REDUCE_STRUCT-OMP.cpp --- src/basic/REDUCE_STRUCT-OMP.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/basic/REDUCE_STRUCT-OMP.cpp b/src/basic/REDUCE_STRUCT-OMP.cpp index 6fc880810..0a881dba5 100644 --- a/src/basic/REDUCE_STRUCT-OMP.cpp +++ b/src/basic/REDUCE_STRUCT-OMP.cpp @@ -81,10 +81,10 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t reduction(max:ymax) for (Index_type i = ibegin; i < iend; ++i ) { - xsum += reduce_struct_x_base_lam(i); + xsum += init_struct_x_base_lam(i); xmin = RAJA_MIN(xmin, init_struct_x_base_lam(i)); xmax = RAJA_MAX(xmax, init_struct_x_base_lam(i)); - ysum += reduce_struct_y_base_lam(i); + ysum += init_struct_y_base_lam(i); ymin = RAJA_MIN(ymin, init_struct_y_base_lam(i)); ymax = RAJA_MAX(ymax, init_struct_y_base_lam(i)); From 0bb2684a8f92c6f61da92668b76970057aeb5d09 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 8 Apr 2022 12:42:31 -0500 Subject: [PATCH 328/392] update cuda kernel definition for blocksize template --- src/basic/REDUCE_STRUCT-Cuda.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index 6cc4c3eb2..f9714429d 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -122,7 +122,7 @@ void REDUCE_STRUCT::runCudaVariantImpl(VariantID vid) cudaErrchk(cudaMemsetAsync(mem, 0.0, 6*sizeof(Real_type))); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - reduce_struct<<>> + reduce_struct<<>> (particles.x, particles.y, mem, mem+1,mem+2, //xcenter,xmin,xmax mem+3,mem+4,mem+5, //ycenter,ymin,ymax From 119af4bf89fe2da46b322f51653a7eb1c13aefa1 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 8 Apr 2022 10:43:31 -0700 Subject: [PATCH 329/392] Fixed copyright dates, indentation, and missing paren. 
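PATCH 321 above rewrites the REDUCE_STRUCT reference comment: the kernel reduces the x and y arrays to a sum (used for the center), a minimum, and a maximum each. A minimal sequential C++ sketch of that reference loop follows, with illustrative data and with the min/max accumulators started from their identity values, which is also where the later "update initializations" commits in this series (m_init_min, m_init_max) land.

#include <algorithm>
#include <cstdio>
#include <limits>
#include <vector>

int main()
{
  std::vector<double> x = {1.0, 4.0, 2.0}, y = {3.0, 0.5, 7.0};

  double xsum = 0.0, ysum = 0.0;
  double xmin = std::numeric_limits<double>::max(), ymin = xmin;
  double xmax = std::numeric_limits<double>::lowest(), ymax = xmax;

  for (size_t i = 0; i < x.size(); ++i) {
    xsum += x[i];  ysum += y[i];
    xmin = std::min(xmin, x[i]);  xmax = std::max(xmax, x[i]);
    ymin = std::min(ymin, y[i]);  ymax = std::max(ymax, y[i]);
  }

  std::printf("center (%f, %f)  x in [%f, %f]  y in [%f, %f]\n",
              xsum / x.size(), ysum / y.size(), xmin, xmax, ymin, ymax);
}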
--- src/basic/REDUCE_STRUCT-Hip.cpp | 2 +- src/basic/REDUCE_STRUCT-OMP.cpp | 40 +++++++++++++++------------ src/basic/REDUCE_STRUCT-OMPTarget.cpp | 2 +- src/basic/REDUCE_STRUCT-Seq.cpp | 2 +- src/basic/REDUCE_STRUCT.cpp | 2 +- src/basic/REDUCE_STRUCT.hpp | 2 +- 6 files changed, 28 insertions(+), 22 deletions(-) diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index 79eefa523..1d8e8c9ef 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/REDUCE_STRUCT-OMP.cpp b/src/basic/REDUCE_STRUCT-OMP.cpp index 0a881dba5..7758f3132 100644 --- a/src/basic/REDUCE_STRUCT-OMP.cpp +++ b/src/basic/REDUCE_STRUCT-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // @@ -49,10 +49,12 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t for (Index_type i = ibegin; i < iend; ++i ) { REDUCE_STRUCT_BODY; } - particles.SetCenter(xsum/particles.N,ysum/particles.N); - particles.SetXMin(xmin); particles.SetXMax(xmax); - particles.SetYMin(ymin); particles.SetYMax(ymax); - m_particles=particles; + + particles.SetCenter(xsum/particles.N,ysum/particles.N); + particles.SetXMin(xmin); particles.SetXMax(xmax); + particles.SetYMin(ymin); particles.SetYMax(ymax); + m_particles=particles; + } stopTimer(); @@ -67,6 +69,7 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t auto reduce_struct_y_base_lam = [=](Index_type i) -> Real_type { return particles.y[i]; }; + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -80,19 +83,21 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t reduction(min:ymin), \ reduction(max:ymax) - for (Index_type i = ibegin; i < iend; ++i ) { - xsum += init_struct_x_base_lam(i); - xmin = RAJA_MIN(xmin, init_struct_x_base_lam(i)); - xmax = RAJA_MAX(xmax, init_struct_x_base_lam(i)); - ysum += init_struct_y_base_lam(i); - ymin = RAJA_MIN(ymin, init_struct_y_base_lam(i)); - ymax = RAJA_MAX(ymax, init_struct_y_base_lam(i)); + for (Index_type i = ibegin; i < iend; ++i ) { + xsum += init_struct_x_base_lam(i); + xmin = RAJA_MIN(xmin, init_struct_x_base_lam(i)); + xmax = RAJA_MAX(xmax, init_struct_x_base_lam(i)); + ysum += init_struct_y_base_lam(i); + ymin = RAJA_MIN(ymin, init_struct_y_base_lam(i)); + ymax = RAJA_MAX(ymax, init_struct_y_base_lam(i)); + } + + particles.SetCenter(xsum/particles.N,ysum/particles.N); + particles.SetXMin(xmin); particles.SetXMax(xmax); + particles.SetYMin(ymin); particles.SetYMax(ymax); + m_particles=particles; - } - particles.SetCenter(xsum/particles.N,ysum/particles.N); - particles.SetXMin(xmin); particles.SetXMax(xmax); - particles.SetYMin(ymin); particles.SetYMax(ymax); - m_particles=particles; + } stopTimer(); break; @@ -116,6 +121,7 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t particles.SetXMin(static_cast(xmin.get())); 
particles.SetYMin(static_cast(xmax.get())); particles.SetYMax(static_cast(ymax.get())); particles.SetYMax(static_cast(ymax.get())); m_particles=particles; + } stopTimer(); diff --git a/src/basic/REDUCE_STRUCT-OMPTarget.cpp b/src/basic/REDUCE_STRUCT-OMPTarget.cpp index fe6d157a9..63fd713cc 100644 --- a/src/basic/REDUCE_STRUCT-OMPTarget.cpp +++ b/src/basic/REDUCE_STRUCT-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/REDUCE_STRUCT-Seq.cpp b/src/basic/REDUCE_STRUCT-Seq.cpp index 44b9d881c..24ffb9759 100644 --- a/src/basic/REDUCE_STRUCT-Seq.cpp +++ b/src/basic/REDUCE_STRUCT-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/REDUCE_STRUCT.cpp b/src/basic/REDUCE_STRUCT.cpp index b4c3906e5..8b6bde26c 100644 --- a/src/basic/REDUCE_STRUCT.cpp +++ b/src/basic/REDUCE_STRUCT.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/REDUCE_STRUCT.hpp b/src/basic/REDUCE_STRUCT.hpp index 4b1acce71..dfbfbb4e4 100644 --- a/src/basic/REDUCE_STRUCT.hpp +++ b/src/basic/REDUCE_STRUCT.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
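The OpenMP base variant reshaped in the commit above (and cleaned up further in PATCH 330) performs all six reductions in one parallel loop by stacking reduction clauses. A standalone sketch of that clause combination, reduced to one array and three scalars, follows; the data is illustrative, min/max reductions require OpenMP 3.1 or newer (build with -fopenmp or the compiler's equivalent), and std::min/std::max stand in for the suite's RAJA_MIN/RAJA_MAX macros.

#include <algorithm>
#include <cstdio>
#include <limits>
#include <vector>

int main()
{
  std::vector<double> x(1000, 1.0);
  x[10] = -5.0;  x[20] = 9.0;

  double xsum = 0.0;
  double xmin = std::numeric_limits<double>::max();
  double xmax = std::numeric_limits<double>::lowest();

  #pragma omp parallel for reduction(+:xsum) reduction(min:xmin) reduction(max:xmax)
  for (long i = 0; i < (long)x.size(); ++i) {
    xsum += x[i];
    xmin = std::min(xmin, x[i]);
    xmax = std::max(xmax, x[i]);
  }

  std::printf("sum %f  min %f  max %f\n", xsum, xmin, xmax);
}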
// From 171d27e5eff05a9a518729ff5ff07afdca67556d Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 8 Apr 2022 11:14:50 -0700 Subject: [PATCH 330/392] Fix lambda names for consistency and compilation errors, indentation & spacing --- src/basic/REDUCE_STRUCT-OMP.cpp | 7 ++-- src/basic/REDUCE_STRUCT-Seq.cpp | 73 ++++++++++++++++++--------------- 2 files changed, 45 insertions(+), 35 deletions(-) diff --git a/src/basic/REDUCE_STRUCT-OMP.cpp b/src/basic/REDUCE_STRUCT-OMP.cpp index 7758f3132..5bf3c5a4a 100644 --- a/src/basic/REDUCE_STRUCT-OMP.cpp +++ b/src/basic/REDUCE_STRUCT-OMP.cpp @@ -66,6 +66,7 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t auto reduce_struct_x_base_lam = [=](Index_type i) -> Real_type { return particles.x[i]; }; + auto reduce_struct_y_base_lam = [=](Index_type i) -> Real_type { return particles.y[i]; }; @@ -76,18 +77,18 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t Real_type xsum = 0.0, ysum = 0.0; Real_type xmin = 0.0, ymin = 0.0; Real_type xmax = 0.0, ymax = 0.0; + #pragma omp parallel for reduction(+:xsum), \ reduction(min:xmin), \ reduction(max:xmax), \ reduction(+:ysum), \ reduction(min:ymin), \ reduction(max:ymax) - for (Index_type i = ibegin; i < iend; ++i ) { - xsum += init_struct_x_base_lam(i); + xsum += reduce_struct_x_base_lam(i); xmin = RAJA_MIN(xmin, init_struct_x_base_lam(i)); xmax = RAJA_MAX(xmax, init_struct_x_base_lam(i)); - ysum += init_struct_y_base_lam(i); + ysum += reduce_struct_y_base_lam(i); ymin = RAJA_MIN(ymin, init_struct_y_base_lam(i)); ymax = RAJA_MAX(ymax, init_struct_y_base_lam(i)); } diff --git a/src/basic/REDUCE_STRUCT-Seq.cpp b/src/basic/REDUCE_STRUCT-Seq.cpp index 24ffb9759..7fff0f51a 100644 --- a/src/basic/REDUCE_STRUCT-Seq.cpp +++ b/src/basic/REDUCE_STRUCT-Seq.cpp @@ -30,21 +30,23 @@ void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune switch ( vid ) { case Base_Seq : { + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - Real_type xsum = 0.0; Real_type ysum = 0.0; - Real_type xmin = 0.0; Real_type ymin = 0.0; - Real_type xmax = 0.0; Real_type ymax = 0.0; + Real_type xsum = 0.0; Real_type ysum = 0.0; + Real_type xmin = 0.0; Real_type ymin = 0.0; + Real_type xmax = 0.0; Real_type ymax = 0.0; - for (Index_type i = ibegin; i < iend; ++i ) { - REDUCE_STRUCT_BODY; - } + for (Index_type i = ibegin; i < iend; ++i ) { + REDUCE_STRUCT_BODY; + } + + particles.SetCenter(xsum/(particles.N),ysum/(particles.N)); + particles.SetXMin(xmin); particles.SetXMax(xmax); + particles.SetYMin(ymin); particles.SetYMax(ymax); + m_particles=particles; - particles.SetCenter(xsum/(particles.N),ysum/(particles.N)); - particles.SetXMin(xmin); particles.SetXMax(xmax); - particles.SetYMin(ymin); particles.SetYMax(ymax); - m_particles=particles; } stopTimer(); @@ -53,31 +55,36 @@ void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { - auto init_struct_x_base_lam = [=](Index_type i) -> Real_type { - return particles.x[i]; - }; - auto init_struct_y_base_lam = [=](Index_type i) -> Real_type { - return particles.y[i]; - }; + + auto reduce_struct_x_base_lam = [=](Index_type i) -> Real_type { + return particles.x[i]; + }; + + auto reduce_struct_y_base_lam = [=](Index_type i) -> Real_type { + return particles.y[i]; + }; + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - Real_type xsum = 0.0; Real_type ysum = 0.0; - Real_type xmin = 0.0; Real_type ymin = 0.0; 
- Real_type xmax = 0.0; Real_type ymax = 0.0; - - for (Index_type i = ibegin; i < iend; ++i ) { - xsum += init_struct_x_base_lam(i); - xmin = RAJA_MIN(xmin, init_struct_x_base_lam(i)); - xmax = RAJA_MAX(xmax, init_struct_x_base_lam(i)); - ysum += init_struct_y_base_lam(i); - ymin = RAJA_MIN(ymin, init_struct_y_base_lam(i)); - ymax = RAJA_MAX(ymax, init_struct_y_base_lam(i)); - } - particles.SetCenter(xsum/(particles.N),ysum/(particles.N)); - particles.SetXMin(xmin); particles.SetXMax(xmax); - particles.SetYMin(ymin); particles.SetYMax(ymax); - m_particles=particles; + Real_type xsum = 0.0; Real_type ysum = 0.0; + Real_type xmin = 0.0; Real_type ymin = 0.0; + Real_type xmax = 0.0; Real_type ymax = 0.0; + + for (Index_type i = ibegin; i < iend; ++i ) { + xsum += reduce_struct_x_base_lam(i); + xmin = RAJA_MIN(xmin, init_struct_x_base_lam(i)); + xmax = RAJA_MAX(xmax, init_struct_x_base_lam(i)); + ysum += reduce_struct_y_base_lam(i); + ymin = RAJA_MIN(ymin, init_struct_y_base_lam(i)); + ymax = RAJA_MAX(ymax, init_struct_y_base_lam(i)); + } + + particles.SetCenter(xsum/(particles.N),ysum/(particles.N)); + particles.SetXMin(xmin); particles.SetXMax(xmax); + particles.SetYMin(ymin); particles.SetYMax(ymax); + m_particles=particles; + } stopTimer(); @@ -85,6 +92,7 @@ void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune } case RAJA_Seq : { + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -101,6 +109,7 @@ void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune particles.SetXMin(static_cast(xmin.get())); particles.SetXMax(static_cast(xmax.get())); particles.SetYMin(static_cast(ymin.get())); particles.SetYMax(static_cast(ymax.get())); m_particles=particles; + } stopTimer(); From 2ef4e31d0640600b226002c0f0dab3847a12678d Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 8 Apr 2022 11:26:02 -0700 Subject: [PATCH 331/392] Fix lambda names. 
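The RAJA variants touched in PATCH 330 and 331 express the same reductions through reducer objects (RAJA::ReduceSum, RAJA::ReduceMin, RAJA::ReduceMax) that the loop body updates and the host reads back with get(). A hedged sketch of that usage with the sequential policy follows; it assumes a RAJA installation, and the suite swaps in omp_reduce, cuda_reduce, or hip_reduce together with the matching execution policy for the other back ends.

#include "RAJA/RAJA.hpp"
#include <cstdio>
#include <limits>
#include <vector>

int main()
{
  std::vector<double> x(100, 2.0);
  x[7] = -1.0;  x[42] = 9.0;
  const double* xp = x.data();

  RAJA::ReduceSum<RAJA::seq_reduce, double> xsum(0.0);
  RAJA::ReduceMin<RAJA::seq_reduce, double> xmin(std::numeric_limits<double>::max());
  RAJA::ReduceMax<RAJA::seq_reduce, double> xmax(std::numeric_limits<double>::lowest());

  RAJA::forall<RAJA::seq_exec>(
    RAJA::RangeSegment(0, static_cast<RAJA::Index_type>(x.size())),
    [=](RAJA::Index_type i) {
      xsum += xp[i];
      xmin.min(xp[i]);
      xmax.max(xp[i]);
  });

  std::printf("sum %f  min %f  max %f\n", xsum.get(), xmin.get(), xmax.get());
}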
--- src/basic/REDUCE_STRUCT-OMP.cpp | 8 ++++---- src/basic/REDUCE_STRUCT-Seq.cpp | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/basic/REDUCE_STRUCT-OMP.cpp b/src/basic/REDUCE_STRUCT-OMP.cpp index 5bf3c5a4a..87cd93244 100644 --- a/src/basic/REDUCE_STRUCT-OMP.cpp +++ b/src/basic/REDUCE_STRUCT-OMP.cpp @@ -86,11 +86,11 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t reduction(max:ymax) for (Index_type i = ibegin; i < iend; ++i ) { xsum += reduce_struct_x_base_lam(i); - xmin = RAJA_MIN(xmin, init_struct_x_base_lam(i)); - xmax = RAJA_MAX(xmax, init_struct_x_base_lam(i)); + xmin = RAJA_MIN(xmin, reduce_struct_x_base_lam(i)); + xmax = RAJA_MAX(xmax, reduce_struct_x_base_lam(i)); ysum += reduce_struct_y_base_lam(i); - ymin = RAJA_MIN(ymin, init_struct_y_base_lam(i)); - ymax = RAJA_MAX(ymax, init_struct_y_base_lam(i)); + ymin = RAJA_MIN(ymin, reduce_struct_y_base_lam(i)); + ymax = RAJA_MAX(ymax, reduce_struct_y_base_lam(i)); } particles.SetCenter(xsum/particles.N,ysum/particles.N); diff --git a/src/basic/REDUCE_STRUCT-Seq.cpp b/src/basic/REDUCE_STRUCT-Seq.cpp index 7fff0f51a..741ae6ea5 100644 --- a/src/basic/REDUCE_STRUCT-Seq.cpp +++ b/src/basic/REDUCE_STRUCT-Seq.cpp @@ -73,11 +73,11 @@ void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune for (Index_type i = ibegin; i < iend; ++i ) { xsum += reduce_struct_x_base_lam(i); - xmin = RAJA_MIN(xmin, init_struct_x_base_lam(i)); - xmax = RAJA_MAX(xmax, init_struct_x_base_lam(i)); + xmin = RAJA_MIN(xmin, reduce_struct_x_base_lam(i)); + xmax = RAJA_MAX(xmax, reduce_struct_x_base_lam(i)); ysum += reduce_struct_y_base_lam(i); - ymin = RAJA_MIN(ymin, init_struct_y_base_lam(i)); - ymax = RAJA_MAX(ymax, init_struct_y_base_lam(i)); + ymin = RAJA_MIN(ymin, reduce_struct_y_base_lam(i)); + ymax = RAJA_MAX(ymax, reduce_struct_y_base_lam(i)); } particles.SetCenter(xsum/(particles.N),ysum/(particles.N)); From bf5d30c3c400a6538c2c64a682a16269ad1759f1 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 8 Apr 2022 13:34:20 -0500 Subject: [PATCH 332/392] point OMP RAJA body to correct macro name --- src/basic/REDUCE_STRUCT-OMP.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic/REDUCE_STRUCT-OMP.cpp b/src/basic/REDUCE_STRUCT-OMP.cpp index 87cd93244..1999d1b58 100644 --- a/src/basic/REDUCE_STRUCT-OMP.cpp +++ b/src/basic/REDUCE_STRUCT-OMP.cpp @@ -115,7 +115,7 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t RAJA::forall( RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - REDUCE_REAL_BODY_RAJA; + REDUCE_STRUCT_BODY_RAJA; }); particles.SetCenter(static_cast(xsum.get()/(particles.N)),ysum.get()/(particles.N)); From e27ffa380906f744ba0e2810a5cbd70d4bc6acd7 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 8 Apr 2022 11:35:58 -0700 Subject: [PATCH 333/392] Code formatting --- src/basic/REDUCE_STRUCT-Cuda.cpp | 25 +++++++++++++++--------- src/basic/REDUCE_STRUCT-Hip.cpp | 28 +++++++++++++++++---------- src/basic/REDUCE_STRUCT-OMPTarget.cpp | 3 +++ 3 files changed, 37 insertions(+), 19 deletions(-) diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index f9714429d..ef324bd73 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -114,19 +114,21 @@ void REDUCE_STRUCT::runCudaVariantImpl(VariantID vid) REDUCE_STRUCT_DATA_SETUP_CUDA; - Real_ptr mem; //xcenter,xmin,xmax,ycenter,ymin,ymax - allocCudaDeviceData(mem,6); + Real_ptr mem; 
//xcenter,xmin,xmax,ycenter,ymin,ymax + allocCudaDeviceData(mem,6); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + cudaErrchk(cudaMemsetAsync(mem, 0.0, 6*sizeof(Real_type))); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - reduce_struct<<>> - (particles.x, particles.y, - mem, mem+1,mem+2, //xcenter,xmin,xmax - mem+3,mem+4,mem+5, //ycenter,ymin,ymax - particles.N); + reduce_struct<<>>( + particles.x, particles.y, + mem, mem+1, mem+2, // xcenter,xmin,xmax + mem+3, mem+4, mem+5, // ycenter,ymin,ymax + particles.N); cudaErrchk( cudaGetLastError() ); Real_type lmem[6]={0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; @@ -139,6 +141,7 @@ void REDUCE_STRUCT::runCudaVariantImpl(VariantID vid) particles.SetYMin(lmem[4]); particles.SetYMax(lmem[5]); m_particles=particles; + } stopTimer(); @@ -158,14 +161,15 @@ void REDUCE_STRUCT::runCudaVariantImpl(VariantID vid) RAJA::ReduceMax xmax=0.0, ymax=0.0; RAJA::forall< RAJA::cuda_exec >( - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - REDUCE_STRUCT_BODY_RAJA; + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + REDUCE_STRUCT_BODY_RAJA; }); particles.SetCenter(static_cast(xsum.get()/(particles.N)),ysum.get()/(particles.N)); particles.SetXMin(static_cast(xmin.get())); particles.SetXMax(static_cast(xmax.get())); particles.SetYMin(static_cast(ymin.get())); particles.SetYMax(static_cast(ymax.get())); m_particles=particles; + } stopTimer(); @@ -174,8 +178,11 @@ void REDUCE_STRUCT::runCudaVariantImpl(VariantID vid) } else { getCout() << "\n REDUCE_STRUCT : Unknown CUDA variant id = " << vid << std::endl; } + } + RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(REDUCE_STRUCT, Cuda) + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index 1d8e8c9ef..feb6810d2 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -114,20 +114,23 @@ void REDUCE_STRUCT::runHipVariantImpl(VariantID vid) REDUCE_STRUCT_DATA_SETUP_HIP; - Real_ptr mem; //xcenter,xmin,xmax,ycenter,ymin,ymax - allocHipDeviceData(mem,6); + Real_ptr mem; //xcenter,xmin,xmax,ycenter,ymin,ymax + allocHipDeviceData(mem,6); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { hipErrchk(hipMemsetAsync(mem, 0.0, 6*sizeof(Real_type))); + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((reduce_struct), dim3(grid_size), dim3(block_size), 6*sizeof(Real_type)*block_size, 0, - particles.x, particles.y, - mem, mem+1,mem+2, //xcenter,xmin,xmax - mem+3,mem+4,mem+5, //ycenter,ymin,ymax - particles.N); + hipLaunchKernelGGL((reduce_struct), + dim3(grid_size), dim3(block_size), + 6*sizeof(Real_type)*block_size, 0, + particles.x, particles.y, + mem, mem+1, mem+2, // xcenter,xmin,xmax + mem+3, mem+4, mem+5, // ycenter,ymin,ymax + particles.N); hipErrchk( hipGetLastError() ); Real_type lmem[6]={0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; @@ -140,6 +143,7 @@ void REDUCE_STRUCT::runHipVariantImpl(VariantID vid) particles.SetYMin(lmem[4]); particles.SetYMax(lmem[5]); m_particles=particles; + } stopTimer(); @@ -159,8 +163,8 @@ void REDUCE_STRUCT::runHipVariantImpl(VariantID vid) RAJA::ReduceMax xmax(0.0), ymax(0.0); RAJA::forall< RAJA::hip_exec >( - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - REDUCE_STRUCT_BODY_RAJA; + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + REDUCE_STRUCT_BODY_RAJA; }); @@ -168,6 +172,7 @@ void REDUCE_STRUCT::runHipVariantImpl(VariantID vid) 
particles.SetXMin(static_cast(xmin.get())); particles.SetXMax(static_cast(xmax.get())); particles.SetYMin(static_cast(ymin.get())); particles.SetYMax(static_cast(ymax.get())); m_particles=particles; + } stopTimer(); @@ -176,8 +181,11 @@ void REDUCE_STRUCT::runHipVariantImpl(VariantID vid) } else { getCout() << "\n REDUCE_STRUCT : Unknown Hip variant id = " << vid << std::endl; } + } - RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(REDUCE_STRUCT, Hip) + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(REDUCE_STRUCT, Hip) + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE_STRUCT-OMPTarget.cpp b/src/basic/REDUCE_STRUCT-OMPTarget.cpp index 63fd713cc..3312657f8 100644 --- a/src/basic/REDUCE_STRUCT-OMPTarget.cpp +++ b/src/basic/REDUCE_STRUCT-OMPTarget.cpp @@ -68,10 +68,12 @@ void REDUCE_STRUCT::runOpenMPTargetVariant(VariantID vid) for (Index_type i = ibegin; i < iend; ++i ) { REDUCE_STRUCT_BODY; } + particles.SetCenter(xsum/particles.N,ysum/particles.N); particles.SetXMin(xmin); particles.SetXMax(xmax); particles.SetYMin(ymin); particles.SetYMax(ymax); m_particles=particles; + } stopTimer(); @@ -107,6 +109,7 @@ void REDUCE_STRUCT::runOpenMPTargetVariant(VariantID vid) } else { std::cout << "\n REDUCE_STRUCT : Unknown OMP Target variant id = " << vid << std::endl; } + } } // end namespace basic From bc4619b05113f549768813f8a83a0b019e3ac0fd Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 8 Apr 2022 11:52:27 -0700 Subject: [PATCH 334/392] Remove extraneous tabs and spaces --- src/CMakeLists.txt | 2 +- src/basic/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 61acd7d54..bc1bf6b77 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -114,7 +114,7 @@ blt_add_executable( basic/REDUCE3_INT-OMPTarget.cpp basic/REDUCE_STRUCT.cpp basic/REDUCE_STRUCT-Seq.cpp - basic/REDUCE_STRUCT-OMPTarget.cpp + basic/REDUCE_STRUCT-OMPTarget.cpp basic/TRAP_INT.cpp basic/TRAP_INT-Seq.cpp basic/TRAP_INT-OMPTarget.cpp diff --git a/src/basic/CMakeLists.txt b/src/basic/CMakeLists.txt index 0902ada59..ceeb1a502 100644 --- a/src/basic/CMakeLists.txt +++ b/src/basic/CMakeLists.txt @@ -97,7 +97,7 @@ blt_add_library( REDUCE_STRUCT-Hip.cpp REDUCE_STRUCT-Cuda.cpp REDUCE_STRUCT-OMP.cpp - REDUCE_STRUCT-OMPTarget.cpp + REDUCE_STRUCT-OMPTarget.cpp TRAP_INT.cpp TRAP_INT-Seq.cpp TRAP_INT-Hip.cpp From 768bb60cd7b0d1955c9188017117f6f594d1ba83 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 8 Apr 2022 12:48:45 -0700 Subject: [PATCH 335/392] Fix reduction declaration/init issues and make consistent across variants --- src/basic/REDUCE_STRUCT-Cuda.cpp | 21 ++++++++++++--------- src/basic/REDUCE_STRUCT-Hip.cpp | 10 ++++++---- src/basic/REDUCE_STRUCT-OMP.cpp | 11 +++++++---- src/basic/REDUCE_STRUCT-OMPTarget.cpp | 9 ++++++--- src/basic/REDUCE_STRUCT-Seq.cpp | 9 ++++++--- 5 files changed, 37 insertions(+), 23 deletions(-) diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index ef324bd73..d32f1cee5 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -91,13 +91,13 @@ __global__ void reduce_struct(Real_ptr x, Real_ptr y, // serialized access to shared data; if ( threadIdx.x == 0 ) { - RAJA::atomicAdd( xsum, pxsum[ 0 ] ); - RAJA::atomicMin( xmin, pxmin[ 0 ] ); - RAJA::atomicMax( xmax, pxmax[ 0 ] ); + RAJA::atomicAdd( xsum, pxsum[ 0 ] ); + RAJA::atomicMin( xmin, pxmin[ 0 ] ); + RAJA::atomicMax( xmax, pxmax[ 0 ] ); - RAJA::atomicAdd( xsum, pysum[ 0 
] ); - RAJA::atomicMin( ymin, pymin[ 0 ] ); - RAJA::atomicMax( ymax, pymax[ 0 ] ); + RAJA::atomicAdd( xsum, pysum[ 0 ] ); + RAJA::atomicMin( ymin, pymin[ 0 ] ); + RAJA::atomicMax( ymax, pymax[ 0 ] ); } } @@ -156,9 +156,12 @@ void REDUCE_STRUCT::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum xsum=0.0, ysum=0.0; - RAJA::ReduceMin xmin=0.0, ymin=0.0; - RAJA::ReduceMax xmax=0.0, ymax=0.0; + RAJA::ReduceSum xsum(0.0); + RAJA::ReduceSum ysum(0.0); + RAJA::ReduceMin xmin(0.0); + RAJA::ReduceMin ymin(0.0); + RAJA::ReduceMax xmax(0.0); + RAJA::ReduceMax ymax(0.0); RAJA::forall< RAJA::cuda_exec >( RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index feb6810d2..fcb0b7093 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -158,16 +158,18 @@ void REDUCE_STRUCT::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum xsum(0.0), ysum(0.0); - RAJA::ReduceMin xmin(0.0), ymin(0.0); - RAJA::ReduceMax xmax(0.0), ymax(0.0); + RAJA::ReduceSum xsum(0.0); + RAJA::ReduceSum ysum(0.0); + RAJA::ReduceMin xmin(0.0); + RAJA::ReduceMin ymin(0.0); + RAJA::ReduceMax xmax(0.0); + RAJA::ReduceMax ymax(0.0); RAJA::forall< RAJA::hip_exec >( RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { REDUCE_STRUCT_BODY_RAJA; }); - particles.SetCenter(static_cast(xsum.get()/(particles.N)),ysum.get()/(particles.N)); particles.SetXMin(static_cast(xmin.get())); particles.SetXMax(static_cast(xmax.get())); particles.SetYMin(static_cast(ymin.get())); particles.SetYMax(static_cast(ymax.get())); diff --git a/src/basic/REDUCE_STRUCT-OMP.cpp b/src/basic/REDUCE_STRUCT-OMP.cpp index 1999d1b58..dfc00e82c 100644 --- a/src/basic/REDUCE_STRUCT-OMP.cpp +++ b/src/basic/REDUCE_STRUCT-OMP.cpp @@ -108,10 +108,13 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum xsum(0.0), ysum(0.0); - RAJA::ReduceMin xmin(0.0), ymin(0.0); - RAJA::ReduceMax xmax(0.0), ymax(0.0); + + RAJA::ReduceSum xsum(0.0); + RAJA::ReduceSum ysum(0.0); + RAJA::ReduceMin xmin(0.0); + RAJA::ReduceMin ymin(0.0); + RAJA::ReduceMax xmax(0.0); + RAJA::ReduceMax ymax(0.0); RAJA::forall( RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { diff --git a/src/basic/REDUCE_STRUCT-OMPTarget.cpp b/src/basic/REDUCE_STRUCT-OMPTarget.cpp index 3312657f8..4a5542e6f 100644 --- a/src/basic/REDUCE_STRUCT-OMPTarget.cpp +++ b/src/basic/REDUCE_STRUCT-OMPTarget.cpp @@ -86,9 +86,12 @@ void REDUCE_STRUCT::runOpenMPTargetVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum xsum(0.0), ysum(0.0); - RAJA::ReduceMin xmin(0.0), ymin(0.0); - RAJA::ReduceMax xmax(0.0), ymax(0.0); + RAJA::ReduceSum xsum(0.0); + RAJA::ReduceSum ysum(0.0); + RAJA::ReduceMin xmin(0.0); + RAJA::ReduceMin ymin(0.0); + RAJA::ReduceMax xmax(0.0); + RAJA::ReduceMax ymax(0.0); RAJA::forall>( RAJA::RangeSegment(ibegin, iend), diff --git a/src/basic/REDUCE_STRUCT-Seq.cpp b/src/basic/REDUCE_STRUCT-Seq.cpp index 741ae6ea5..7e91ca722 100644 --- a/src/basic/REDUCE_STRUCT-Seq.cpp +++ b/src/basic/REDUCE_STRUCT-Seq.cpp @@ -96,9 +96,12 @@ void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - 
RAJA::ReduceSum xsum(0.0), ysum(0.0); - RAJA::ReduceMin xmin(0.0), ymin(0.0); - RAJA::ReduceMax xmax(0.0), ymax(0.0); + RAJA::ReduceSum xsum(0.0); + RAJA::ReduceSum ysum(0.0); + RAJA::ReduceMin xmin(0.0); + RAJA::ReduceMin ymin(0.0); + RAJA::ReduceMax xmax(0.0); + RAJA::ReduceMax ymax(0.0); RAJA::forall( RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { From 71a244a42bc2c2c8da8256994425a1598fc33405 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 8 Apr 2022 15:40:50 -0500 Subject: [PATCH 336/392] moving struct in REDUCE_STRUCT test from prviate to public to avoid nvcc error --- src/basic/REDUCE_STRUCT.hpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/basic/REDUCE_STRUCT.hpp b/src/basic/REDUCE_STRUCT.hpp index dfbfbb4e4..031e42de2 100644 --- a/src/basic/REDUCE_STRUCT.hpp +++ b/src/basic/REDUCE_STRUCT.hpp @@ -94,10 +94,6 @@ class REDUCE_STRUCT : public KernelBase template < size_t block_size > void runHipVariantImpl(VariantID vid); -private: - - static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; struct particles_t{ Int_type N; Real_ptr x, y; @@ -119,7 +115,9 @@ class REDUCE_STRUCT : public KernelBase Real_type xmin, xmax; Real_type ymin, ymax; }; - +private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; Real_ptr m_x; Real_ptr m_y; particles_t m_particles; Real_type X_MIN = 0.0, X_MAX = 100.0; From 85f1b9fe718824d456e9eb2a84928a70219ee641 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Tue, 12 Apr 2022 08:19:44 -0500 Subject: [PATCH 337/392] updatin REDUCE_STRUCT init to be consistent with other tests --- src/basic/REDUCE_STRUCT.cpp | 3 +++ src/basic/REDUCE_STRUCT.hpp | 3 +++ 2 files changed, 6 insertions(+) diff --git a/src/basic/REDUCE_STRUCT.cpp b/src/basic/REDUCE_STRUCT.cpp index 8b6bde26c..65c669379 100644 --- a/src/basic/REDUCE_STRUCT.cpp +++ b/src/basic/REDUCE_STRUCT.cpp @@ -64,6 +64,9 @@ REDUCE_STRUCT::~REDUCE_STRUCT() void REDUCE_STRUCT::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { + m_init_sum = 0.0; + m_init_min = std::numeric_limits::max(); + m_init_max = std::numeric_limits::lowest(); allocAndInitData(m_x, getActualProblemSize(), vid); allocAndInitData(m_y, getActualProblemSize(), vid); Real_type dx = Lx/(Real_type)(getActualProblemSize()); diff --git a/src/basic/REDUCE_STRUCT.hpp b/src/basic/REDUCE_STRUCT.hpp index 031e42de2..f1630123c 100644 --- a/src/basic/REDUCE_STRUCT.hpp +++ b/src/basic/REDUCE_STRUCT.hpp @@ -119,6 +119,9 @@ class REDUCE_STRUCT : public KernelBase static const size_t default_gpu_block_size = 256; using gpu_block_sizes_type = gpu_block_size::make_list_type; Real_ptr m_x; Real_ptr m_y; + Real_type m_init_sum; + Real_type m_init_min; + Real_type m_init_max; particles_t m_particles; Real_type X_MIN = 0.0, X_MAX = 100.0; Real_type Y_MIN = 0.0, Y_MAX = 50.0; From 91361c93f14554e73042a7af3a443ebbd7f14112 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Thu, 14 Apr 2022 10:03:03 -0700 Subject: [PATCH 338/392] update raja for hip macro rework --- tpl/RAJA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpl/RAJA b/tpl/RAJA index dd9395777..c21ba6316 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit dd93957771d34a8bfad09db5e1a856d40f0c7fcf +Subproject commit c21ba63167d977365137f35250d6653390ac19df From 4df3d270d43b229fbde9624bffe25368c7b84bcd Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Fri, 
15 Apr 2022 09:57:00 -0400 Subject: [PATCH 339/392] update initializations --- src/basic/REDUCE_STRUCT-Cuda.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index d32f1cee5..36d12b174 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -52,13 +52,13 @@ __global__ void reduce_struct(Real_ptr x, Real_ptr y, Index_type i = blockIdx.x * blockDim.x + threadIdx.x; //x - pxsum[ threadIdx.x ] = 0.0; - pxmin[ threadIdx.x ] = std::numeric_limits::max(); - pxmax[ threadIdx.x ] = std::numeric_limits::min(); + pxsum[ threadIdx.x ] = m_init_sum; + pxmin[ threadIdx.x ] = m_init_min; + pxmax[ threadIdx.x ] = m_init_max; //y - pysum[ threadIdx.x ] = 0.0; - pymin[ threadIdx.x ] = std::numeric_limits::max(); - pymax[ threadIdx.x ] = std::numeric_limits::min(); + pysum[ threadIdx.x ] = m_init_sum; + pymin[ threadIdx.x ] = m_init_min; + pymax[ threadIdx.x ] = m_init_max; for ( ; i < iend ; i += gridDim.x * blockDim.x ) { @@ -156,12 +156,12 @@ void REDUCE_STRUCT::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum xsum(0.0); - RAJA::ReduceSum ysum(0.0); - RAJA::ReduceMin xmin(0.0); - RAJA::ReduceMin ymin(0.0); - RAJA::ReduceMax xmax(0.0); - RAJA::ReduceMax ymax(0.0); + RAJA::ReduceSum xsum(m_init_sum); + RAJA::ReduceSum ysum(m_init_sum); + RAJA::ReduceMin xmin(m_init_min); + RAJA::ReduceMin ymin(m_init_min); + RAJA::ReduceMax xmax(m_init_max); + RAJA::ReduceMax ymax(m_init_max); RAJA::forall< RAJA::cuda_exec >( RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { From a1de4df4ce7c29cf7914802a7097409a1a490e52 Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Fri, 15 Apr 2022 09:58:25 -0400 Subject: [PATCH 340/392] update initializations --- src/basic/REDUCE_STRUCT-Hip.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index fcb0b7093..f844f6ee9 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -51,13 +51,13 @@ __global__ void reduce_struct(Real_ptr x, Real_ptr y, Index_type i = blockIdx.x * blockDim.x + threadIdx.x; //x - pxsum[ threadIdx.x ] = 0.0; - pxmin[ threadIdx.x ] = std::numeric_limits::max(); - pxmax[ threadIdx.x ] = std::numeric_limits::min(); + pxsum[ threadIdx.x ] = m_init_sum; + pxmin[ threadIdx.x ] = m_init_min; + pxmax[ threadIdx.x ] = m_init_max; //y - pysum[ threadIdx.x ] = 0.0; - pymin[ threadIdx.x ] = std::numeric_limits::max(); - pymax[ threadIdx.x ] = std::numeric_limits::min(); + pysum[ threadIdx.x ] = m_init_sum; + pymin[ threadIdx.x ] = m_init_min; + pymax[ threadIdx.x ] = m_init_max; for ( ; i < iend ; i += gridDim.x * blockDim.x ) { @@ -158,12 +158,12 @@ void REDUCE_STRUCT::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum xsum(0.0); - RAJA::ReduceSum ysum(0.0); - RAJA::ReduceMin xmin(0.0); - RAJA::ReduceMin ymin(0.0); - RAJA::ReduceMax xmax(0.0); - RAJA::ReduceMax ymax(0.0); + RAJA::ReduceSum xsum(m_init_sum); + RAJA::ReduceSum ysum(m_init_sum); + RAJA::ReduceMin xmin(m_init_min); + RAJA::ReduceMin ymin(m_init_min); + RAJA::ReduceMax xmax(m_init_max); + RAJA::ReduceMax ymax(m_init_max); RAJA::forall< RAJA::hip_exec >( RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { From 
36300290058105262544cc4b5f5e091bdf925875 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 15 Apr 2022 10:23:59 -0500 Subject: [PATCH 341/392] updating REDUCE_STRUCT var naming --- src/basic/REDUCE_STRUCT-Cuda.cpp | 32 ++++++++--------- src/basic/REDUCE_STRUCT-Hip.cpp | 32 ++++++++--------- src/basic/REDUCE_STRUCT-OMP.cpp | 28 +++++++-------- src/basic/REDUCE_STRUCT-OMPTarget.cpp | 24 ++++++------- src/basic/REDUCE_STRUCT-Seq.cpp | 28 +++++++-------- src/basic/REDUCE_STRUCT.cpp | 12 +++---- src/basic/REDUCE_STRUCT.hpp | 52 +++++++++++++-------------- 7 files changed, 104 insertions(+), 104 deletions(-) diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index 36d12b174..6da22a2ef 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -23,13 +23,13 @@ namespace basic #define REDUCE_STRUCT_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(particles.x, m_x, particles.N); \ - allocAndInitCudaDeviceData(particles.y, m_y, particles.N); \ + allocAndInitCudaDeviceData(points.x, m_x, points.N); \ + allocAndInitCudaDeviceData(points.y, m_y, points.N); \ #define REDUCE_STRUCT_DATA_TEARDOWN_CUDA \ - deallocCudaDeviceData(particles.x); \ - deallocCudaDeviceData(particles.y); + deallocCudaDeviceData(points.x); \ + deallocCudaDeviceData(points.y); template < size_t block_size > __launch_bounds__(block_size) @@ -125,22 +125,22 @@ void REDUCE_STRUCT::runCudaVariantImpl(VariantID vid) reduce_struct<<>>( - particles.x, particles.y, + points.x, points.y, mem, mem+1, mem+2, // xcenter,xmin,xmax mem+3, mem+4, mem+5, // ycenter,ymin,ymax - particles.N); + points.N); cudaErrchk( cudaGetLastError() ); Real_type lmem[6]={0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; Real_ptr plmem = &lmem[0]; getCudaDeviceData(plmem, mem, 6); - particles.SetCenter(lmem[0]/particles.N,lmem[3]/particles.N); - particles.SetXMin(lmem[1]); - particles.SetXMax(lmem[2]); - particles.SetYMin(lmem[4]); - particles.SetYMax(lmem[5]); - m_particles=particles; + points.SetCenter(lmem[0]/points.N,lmem[3]/points.N); + points.SetXMin(lmem[1]); + points.SetXMax(lmem[2]); + points.SetYMin(lmem[4]); + points.SetYMax(lmem[5]); + m_points=points; } stopTimer(); @@ -168,10 +168,10 @@ void REDUCE_STRUCT::runCudaVariantImpl(VariantID vid) REDUCE_STRUCT_BODY_RAJA; }); - particles.SetCenter(static_cast(xsum.get()/(particles.N)),ysum.get()/(particles.N)); - particles.SetXMin(static_cast(xmin.get())); particles.SetXMax(static_cast(xmax.get())); - particles.SetYMin(static_cast(ymin.get())); particles.SetYMax(static_cast(ymax.get())); - m_particles=particles; + points.SetCenter(static_cast(xsum.get()/(points.N)),ysum.get()/(points.N)); + points.SetXMin(static_cast(xmin.get())); points.SetXMax(static_cast(xmax.get())); + points.SetYMin(static_cast(ymin.get())); points.SetYMax(static_cast(ymax.get())); + m_points=points; } stopTimer(); diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index f844f6ee9..e5b80f0bc 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -23,12 +23,12 @@ namespace basic #define REDUCE_STRUCT_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(particles.x, m_x, particles.N); \ - allocAndInitHipDeviceData(particles.y, m_y, particles.N); \ + allocAndInitHipDeviceData(points.x, m_x, points.N); \ + allocAndInitHipDeviceData(points.y, m_y, points.N); \ #define REDUCE_STRUCT_DATA_TEARDOWN_HIP \ - deallocHipDeviceData(particles.x); \ - deallocHipDeviceData(particles.y); + deallocHipDeviceData(points.x); \ + deallocHipDeviceData(points.y); 
template < size_t block_size > __launch_bounds__(block_size) @@ -127,22 +127,22 @@ void REDUCE_STRUCT::runHipVariantImpl(VariantID vid) hipLaunchKernelGGL((reduce_struct), dim3(grid_size), dim3(block_size), 6*sizeof(Real_type)*block_size, 0, - particles.x, particles.y, + points.x, points.y, mem, mem+1, mem+2, // xcenter,xmin,xmax mem+3, mem+4, mem+5, // ycenter,ymin,ymax - particles.N); + points.N); hipErrchk( hipGetLastError() ); Real_type lmem[6]={0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; Real_ptr plmem = &lmem[0]; getHipDeviceData(plmem, mem, 6); - particles.SetCenter(lmem[0]/particles.N,lmem[3]/particles.N); - particles.SetXMin(lmem[1]); - particles.SetXMax(lmem[2]); - particles.SetYMin(lmem[4]); - particles.SetYMax(lmem[5]); - m_particles=particles; + points.SetCenter(lmem[0]/points.N,lmem[3]/points.N); + points.SetXMin(lmem[1]); + points.SetXMax(lmem[2]); + points.SetYMin(lmem[4]); + points.SetYMax(lmem[5]); + m_points=points; } stopTimer(); @@ -170,10 +170,10 @@ void REDUCE_STRUCT::runHipVariantImpl(VariantID vid) REDUCE_STRUCT_BODY_RAJA; }); - particles.SetCenter(static_cast(xsum.get()/(particles.N)),ysum.get()/(particles.N)); - particles.SetXMin(static_cast(xmin.get())); particles.SetXMax(static_cast(xmax.get())); - particles.SetYMin(static_cast(ymin.get())); particles.SetYMax(static_cast(ymax.get())); - m_particles=particles; + points.SetCenter(static_cast(xsum.get()/(points.N)),ysum.get()/(points.N)); + points.SetXMin(static_cast(xmin.get())); points.SetXMax(static_cast(xmax.get())); + points.SetYMin(static_cast(ymin.get())); points.SetYMax(static_cast(ymax.get())); + m_points=points; } stopTimer(); diff --git a/src/basic/REDUCE_STRUCT-OMP.cpp b/src/basic/REDUCE_STRUCT-OMP.cpp index dfc00e82c..960aab266 100644 --- a/src/basic/REDUCE_STRUCT-OMP.cpp +++ b/src/basic/REDUCE_STRUCT-OMP.cpp @@ -50,10 +50,10 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t REDUCE_STRUCT_BODY; } - particles.SetCenter(xsum/particles.N,ysum/particles.N); - particles.SetXMin(xmin); particles.SetXMax(xmax); - particles.SetYMin(ymin); particles.SetYMax(ymax); - m_particles=particles; + points.SetCenter(xsum/points.N,ysum/points.N); + points.SetXMin(xmin); points.SetXMax(xmax); + points.SetYMin(ymin); points.SetYMax(ymax); + m_points=points; } stopTimer(); @@ -64,11 +64,11 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t case Lambda_OpenMP : { auto reduce_struct_x_base_lam = [=](Index_type i) -> Real_type { - return particles.x[i]; + return points.x[i]; }; auto reduce_struct_y_base_lam = [=](Index_type i) -> Real_type { - return particles.y[i]; + return points.y[i]; }; startTimer(); @@ -93,10 +93,10 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t ymax = RAJA_MAX(ymax, reduce_struct_y_base_lam(i)); } - particles.SetCenter(xsum/particles.N,ysum/particles.N); - particles.SetXMin(xmin); particles.SetXMax(xmax); - particles.SetYMin(ymin); particles.SetYMax(ymax); - m_particles=particles; + points.SetCenter(xsum/points.N,ysum/points.N); + points.SetXMin(xmin); points.SetXMax(xmax); + points.SetYMin(ymin); points.SetYMax(ymax); + m_points=points; } stopTimer(); @@ -121,10 +121,10 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t REDUCE_STRUCT_BODY_RAJA; }); - particles.SetCenter(static_cast(xsum.get()/(particles.N)),ysum.get()/(particles.N)); - particles.SetXMin(static_cast(xmin.get())); particles.SetYMin(static_cast(xmax.get())); - particles.SetYMax(static_cast(ymax.get())); 
particles.SetYMax(static_cast(ymax.get())); - m_particles=particles; + points.SetCenter(static_cast(xsum.get()/(points.N)),ysum.get()/(points.N)); + points.SetXMin(static_cast(xmin.get())); points.SetYMin(static_cast(xmax.get())); + points.SetYMax(static_cast(ymax.get())); points.SetYMax(static_cast(ymax.get())); + m_points=points; } stopTimer(); diff --git a/src/basic/REDUCE_STRUCT-OMPTarget.cpp b/src/basic/REDUCE_STRUCT-OMPTarget.cpp index 4a5542e6f..6445b8866 100644 --- a/src/basic/REDUCE_STRUCT-OMPTarget.cpp +++ b/src/basic/REDUCE_STRUCT-OMPTarget.cpp @@ -30,12 +30,12 @@ namespace basic int hid = omp_get_initial_device(); \ int did = omp_get_default_device(); \ \ - allocAndInitHipDeviceData(particles.x, m_x, particles.N, did, hid); \ - allocAndInitHipDeviceData(particles.y, m_y, particles.N, did, hid); + allocAndInitHipDeviceData(points.x, m_x, points.N, did, hid); \ + allocAndInitHipDeviceData(points.y, m_y, points.N, did, hid); #define REDUCE_STRUCT_DATA_TEARDOWN_OMP_TARGET \ - deallocHipDeviceData(particles.x); \ - deallocHipDeviceData(particles.y); \ + deallocHipDeviceData(points.x); \ + deallocHipDeviceData(points.y); \ void REDUCE_STRUCT::runOpenMPTargetVariant(VariantID vid) @@ -69,10 +69,10 @@ void REDUCE_STRUCT::runOpenMPTargetVariant(VariantID vid) REDUCE_STRUCT_BODY; } - particles.SetCenter(xsum/particles.N,ysum/particles.N); - particles.SetXMin(xmin); particles.SetXMax(xmax); - particles.SetYMin(ymin); particles.SetYMax(ymax); - m_particles=particles; + points.SetCenter(xsum/points.N,ysum/points.N); + points.SetXMin(xmin); points.SetXMax(xmax); + points.SetYMin(ymin); points.SetYMax(ymax); + m_points=points; } stopTimer(); @@ -99,10 +99,10 @@ void REDUCE_STRUCT::runOpenMPTargetVariant(VariantID vid) REDUCE_STRUCT_BODY_RAJA; }); - particles.SetCenter(static_cast(xsum.get()/(particles.N)),ysum.get()/(particles.N)); - particles.SetXMin(static_cast(xmin.get())); particles.SetYMin(static_cast(xmax.get())); - particles.SetYMax(static_cast(ymax.get())); particles.SetYMax(static_cast(ymax.get())); - m_particles=particles; + points.SetCenter(static_cast(xsum.get()/(points.N)),ysum.get()/(points.N)); + points.SetXMin(static_cast(xmin.get())); points.SetYMin(static_cast(xmax.get())); + points.SetYMax(static_cast(ymax.get())); points.SetYMax(static_cast(ymax.get())); + m_points=points; } stopTimer(); diff --git a/src/basic/REDUCE_STRUCT-Seq.cpp b/src/basic/REDUCE_STRUCT-Seq.cpp index 7e91ca722..143c98ca2 100644 --- a/src/basic/REDUCE_STRUCT-Seq.cpp +++ b/src/basic/REDUCE_STRUCT-Seq.cpp @@ -42,10 +42,10 @@ void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune REDUCE_STRUCT_BODY; } - particles.SetCenter(xsum/(particles.N),ysum/(particles.N)); - particles.SetXMin(xmin); particles.SetXMax(xmax); - particles.SetYMin(ymin); particles.SetYMax(ymax); - m_particles=particles; + points.SetCenter(xsum/(points.N),ysum/(points.N)); + points.SetXMin(xmin); points.SetXMax(xmax); + points.SetYMin(ymin); points.SetYMax(ymax); + m_points=points; } stopTimer(); @@ -57,11 +57,11 @@ void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune case Lambda_Seq : { auto reduce_struct_x_base_lam = [=](Index_type i) -> Real_type { - return particles.x[i]; + return points.x[i]; }; auto reduce_struct_y_base_lam = [=](Index_type i) -> Real_type { - return particles.y[i]; + return points.y[i]; }; startTimer(); @@ -80,10 +80,10 @@ void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune ymax = RAJA_MAX(ymax, reduce_struct_y_base_lam(i)); } - 
particles.SetCenter(xsum/(particles.N),ysum/(particles.N)); - particles.SetXMin(xmin); particles.SetXMax(xmax); - particles.SetYMin(ymin); particles.SetYMax(ymax); - m_particles=particles; + points.SetCenter(xsum/(points.N),ysum/(points.N)); + points.SetXMin(xmin); points.SetXMax(xmax); + points.SetYMin(ymin); points.SetYMax(ymax); + m_points=points; } stopTimer(); @@ -108,10 +108,10 @@ void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune REDUCE_STRUCT_BODY_RAJA; }); - particles.SetCenter(static_cast(xsum.get()/(particles.N)),static_cast(ysum.get()/(particles.N))); - particles.SetXMin(static_cast(xmin.get())); particles.SetXMax(static_cast(xmax.get())); - particles.SetYMin(static_cast(ymin.get())); particles.SetYMax(static_cast(ymax.get())); - m_particles=particles; + points.SetCenter(static_cast(xsum.get()/(points.N)),static_cast(ysum.get()/(points.N))); + points.SetXMin(static_cast(xmin.get())); points.SetXMax(static_cast(xmax.get())); + points.SetYMin(static_cast(ymin.get())); points.SetYMax(static_cast(ymax.get())); + m_points=points; } stopTimer(); diff --git a/src/basic/REDUCE_STRUCT.cpp b/src/basic/REDUCE_STRUCT.cpp index 65c669379..d5c33f906 100644 --- a/src/basic/REDUCE_STRUCT.cpp +++ b/src/basic/REDUCE_STRUCT.cpp @@ -79,12 +79,12 @@ void REDUCE_STRUCT::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) void REDUCE_STRUCT::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += m_particles.GetCenter()[0]; - checksum[vid][tune_idx] += m_particles.GetXMin(); - checksum[vid][tune_idx] += m_particles.GetXMax(); - checksum[vid][tune_idx] += m_particles.GetCenter()[1]; - checksum[vid][tune_idx] += m_particles.GetYMin(); - checksum[vid][tune_idx] += m_particles.GetYMax(); + checksum[vid][tune_idx] += m_points.GetCenter()[0]; + checksum[vid][tune_idx] += m_points.GetXMin(); + checksum[vid][tune_idx] += m_points.GetXMax(); + checksum[vid][tune_idx] += m_points.GetCenter()[1]; + checksum[vid][tune_idx] += m_points.GetYMin(); + checksum[vid][tune_idx] += m_points.GetYMax(); return; } diff --git a/src/basic/REDUCE_STRUCT.hpp b/src/basic/REDUCE_STRUCT.hpp index f1630123c..5efae03fd 100644 --- a/src/basic/REDUCE_STRUCT.hpp +++ b/src/basic/REDUCE_STRUCT.hpp @@ -20,14 +20,14 @@ /// ymin = RAJA_MIN(ymin, y[i]) ; ymax = RAJA_MAX(ymax, y[i]) ; /// } /// -/// particles.xcenter = xsum; -/// particles.xcenter /= particles.N -/// particles.xmin = xmin; -/// particles.xmax = xmax; -/// particles.ycenter = ysum; -/// particles.ycenter /= particles.N -/// particles.ymin = ymin; -/// particles.ymax = ymax; +/// points.xcenter = xsum; +/// points.xcenter /= points.N +/// points.xmin = xmin; +/// points.xmax = xmax; +/// points.ycenter = ysum; +/// points.ycenter /= points.N +/// points.ymin = ymin; +/// points.ymax = ymax; /// /// RAJA_MIN/MAX are macros that do what you would expect. 
@@ -38,26 +38,26 @@ #define REDUCE_STRUCT_DATA_SETUP \ - particles_t particles; \ - particles.N = getActualProblemSize(); \ - particles.x = m_x; \ - particles.y = m_y; \ + points points; \ + points.N = getActualProblemSize(); \ + points.x = m_x; \ + points.y = m_y; \ #define REDUCE_STRUCT_BODY \ - xsum += particles.x[i] ; \ - xmin = RAJA_MIN(xmin, particles.x[i]) ; \ - xmax = RAJA_MAX(xmax, particles.x[i]) ; \ - ysum += particles.y[i] ; \ - ymin = RAJA_MIN(ymin, particles.y[i]) ; \ - ymax = RAJA_MAX(ymax, particles.y[i]) ; + xsum += points.x[i] ; \ + xmin = RAJA_MIN(xmin, points.x[i]) ; \ + xmax = RAJA_MAX(xmax, points.x[i]) ; \ + ysum += points.y[i] ; \ + ymin = RAJA_MIN(ymin, points.y[i]) ; \ + ymax = RAJA_MAX(ymax, points.y[i]) ; #define REDUCE_STRUCT_BODY_RAJA \ - xsum += particles.x[i] ; \ - xmin.min(particles.x[i]) ; \ - xmax.max(particles.x[i]) ; \ - ysum += particles.y[i] ; \ - ymin.min(particles.y[i]) ; \ - ymax.max(particles.y[i]) ; + xsum += points.x[i] ; \ + xmin.min(points.x[i]) ; \ + xmax.max(points.x[i]) ; \ + ysum += points.y[i] ; \ + ymin.min(points.y[i]) ; \ + ymax.max(points.y[i]) ; #include "common/KernelBase.hpp" @@ -94,7 +94,7 @@ class REDUCE_STRUCT : public KernelBase template < size_t block_size > void runHipVariantImpl(VariantID vid); - struct particles_t{ + struct points{ Int_type N; Real_ptr x, y; @@ -122,7 +122,7 @@ class REDUCE_STRUCT : public KernelBase Real_type m_init_sum; Real_type m_init_min; Real_type m_init_max; - particles_t m_particles; + points m_points; Real_type X_MIN = 0.0, X_MAX = 100.0; Real_type Y_MIN = 0.0, Y_MAX = 50.0; Real_type Lx = (X_MAX) - (X_MIN); From 5476119fcd87ef283162945464eec11168e9e8a2 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 15 Apr 2022 11:09:29 -0700 Subject: [PATCH 342/392] Fix reduction initialization, formatting, code correctness, etc. 
--- src/basic/REDUCE_STRUCT-Cuda.cpp | 15 +++++--- src/basic/REDUCE_STRUCT-Hip.cpp | 15 +++++--- src/basic/REDUCE_STRUCT-OMP.cpp | 49 ++++++++++++++----------- src/basic/REDUCE_STRUCT-OMPTarget.cpp | 35 ++++++++++-------- src/basic/REDUCE_STRUCT-Seq.cpp | 51 +++++++++++++++------------ 5 files changed, 99 insertions(+), 66 deletions(-) diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index 6da22a2ef..e6a4b040f 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -36,6 +36,9 @@ __launch_bounds__(block_size) __global__ void reduce_struct(Real_ptr x, Real_ptr y, Real_ptr xsum, Real_ptr xmin, Real_ptr xmax, Real_ptr ysum, Real_ptr ymin, Real_ptr ymax, + Real_type m_init_sum, + Real_type m_init_min, + Real_type m_init_max, Index_type iend) { @@ -128,6 +131,7 @@ void REDUCE_STRUCT::runCudaVariantImpl(VariantID vid) points.x, points.y, mem, mem+1, mem+2, // xcenter,xmin,xmax mem+3, mem+4, mem+5, // ycenter,ymin,ymax + m_init_sum, m_init_min, m_init_max points.N); cudaErrchk( cudaGetLastError() ); @@ -135,7 +139,7 @@ void REDUCE_STRUCT::runCudaVariantImpl(VariantID vid) Real_ptr plmem = &lmem[0]; getCudaDeviceData(plmem, mem, 6); - points.SetCenter(lmem[0]/points.N,lmem[3]/points.N); + points.SetCenter(lmem[0]/points.N, lmem[3]/points.N); points.SetXMin(lmem[1]); points.SetXMax(lmem[2]); points.SetYMin(lmem[4]); @@ -168,9 +172,12 @@ void REDUCE_STRUCT::runCudaVariantImpl(VariantID vid) REDUCE_STRUCT_BODY_RAJA; }); - points.SetCenter(static_cast(xsum.get()/(points.N)),ysum.get()/(points.N)); - points.SetXMin(static_cast(xmin.get())); points.SetXMax(static_cast(xmax.get())); - points.SetYMin(static_cast(ymin.get())); points.SetYMax(static_cast(ymax.get())); + points.SetCenter(static_cast(xsum.get()/(points.N)), + static_cast(ysum.get()/(points.N))); + points.SetXMin(static_cast(xmin.get())); + points.SetXMax(static_cast(xmax.get())); + points.SetYMin(static_cast(ymin.get())); + points.SetYMax(static_cast(ymax.get())); m_points=points; } diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index e5b80f0bc..a2c8aa733 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -35,6 +35,9 @@ __launch_bounds__(block_size) __global__ void reduce_struct(Real_ptr x, Real_ptr y, Real_ptr xsum, Real_ptr xmin, Real_ptr xmax, Real_ptr ysum, Real_ptr ymin, Real_ptr ymax, + Real_type m_init_sum, + Real_type m_init_min, + Real_type m_init_max, Index_type iend) { @@ -130,6 +133,7 @@ void REDUCE_STRUCT::runHipVariantImpl(VariantID vid) points.x, points.y, mem, mem+1, mem+2, // xcenter,xmin,xmax mem+3, mem+4, mem+5, // ycenter,ymin,ymax + m_init_sum, m_init_min, m_init_max, points.N); hipErrchk( hipGetLastError() ); @@ -137,7 +141,7 @@ void REDUCE_STRUCT::runHipVariantImpl(VariantID vid) Real_ptr plmem = &lmem[0]; getHipDeviceData(plmem, mem, 6); - points.SetCenter(lmem[0]/points.N,lmem[3]/points.N); + points.SetCenter(lmem[0]/points.N, lmem[3]/points.N); points.SetXMin(lmem[1]); points.SetXMax(lmem[2]); points.SetYMin(lmem[4]); @@ -170,9 +174,12 @@ void REDUCE_STRUCT::runHipVariantImpl(VariantID vid) REDUCE_STRUCT_BODY_RAJA; }); - points.SetCenter(static_cast(xsum.get()/(points.N)),ysum.get()/(points.N)); - points.SetXMin(static_cast(xmin.get())); points.SetXMax(static_cast(xmax.get())); - points.SetYMin(static_cast(ymin.get())); points.SetYMax(static_cast(ymax.get())); + points.SetCenter(static_cast(xsum.get()/(points.N)), + static_cast(ysum.get()/(points.N))); + 
points.SetXMin(static_cast(xmin.get())); + points.SetXMax(static_cast(xmax.get())); + points.SetYMin(static_cast(ymin.get())); + points.SetYMax(static_cast(ymax.get())); m_points=points; } diff --git a/src/basic/REDUCE_STRUCT-OMP.cpp b/src/basic/REDUCE_STRUCT-OMP.cpp index 960aab266..4f97b4bd2 100644 --- a/src/basic/REDUCE_STRUCT-OMP.cpp +++ b/src/basic/REDUCE_STRUCT-OMP.cpp @@ -36,9 +36,9 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - Real_type xsum = 0.0; Real_type ysum = 0.0; - Real_type xmin = 0.0; Real_type ymin = 0.0; - Real_type xmax = 0.0; Real_type ymax = 0.0; + Real_type xsum = m_init_sum; Real_type ysum = m_init_sum; + Real_type xmin = m_init_min; Real_type ymin = m_init_min; + Real_type xmax = m_init_max; Real_type ymax = m_init_max; #pragma omp parallel for reduction(+:xsum), \ reduction(min:xmin), \ @@ -50,9 +50,11 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t REDUCE_STRUCT_BODY; } - points.SetCenter(xsum/points.N,ysum/points.N); - points.SetXMin(xmin); points.SetXMax(xmax); - points.SetYMin(ymin); points.SetYMax(ymax); + points.SetCenter(xsum/points.N, ysum/points.N); + points.SetXMin(xmin); + points.SetXMax(xmax); + points.SetYMin(ymin); + points.SetYMax(ymax); m_points=points; } @@ -74,9 +76,9 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - Real_type xsum = 0.0, ysum = 0.0; - Real_type xmin = 0.0, ymin = 0.0; - Real_type xmax = 0.0, ymax = 0.0; + Real_type xsum = m_init_sum; Real_type ysum = m_init_sum; + Real_type xmin = m_init_min; Real_type ymin = m_init_min; + Real_type xmax = m_init_max; Real_type ymax = m_init_max; #pragma omp parallel for reduction(+:xsum), \ reduction(min:xmin), \ @@ -93,9 +95,11 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t ymax = RAJA_MAX(ymax, reduce_struct_y_base_lam(i)); } - points.SetCenter(xsum/points.N,ysum/points.N); - points.SetXMin(xmin); points.SetXMax(xmax); - points.SetYMin(ymin); points.SetYMax(ymax); + points.SetCenter(xsum/points.N, ysum/points.N); + points.SetXMin(xmin); + points.SetXMax(xmax); + points.SetYMin(ymin); + points.SetYMax(ymax); m_points=points; } @@ -109,21 +113,24 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum xsum(0.0); - RAJA::ReduceSum ysum(0.0); - RAJA::ReduceMin xmin(0.0); - RAJA::ReduceMin ymin(0.0); - RAJA::ReduceMax xmax(0.0); - RAJA::ReduceMax ymax(0.0); + RAJA::ReduceSum xsum(m_init_sum); + RAJA::ReduceSum ysum(m_init_sum); + RAJA::ReduceMin xmin(m_init_min); + RAJA::ReduceMin ymin(m_init_min); + RAJA::ReduceMax xmax(m_init_max); + RAJA::ReduceMax ymax(m_init_max); RAJA::forall( RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { REDUCE_STRUCT_BODY_RAJA; }); - points.SetCenter(static_cast(xsum.get()/(points.N)),ysum.get()/(points.N)); - points.SetXMin(static_cast(xmin.get())); points.SetYMin(static_cast(xmax.get())); - points.SetYMax(static_cast(ymax.get())); points.SetYMax(static_cast(ymax.get())); + points.SetCenter(static_cast(xsum.get()/(points.N)), + static_cast(ysum.get()/(points.N))); + points.SetXMin(static_cast(xmin.get())); + points.SetXMax(static_cast(xmax.get())); + points.SetYMin(static_cast(ymin.get())); + points.SetYMax(static_cast(ymax.get())); m_points=points; } diff --git 
a/src/basic/REDUCE_STRUCT-OMPTarget.cpp b/src/basic/REDUCE_STRUCT-OMPTarget.cpp index 6445b8866..da27b4c14 100644 --- a/src/basic/REDUCE_STRUCT-OMPTarget.cpp +++ b/src/basic/REDUCE_STRUCT-OMPTarget.cpp @@ -53,9 +53,9 @@ void REDUCE_STRUCT::runOpenMPTargetVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - Real_type xsum = 0.0, ysum = 0.0; - Real_type xmin = 0.0, ymin = 0.0; - Real_type xmax = 0.0, ymax = 0.0; + Real_type xsum = m_init_sum; Real_type ysum = m_init_sum; + Real_type xmin = m_init_min; Real_type ymin = m_init_min; + Real_type xmax = m_init_max; Real_type ymax = m_init_max; #pragma omp target is_device_ptr(vec) device( did ) map(tofrom:xsum, xmin, xmax, ysum, ymin, ymax) #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static,1) \ @@ -69,9 +69,11 @@ void REDUCE_STRUCT::runOpenMPTargetVariant(VariantID vid) REDUCE_STRUCT_BODY; } - points.SetCenter(xsum/points.N,ysum/points.N); - points.SetXMin(xmin); points.SetXMax(xmax); - points.SetYMin(ymin); points.SetYMax(ymax); + points.SetCenter(xsum/points.N, ysum/points.N); + points.SetXMin(xmin); + points.SetXMax(xmax); + points.SetYMin(ymin); + points.SetYMax(ymax); m_points=points; } @@ -86,12 +88,12 @@ void REDUCE_STRUCT::runOpenMPTargetVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum xsum(0.0); - RAJA::ReduceSum ysum(0.0); - RAJA::ReduceMin xmin(0.0); - RAJA::ReduceMin ymin(0.0); - RAJA::ReduceMax xmax(0.0); - RAJA::ReduceMax ymax(0.0); + RAJA::ReduceSum xsum(m_init_sum); + RAJA::ReduceSum ysum(m_init_sum); + RAJA::ReduceMin xmin(m_init_min); + RAJA::ReduceMin ymin(m_init_min); + RAJA::ReduceMax xmax(m_init_max); + RAJA::ReduceMax ymax(m_init_max); RAJA::forall>( RAJA::RangeSegment(ibegin, iend), @@ -99,9 +101,12 @@ void REDUCE_STRUCT::runOpenMPTargetVariant(VariantID vid) REDUCE_STRUCT_BODY_RAJA; }); - points.SetCenter(static_cast(xsum.get()/(points.N)),ysum.get()/(points.N)); - points.SetXMin(static_cast(xmin.get())); points.SetYMin(static_cast(xmax.get())); - points.SetYMax(static_cast(ymax.get())); points.SetYMax(static_cast(ymax.get())); + points.SetCenter(static_cast(xsum.get()/(points.N)), + static_cast(ysum.get()/(points.N))); + points.SetXMin(static_cast(xmin.get())); + points.SetXMax(static_cast(xmax.get())); + points.SetYMin(static_cast(ymin.get())); + points.SetYMax(static_cast(ymax.get())); m_points=points; } diff --git a/src/basic/REDUCE_STRUCT-Seq.cpp b/src/basic/REDUCE_STRUCT-Seq.cpp index 143c98ca2..36df2b610 100644 --- a/src/basic/REDUCE_STRUCT-Seq.cpp +++ b/src/basic/REDUCE_STRUCT-Seq.cpp @@ -33,18 +33,20 @@ void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - Real_type xsum = 0.0; Real_type ysum = 0.0; - Real_type xmin = 0.0; Real_type ymin = 0.0; - Real_type xmax = 0.0; Real_type ymax = 0.0; + + Real_type xsum = m_init_sum; Real_type ysum = m_init_sum; + Real_type xmin = m_init_min; Real_type ymin = m_init_min; + Real_type xmax = m_init_max; Real_type ymax = m_init_max; for (Index_type i = ibegin; i < iend; ++i ) { REDUCE_STRUCT_BODY; } - points.SetCenter(xsum/(points.N),ysum/(points.N)); - points.SetXMin(xmin); points.SetXMax(xmax); - points.SetYMin(ymin); points.SetYMax(ymax); + points.SetCenter(xsum/(points.N), ysum/(points.N)); + points.SetXMin(xmin); + points.SetXMax(xmax); + points.SetYMin(ymin); + points.SetYMax(ymax); m_points=points; } @@ -67,9 +69,9 @@ void 
REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - Real_type xsum = 0.0; Real_type ysum = 0.0; - Real_type xmin = 0.0; Real_type ymin = 0.0; - Real_type xmax = 0.0; Real_type ymax = 0.0; + Real_type xsum = m_init_sum; Real_type ysum = m_init_sum; + Real_type xmin = m_init_min; Real_type ymin = m_init_min; + Real_type xmax = m_init_max; Real_type ymax = m_init_max; for (Index_type i = ibegin; i < iend; ++i ) { xsum += reduce_struct_x_base_lam(i); @@ -80,9 +82,11 @@ void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune ymax = RAJA_MAX(ymax, reduce_struct_y_base_lam(i)); } - points.SetCenter(xsum/(points.N),ysum/(points.N)); - points.SetXMin(xmin); points.SetXMax(xmax); - points.SetYMin(ymin); points.SetYMax(ymax); + points.SetCenter(xsum/(points.N), ysum/(points.N)); + points.SetXMin(xmin); + points.SetXMax(xmax); + points.SetYMin(ymin); + points.SetYMax(ymax); m_points=points; } @@ -96,21 +100,24 @@ void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum xsum(0.0); - RAJA::ReduceSum ysum(0.0); - RAJA::ReduceMin xmin(0.0); - RAJA::ReduceMin ymin(0.0); - RAJA::ReduceMax xmax(0.0); - RAJA::ReduceMax ymax(0.0); + RAJA::ReduceSum xsum(m_init_sum); + RAJA::ReduceSum ysum(m_init_sum); + RAJA::ReduceMin xmin(m_init_min); + RAJA::ReduceMin ymin(m_init_min); + RAJA::ReduceMax xmax(m_init_max); + RAJA::ReduceMax ymax(m_init_max); RAJA::forall( RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { REDUCE_STRUCT_BODY_RAJA; }); - points.SetCenter(static_cast(xsum.get()/(points.N)),static_cast(ysum.get()/(points.N))); - points.SetXMin(static_cast(xmin.get())); points.SetXMax(static_cast(xmax.get())); - points.SetYMin(static_cast(ymin.get())); points.SetYMax(static_cast(ymax.get())); + points.SetCenter(static_cast(xsum.get()/(points.N)), + static_cast(ysum.get()/(points.N))); + points.SetXMin(static_cast(xmin.get())); + points.SetXMax(static_cast(xmax.get())); + points.SetYMin(static_cast(ymin.get())); + points.SetYMax(static_cast(ymax.get())); m_points=points; } From d131381f50a0d82813ff5cd0eb20889a076e7289 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 15 Apr 2022 11:40:17 -0700 Subject: [PATCH 343/392] Add missing comma. 
--- src/basic/REDUCE_STRUCT-Cuda.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index e6a4b040f..32bd39234 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -131,7 +131,7 @@ void REDUCE_STRUCT::runCudaVariantImpl(VariantID vid) points.x, points.y, mem, mem+1, mem+2, // xcenter,xmin,xmax mem+3, mem+4, mem+5, // ycenter,ymin,ymax - m_init_sum, m_init_min, m_init_max + m_init_sum, m_init_min, m_init_max, points.N); cudaErrchk( cudaGetLastError() ); From 14986f4fc60429cea2b81687a4b8af10f6f8e85b Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 15 Apr 2022 13:40:46 -0500 Subject: [PATCH 344/392] fixing missing , --- src/basic/REDUCE_STRUCT-Cuda.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index e6a4b040f..32bd39234 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -131,7 +131,7 @@ void REDUCE_STRUCT::runCudaVariantImpl(VariantID vid) points.x, points.y, mem, mem+1, mem+2, // xcenter,xmin,xmax mem+3, mem+4, mem+5, // ycenter,ymin,ymax - m_init_sum, m_init_min, m_init_max + m_init_sum, m_init_min, m_init_max, points.N); cudaErrchk( cudaGetLastError() ); From 47842c690d2b72fc6d4afc2ab16f805e965c2fb9 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 15 Apr 2022 15:05:13 -0700 Subject: [PATCH 345/392] Added basic suite test and enabled in azure --- CMakeLists.txt | 31 +++++++-- Dockerfile | 16 ++--- src/common/Executor.hpp | 6 ++ test/CMakeLists.txt | 26 ++++++++ test/test-raja-perf-suite.cpp | 115 ++++++++++++++++++++++++++++++++++ 5 files changed, 180 insertions(+), 14 deletions(-) create mode 100644 test/CMakeLists.txt create mode 100644 test/test-raja-perf-suite.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index ef42271fe..fe768a8e5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,9 +6,10 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -project(RAJAPerfSuite CXX) +# C is required for googletest to find Threads +project(RAJAPerfSuite LANGUAGES CXX C) -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.14.5) option(ENABLE_RAJA_SEQUENTIAL "Run sequential variants of RAJA kernels. Disable this, and all other variants, to run _only_ raw C loops." On) @@ -21,10 +22,6 @@ if (PERFSUITE_ENABLE_WARNINGS) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Werror") endif() -set(ENABLE_TESTS Off CACHE BOOL "Enable BLT and RAJA tests") -set(ENABLE_EXAMPLES Off CACHE BOOL "Enable RAJA examples") -set(RAJA_ENABLE_EXERCISES Off CACHE BOOL "Enable RAJA exercises") - set(CMAKE_CXX_STANDARD 14) set(BLT_CXX_STD c++14) @@ -34,7 +31,24 @@ include(blt/SetupBLT.cmake) # Define RAJA PERFSUITE settings... 
# +#cmake_dependent_option(RAJA_PERFSUITE_ENABLE_TESTS "Enable RAJA Perf Suite Tests" On "ENABLE_TESTS" Off) + +if (RAJA_PERFSUITE_ENABLE_TESTS) + + set(ENABLE_TESTS On CACHE BOOL "Enable testing for RAJA Perf Suite") + set(RAJA_ENABLE_TESTS Off CACHE BOOL "") + set(CAMP_ENABLE_TESTS Off CACHE BOOL "") + set(BLT_ENABLE_TESTS Off CACHE BOOL "") + +else() + +set(ENABLE_TESTS Off CACHE BOOL "Enable BLT, camp, and RAJA tests") +set(ENABLE_EXAMPLES Off CACHE BOOL "Enable RAJA examples") + +endif() + cmake_dependent_option(RAJA_PERFSUITE_ENABLE_MPI "Build with MPI" On "ENABLE_MPI" Off) + cmake_dependent_option(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN "Build OpenMP scan variants" Off "ENABLE_OPENMP" Off) # @@ -43,6 +57,7 @@ cmake_dependent_option(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN "Build OpenMP scan var set(RAJA_ENABLE_TESTS Off CACHE BOOL "") set(RAJA_ENABLE_EXAMPLES Off CACHE BOOL "") +set(RAJA_ENABLE_EXERCISES Off CACHE BOOL "") set(ENABLE_DOCUMENTATION Off CACHE BOOL "") set(ENABLE_TBB Off CACHE BOOL "") @@ -135,3 +150,7 @@ set (CUDA_NVCC_FLAGS ${RAJA_NVCC_FLAGS}) # Each directory in the perf suite has its own CMakeLists.txt file. # add_subdirectory(src) + +if (RAJA_PERFSUITE_ENABLE_TESTS) + add_subdirectory(test) +endif() diff --git a/Dockerfile b/Dockerfile index 8aa08d31d..f1f01791c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -############################################################################### +############################################################################## # Copyright (c) 2016-22, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJA/LICENSE file for details. # @@ -11,7 +11,7 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DENABLE_OPENMP=On .. && \ make -j 6 &&\ - ./bin/raja-perf.exe --checkrun 5 -sp + ctest -T test --output-on-failure FROM ghcr.io/rse-ops/gcc-ubuntu-20.04:gcc-8.1.0 AS gcc8 ENV GTEST_COLOR=1 @@ -19,7 +19,7 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_WARNINGS_AS_ERRORS=On -DENABLE_OPENMP=On .. && \ make -j 6 &&\ - ./bin/raja-perf.exe --checkrun 5 -sp + ctest -T test --output-on-failure FROM ghcr.io/rse-ops/gcc-ubuntu-20.04:gcc-9.4.0 AS gcc9 ENV GTEST_COLOR=1 @@ -27,7 +27,7 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DENABLE_OPENMP=On .. && \ make -j 6 &&\ - ./bin/raja-perf.exe --checkrun 5 -sp + ctest -T test --output-on-failure FROM ghcr.io/rse-ops/gcc-ubuntu-20.04:gcc-11.2.0 AS gcc11 ENV GTEST_COLOR=1 @@ -35,7 +35,7 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DENABLE_OPENMP=On .. && \ make -j 6 &&\ - ./bin/raja-perf.exe --checkrun 5 -sp + ctest -T test --output-on-failure FROM ghcr.io/rse-ops/clang-ubuntu-20.04:llvm-11.0.0 AS clang11 ENV GTEST_COLOR=1 @@ -43,7 +43,7 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN cmake -DCMAKE_CXX_COMPILER=clang++ .. && \ make -j 6 &&\ - ./bin/raja-perf.exe --checkrun 5 -sp + ctest -T test --output-on-failure FROM ghcr.io/rse-ops/clang-ubuntu-20.04:llvm-11.0.0 AS clang11-debug ENV GTEST_COLOR=1 @@ -51,7 +51,7 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Debug .. 
&& \ make -j 6 &&\ - ./bin/raja-perf.exe --checkrun -sp + ctest -T test --output-on-failure FROM ghcr.io/rse-ops/clang-ubuntu-22.04:llvm-13.0.0 AS clang13 ENV GTEST_COLOR=1 @@ -59,7 +59,7 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release .. && \ make -j 6 &&\ - ./bin/raja-perf.exe --checkrun 5 -sp + ctest -T test --output-on-failure FROM ghcr.io/rse-ops/cuda:cuda-10.1.243-ubuntu-18.04 AS nvcc10 ENV GTEST_COLOR=1 diff --git a/src/common/Executor.hpp b/src/common/Executor.hpp index 8a3fe11e0..a4403f1eb 100644 --- a/src/common/Executor.hpp +++ b/src/common/Executor.hpp @@ -89,6 +89,12 @@ class Executor VariantID reference_vid; size_t reference_tune_idx; + +public: + // Methods for verification testing in CI. + std::vector getKernels() const { return kernels; } + std::vector getVariantIDs() const { return variant_ids; } + }; } // closing brace for rajaperf namespace diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt new file mode 100644 index 000000000..94284e28d --- /dev/null +++ b/test/CMakeLists.txt @@ -0,0 +1,26 @@ +############################################################################### +# Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +# and RAJA Performance Suite project contributors. +# See the RAJAPerf/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +include_directories(../src) + +set(RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS + common + apps + basic + lcals + polybench + stream + algorithm) +list(APPEND RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS ${RAJA_PERFSUITE_DEPENDS}) + +raja_add_test( + NAME test-raja-perf-suite + SOURCES test-raja-perf-suite.cpp + DEPENDS_ON ${RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS} + ) + diff --git a/test/test-raja-perf-suite.cpp b/test/test-raja-perf-suite.cpp new file mode 100644 index 000000000..693f7ef64 --- /dev/null +++ b/test/test-raja-perf-suite.cpp @@ -0,0 +1,115 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "gtest/gtest.h" + +#include "common/Executor.hpp" +#include "common/KernelBase.hpp" + +#include +#include +#include +#include + +TEST(ShortSuiteTest, Basic) +{ + // Assemble command line args for basic test + constexpr int nargs = 4; + + int argc = nargs; + + std::vector< std::string > sargv(nargs); + sargv[0] = std::string("dummy"); // for executable name + sargv[1] = std::string("--checkrun"); + sargv[2] = std::string("5"); + sargv[3] = std::string("-sp"); + + char** argv = new char* [nargs]; + for (int is = 0; is < nargs; ++is) { + argv[is] = const_cast(sargv[is].c_str()); + } + + // STEP 1: Create suite executor object with input args defined above + rajaperf::Executor executor(argc, argv); + + // STEP 2: Assemble kernels and variants to run + executor.setupSuite(); + + // STEP 3: Report suite run summary + executor.reportRunSummary(std::cout); + + // STEP 4: Execute suite + executor.runSuite(); + + // STEP 5: Access suite run data and run through checks + std::vector kernels = executor.getKernels(); + std::vector variant_ids = executor.getVariantIDs(); + + + for (size_t ik = 0; ik < kernels.size(); ++ik) { + + rajaperf::KernelBase* kernel = kernels[ik]; + + // + // Get reference checksum (first kernel variant run) + // + rajaperf::Checksum_type cksum_ref = 0.0; + size_t ivck = 0; + bool found_ref = false; + while ( ivck < variant_ids.size() && !found_ref ) { + + rajaperf::VariantID vid = variant_ids[ivck]; + size_t num_tunings = kernel->getNumVariantTunings(vid); + for (size_t tune_idx = 0; tune_idx < num_tunings; ++tune_idx) { + if ( kernel->wasVariantTuningRun(vid, tune_idx) ) { + cksum_ref = kernel->getChecksum(vid, tune_idx); + found_ref = true; + break; + } + } + ++ivck; + + } // while loop over variants until reference checksum found + + + // + // Check execution time is greater than zero and checksum diff is + // within tolerance for each variant run. + // + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + + rajaperf::VariantID vid = variant_ids[iv]; + + size_t num_tunings = kernel->getNumVariantTunings(variant_ids[iv]); + for (size_t tune_idx = 0; tune_idx < num_tunings; ++tune_idx) { + if ( kernel->wasVariantTuningRun(vid, tune_idx) ) { + + double rtime = kernel->getTotTime(vid, tune_idx); + + rajaperf::Checksum_type cksum = kernel->getChecksum(vid, tune_idx); + rajaperf::Checksum_type cksum_diff = std::abs(cksum_ref - cksum); + + // Print kernel information when running test manually + std::cout << "Check kernel, variant, tuning : " + << kernel->getName() << " , " + << rajaperf::getVariantName(vid) << " , " + << kernel->getVariantTuningName(vid, tune_idx) + << std::endl; + EXPECT_GT(rtime, 0.0); + EXPECT_LT(cksum_diff, 1e-5); + + } + } + + } // loop over variants + + } // loop over kernels + + // clean up + delete [] argv; +} From 3036da15211897f5aad05871cbc93b8dceb04d02 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 15 Apr 2022 15:21:57 -0700 Subject: [PATCH 346/392] enable azure tests for real this time... --- Dockerfile | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Dockerfile b/Dockerfile index f1f01791c..833f35fec 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,7 +9,7 @@ FROM ghcr.io/rse-ops/gcc-ubuntu-20.04:gcc-7.3.0 AS gcc7 ENV GTEST_COLOR=1 COPY . 
/home/raja/workspace WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DENABLE_OPENMP=On .. && \ +RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DENABLE_OPENMP=On -DRAJA_PERFSUITE_ENABLE_TESTS=On .. && \ make -j 6 &&\ ctest -T test --output-on-failure @@ -17,7 +17,7 @@ FROM ghcr.io/rse-ops/gcc-ubuntu-20.04:gcc-8.1.0 AS gcc8 ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_WARNINGS_AS_ERRORS=On -DENABLE_OPENMP=On .. && \ +RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_WARNINGS_AS_ERRORS=On -DENABLE_OPENMP=On -DRAJA_PERFSUITE_ENABLE_TESTS=On .. && \ make -j 6 &&\ ctest -T test --output-on-failure @@ -25,7 +25,7 @@ FROM ghcr.io/rse-ops/gcc-ubuntu-20.04:gcc-9.4.0 AS gcc9 ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DENABLE_OPENMP=On .. && \ +RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DENABLE_OPENMP=On -DRAJA_PERFSUITE_ENABLE_TESTS=On .. && \ make -j 6 &&\ ctest -T test --output-on-failure @@ -33,7 +33,7 @@ FROM ghcr.io/rse-ops/gcc-ubuntu-20.04:gcc-11.2.0 AS gcc11 ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DENABLE_OPENMP=On .. && \ +RUN cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DENABLE_OPENMP=On -DRAJA_PERFSUITE_ENABLE_TESTS=On .. && \ make -j 6 &&\ ctest -T test --output-on-failure @@ -41,7 +41,7 @@ FROM ghcr.io/rse-ops/clang-ubuntu-20.04:llvm-11.0.0 AS clang11 ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=clang++ .. && \ +RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DRAJA_PERFSUITE_ENABLE_TESTS=On .. && \ make -j 6 &&\ ctest -T test --output-on-failure @@ -49,7 +49,7 @@ FROM ghcr.io/rse-ops/clang-ubuntu-20.04:llvm-11.0.0 AS clang11-debug ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Debug .. && \ +RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Debug -DRAJA_PERFSUITE_ENABLE_TESTS=On .. && \ make -j 6 &&\ ctest -T test --output-on-failure @@ -57,7 +57,7 @@ FROM ghcr.io/rse-ops/clang-ubuntu-22.04:llvm-13.0.0 AS clang13 ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release .. && \ +RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release -DRAJA_PERFSUITE_ENABLE_TESTS=On .. 
&& \ make -j 6 &&\ ctest -T test --output-on-failure From 9a3d184e7f4544e96b094423be2cb9076d60b470 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 19 Apr 2022 09:45:19 -0700 Subject: [PATCH 347/392] enable test on gitlab and attempt to fix azure clang link issue --- scripts/gitlab/build_and_test.sh | 46 +++++++++++++++++++++++--------- test/CMakeLists.txt | 2 +- 2 files changed, 35 insertions(+), 13 deletions(-) diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index 722297930..fbd8df2aa 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -164,10 +164,6 @@ then date fi -echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" -echo "~~~~~ RUNNING RAJAPERF SUITE" -echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - if [[ ! -d ${build_dir} ]] then echo "ERROR: Build directory not found : ${build_dir}" && exit 1 @@ -175,35 +171,61 @@ fi cd ${build_dir} +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +echo "~~~~~ TESTING RAJAPERF SUITE" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + if grep -q -i "ENABLE_TESTS.*ON" ${hostconfig_path} then + + # + # Maintaining separate, but identical release and debug sections + # in case we want to make them disctinct in the future. + # + if echo ${sys_type} | grep -q "blueos" && echo ${spec} | grep -q "cuda" ; then if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} then - lrun -n1 --smpiargs="-disable_gpu_hooks" ./bin/raja-perf.exe -sp echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" - echo "lrun -n1 --smpiargs='-disable_gpu_hooks' /bin/raja-perf.exe -sp" + echo "lrun -n1 --smpiargs='-disable_gpu_hooks' ctest --output-on-failure -T test 2>&1 | tee tests_output.txt" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + lrun -n1 --smpiargs="-disable_gpu_hooks" ctest --output-on-failure -T test 2>&1 | tee tests_output.txt else - lrun -n1 --smpiargs="-disable_gpu_hooks" ./bin/raja-perf.exe --checkrun -sp echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" - echo "lrun -n1 --smpiargs='-disable_gpu_hook' ./bin/raja-perf.exe --checkrun -sp" + echo "lrun -n1 --smpiargs='-disable_gpu_hooks' ctest --output-on-failure -T test 2>&1 | tee tests_output.txt" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + lrun -n1 --smpiargs="-disable_gpu_hooks" ctest --output-on-failure -T test 2>&1 | tee tests_output.txt fi else if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} then - ./bin/raja-perf.exe -sp echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" - echo "./bin/raja-perf.exe -sp" + echo "ctest --output-on-failure -T test 2>&1 | tee tests_output.txt" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + ctest --output-on-failure -T test 2>&1 | tee tests_output.txt else - ./bin/raja-perf.exe --checkrun 10 -sp echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" - echo "./bin/raja-perf.exe --checkrun 10 -sp" + echo "ctest --output-on-failure -T test 2>&1 | tee tests_output.txt" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + ctest --output-on-failure -T test 2>&1 | tee tests_output.txt fi fi + + no_test_str="No tests were found!!!" 
+ if [[ "$(tail -n 1 tests_output.txt)" == "${no_test_str}" ]] + then + echo "ERROR: No tests were found" && exit 1 + fi + + echo "Copying Testing xml reports for export" + tree Testing + xsltproc -o junit.xml ${project_dir}/blt/tests/ctest-to-junit.xsl Testing/*/Test.xml + mv junit.xml ${project_dir}/junit.xml + + if grep -q "Errors while running CTest" ./tests_output.txt + then + echo "ERROR: failure(s) while running CTest" && exit 1 + fi fi echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 94284e28d..84306a20e 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -16,7 +16,7 @@ set(RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS polybench stream algorithm) -list(APPEND RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS ${RAJA_PERFSUITE_DEPENDS}) +#list(APPEND RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS ${RAJA_PERFSUITE_DEPENDS}) raja_add_test( NAME test-raja-perf-suite From 2883cb653d59df75b376596bf73bc0d64251b3fa Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Tue, 19 Apr 2022 13:11:06 -0400 Subject: [PATCH 348/392] Update src/basic/REDUCE_STRUCT-OMP.cpp Co-authored-by: Jason Burmark --- src/basic/REDUCE_STRUCT-OMP.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/basic/REDUCE_STRUCT-OMP.cpp b/src/basic/REDUCE_STRUCT-OMP.cpp index 4f97b4bd2..057bcad29 100644 --- a/src/basic/REDUCE_STRUCT-OMP.cpp +++ b/src/basic/REDUCE_STRUCT-OMP.cpp @@ -127,9 +127,9 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t points.SetCenter(static_cast(xsum.get()/(points.N)), static_cast(ysum.get()/(points.N))); - points.SetXMin(static_cast(xmin.get())); + points.SetXMin(static_cast(xmin.get())); points.SetXMax(static_cast(xmax.get())); - points.SetYMin(static_cast(ymin.get())); + points.SetYMin(static_cast(ymin.get())); points.SetYMax(static_cast(ymax.get())); m_points=points; From cc1e5941e70381bded9564c623f1e83e4666e36d Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Tue, 19 Apr 2022 13:11:12 -0400 Subject: [PATCH 349/392] Update src/basic/REDUCE_STRUCT-Seq.cpp Co-authored-by: Jason Burmark --- src/basic/REDUCE_STRUCT-Seq.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/basic/REDUCE_STRUCT-Seq.cpp b/src/basic/REDUCE_STRUCT-Seq.cpp index 36df2b610..a3bd18b84 100644 --- a/src/basic/REDUCE_STRUCT-Seq.cpp +++ b/src/basic/REDUCE_STRUCT-Seq.cpp @@ -114,9 +114,9 @@ void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune points.SetCenter(static_cast(xsum.get()/(points.N)), static_cast(ysum.get()/(points.N))); - points.SetXMin(static_cast(xmin.get())); + points.SetXMin(static_cast(xmin.get())); points.SetXMax(static_cast(xmax.get())); - points.SetYMin(static_cast(ymin.get())); + points.SetYMin(static_cast(ymin.get())); points.SetYMax(static_cast(ymax.get())); m_points=points; From 5130116e028965818681c18929efcbf49386b1fe Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Tue, 19 Apr 2022 13:14:04 -0400 Subject: [PATCH 350/392] removing redundant static_cast in REDUCE_STRUCT cuda variant --- src/basic/REDUCE_STRUCT-Cuda.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index 32bd39234..52bec116d 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -172,12 +172,12 @@ void 
REDUCE_STRUCT::runCudaVariantImpl(VariantID vid) REDUCE_STRUCT_BODY_RAJA; }); - points.SetCenter(static_cast(xsum.get()/(points.N)), - static_cast(ysum.get()/(points.N))); - points.SetXMin(static_cast(xmin.get())); - points.SetXMax(static_cast(xmax.get())); - points.SetYMin(static_cast(ymin.get())); - points.SetYMax(static_cast(ymax.get())); + points.SetCenter((xsum.get()/(points.N)), + (ysum.get()/(points.N))); + points.SetXMin((xmin.get())); + points.SetXMax((xmax.get())); + points.SetYMin((ymin.get())); + points.SetYMax((ymax.get())); m_points=points; } From 8bd8dcf1f0fc21d937855d7c27cf2ab3f62bf463 Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Tue, 19 Apr 2022 13:14:53 -0400 Subject: [PATCH 351/392] removing redundant static_cast in REDUCE_STRUCT hip variant --- src/basic/REDUCE_STRUCT-Hip.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index a2c8aa733..f72306107 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -174,12 +174,12 @@ void REDUCE_STRUCT::runHipVariantImpl(VariantID vid) REDUCE_STRUCT_BODY_RAJA; }); - points.SetCenter(static_cast(xsum.get()/(points.N)), - static_cast(ysum.get()/(points.N))); - points.SetXMin(static_cast(xmin.get())); - points.SetXMax(static_cast(xmax.get())); - points.SetYMin(static_cast(ymin.get())); - points.SetYMax(static_cast(ymax.get())); + points.SetCenter((xsum.get()/(points.N)), + (ysum.get()/(points.N))); + points.SetXMin((xmin.get())); + points.SetXMax((xmax.get())); + points.SetYMin((ymin.get())); + points.SetYMax((ymax.get())); m_points=points; } From 4de29710545952e2848c06647dbd0bc520c427e2 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 19 Apr 2022 10:18:39 -0700 Subject: [PATCH 352/392] Turn on openmp for azure clang builds --- Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 833f35fec..43f71466a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -41,7 +41,7 @@ FROM ghcr.io/rse-ops/clang-ubuntu-20.04:llvm-11.0.0 AS clang11 ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DRAJA_PERFSUITE_ENABLE_TESTS=On .. && \ +RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DENABLE_OPENMP=On -DRAJA_PERFSUITE_ENABLE_TESTS=On .. && \ make -j 6 &&\ ctest -T test --output-on-failure @@ -49,7 +49,7 @@ FROM ghcr.io/rse-ops/clang-ubuntu-20.04:llvm-11.0.0 AS clang11-debug ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Debug -DRAJA_PERFSUITE_ENABLE_TESTS=On .. && \ +RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Debug -DENABLE_OPENMP=On -DRAJA_PERFSUITE_ENABLE_TESTS=On .. && \ make -j 6 &&\ ctest -T test --output-on-failure @@ -57,7 +57,7 @@ FROM ghcr.io/rse-ops/clang-ubuntu-22.04:llvm-13.0.0 AS clang13 ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release -DRAJA_PERFSUITE_ENABLE_TESTS=On .. && \ +RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release -DENABLE_OPENMP=On -DRAJA_PERFSUITE_ENABLE_TESTS=On .. 
&& \ make -j 6 &&\ ctest -T test --output-on-failure From 47ef38c4d623ae4bf629ab8d943e5edf6429ad0c Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 19 Apr 2022 10:56:45 -0700 Subject: [PATCH 353/392] Refine use of target dependencies based on suggestion --- test/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 84306a20e..a108b37ed 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -6,7 +6,8 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -include_directories(../src) +#include_directories(../src) +target_include_directories(test-raja-perf-suite PUBLIC ${CMAKE_SOURCE_DIR}/src) set(RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS common From 63ca09ef1b8c917b6abae2d5f5dcdef36144c84d Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 19 Apr 2022 11:06:18 -0700 Subject: [PATCH 354/392] Backout change to test target include directories --- test/CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index a108b37ed..84306a20e 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -6,8 +6,7 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -#include_directories(../src) -target_include_directories(test-raja-perf-suite PUBLIC ${CMAKE_SOURCE_DIR}/src) +include_directories(../src) set(RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS common From be97f5bb4257fdc9ca648ea332f83298bf2a43e9 Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Tue, 19 Apr 2022 14:28:44 -0400 Subject: [PATCH 355/392] removing redundant static_cast in REDUCE_STRUCT OMP variant --- src/basic/REDUCE_STRUCT-OMP.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/basic/REDUCE_STRUCT-OMP.cpp b/src/basic/REDUCE_STRUCT-OMP.cpp index 057bcad29..ec60e6919 100644 --- a/src/basic/REDUCE_STRUCT-OMP.cpp +++ b/src/basic/REDUCE_STRUCT-OMP.cpp @@ -125,12 +125,12 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t REDUCE_STRUCT_BODY_RAJA; }); - points.SetCenter(static_cast(xsum.get()/(points.N)), - static_cast(ysum.get()/(points.N))); - points.SetXMin(static_cast(xmin.get())); - points.SetXMax(static_cast(xmax.get())); - points.SetYMin(static_cast(ymin.get())); - points.SetYMax(static_cast(ymax.get())); + points.SetCenter((xsum.get()/(points.N)), + (ysum.get()/(points.N))); + points.SetXMin((xmin.get())); + points.SetXMax((xmax.get())); + points.SetYMin((ymin.get())); + points.SetYMax((ymax.get())); m_points=points; } From 9106e12942cf021eb4eed365b42c89f9052eb3d0 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 19 Apr 2022 11:32:23 -0700 Subject: [PATCH 356/392] load llvm module for clang azure tests (needed for openmp) --- Dockerfile | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 43f71466a..2f2af7c00 100644 --- a/Dockerfile +++ b/Dockerfile @@ -41,7 +41,8 @@ FROM ghcr.io/rse-ops/clang-ubuntu-20.04:llvm-11.0.0 AS clang11 ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DENABLE_OPENMP=On -DRAJA_PERFSUITE_ENABLE_TESTS=On .. && \ +RUN . /opt/spack/share/spack/setup-env.sh && spack load llvm && \ + cmake -DCMAKE_CXX_COMPILER=clang++ -DENABLE_OPENMP=On -DRAJA_PERFSUITE_ENABLE_TESTS=On .. 
&& \ make -j 6 &&\ ctest -T test --output-on-failure @@ -49,7 +50,8 @@ FROM ghcr.io/rse-ops/clang-ubuntu-20.04:llvm-11.0.0 AS clang11-debug ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Debug -DENABLE_OPENMP=On -DRAJA_PERFSUITE_ENABLE_TESTS=On .. && \ +RUN . /opt/spack/share/spack/setup-env.sh && spack load llvm && \ + cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Debug -DENABLE_OPENMP=On -DRAJA_PERFSUITE_ENABLE_TESTS=On .. && \ make -j 6 &&\ ctest -T test --output-on-failure @@ -57,7 +59,8 @@ FROM ghcr.io/rse-ops/clang-ubuntu-22.04:llvm-13.0.0 AS clang13 ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release -DENABLE_OPENMP=On -DRAJA_PERFSUITE_ENABLE_TESTS=On .. && \ +RUN . /opt/spack/share/spack/setup-env.sh && spack load llvm && \ + cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release -DENABLE_OPENMP=On -DRAJA_PERFSUITE_ENABLE_TESTS=On .. && \ make -j 6 &&\ ctest -T test --output-on-failure From 7b08494e980119bc0604881a1ea14d8a06395784 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 19 Apr 2022 12:29:24 -0700 Subject: [PATCH 357/392] Remove the debug test, since it can take very long and isn't something we are that concerned about for testing. --- Dockerfile | 9 --------- 1 file changed, 9 deletions(-) diff --git a/Dockerfile b/Dockerfile index 2f2af7c00..e163cab66 100644 --- a/Dockerfile +++ b/Dockerfile @@ -46,15 +46,6 @@ RUN . /opt/spack/share/spack/setup-env.sh && spack load llvm && \ make -j 6 &&\ ctest -T test --output-on-failure -FROM ghcr.io/rse-ops/clang-ubuntu-20.04:llvm-11.0.0 AS clang11-debug -ENV GTEST_COLOR=1 -COPY . /home/raja/workspace -WORKDIR /home/raja/workspace/build -RUN . /opt/spack/share/spack/setup-env.sh && spack load llvm && \ - cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Debug -DENABLE_OPENMP=On -DRAJA_PERFSUITE_ENABLE_TESTS=On .. && \ - make -j 6 &&\ - ctest -T test --output-on-failure - FROM ghcr.io/rse-ops/clang-ubuntu-22.04:llvm-13.0.0 AS clang13 ENV GTEST_COLOR=1 COPY . /home/raja/workspace From 4713db293f7399e0ffbfd3cf7cc076782ac09831 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 19 Apr 2022 12:52:51 -0700 Subject: [PATCH 358/392] Fix issues with which checks to build and run --- Dockerfile | 8 ++++++++ azure-pipelines.yml | 8 ++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index e163cab66..35f08e0a2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -46,6 +46,14 @@ RUN . /opt/spack/share/spack/setup-env.sh && spack load llvm && \ make -j 6 &&\ ctest -T test --output-on-failure +FROM ghcr.io/rse-ops/clang-ubuntu-20.04:llvm-11.0.0 AS clang11-debug +ENV GTEST_COLOR=1 +COPY . /home/raja/workspace +WORKDIR /home/raja/workspace/build +RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Debug .. && \ + make -j 6 &&\ + ctest -T test --output-on-failure + FROM ghcr.io/rse-ops/clang-ubuntu-22.04:llvm-13.0.0 AS clang13 ENV GTEST_COLOR=1 COPY . 
/home/raja/workspace diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 027b6264a..cfd56b94e 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -51,16 +51,16 @@ jobs: docker_target: gcc11 clang11: docker_target: clang11 - clang11-debug: - docker_target: clang11-debug +## clang11-debug: +## docker_target: clang11-debug clang13: docker_target: clang13 nvcc10: docker_target: nvcc10 nvcc11: docker_target: nvcc11 - nvcc11-debug: - docker_target: nvcc11-debug +## nvcc11-debug: +## docker_target: nvcc11-debug hip: docker_target: hip # sycl: From efc4fce7d17a6b66f0c8d70cef8f9efc3a6400c4 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 19 Apr 2022 13:04:35 -0700 Subject: [PATCH 359/392] Remove unnecessary static casts --- src/basic/REDUCE_STRUCT-OMPTarget.cpp | 12 ++++++------ src/basic/REDUCE_STRUCT-Seq.cpp | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/basic/REDUCE_STRUCT-OMPTarget.cpp b/src/basic/REDUCE_STRUCT-OMPTarget.cpp index da27b4c14..baa2b67d2 100644 --- a/src/basic/REDUCE_STRUCT-OMPTarget.cpp +++ b/src/basic/REDUCE_STRUCT-OMPTarget.cpp @@ -101,12 +101,12 @@ void REDUCE_STRUCT::runOpenMPTargetVariant(VariantID vid) REDUCE_STRUCT_BODY_RAJA; }); - points.SetCenter(static_cast(xsum.get()/(points.N)), - static_cast(ysum.get()/(points.N))); - points.SetXMin(static_cast(xmin.get())); - points.SetXMax(static_cast(xmax.get())); - points.SetYMin(static_cast(ymin.get())); - points.SetYMax(static_cast(ymax.get())); + points.SetCenter(xsum.get()/(points.N), + ysum.get()/(points.N)); + points.SetXMin(xmin.get()); + points.SetXMax(xmax.get()); + points.SetYMin(ymin.get()); + points.SetYMax(ymax.get()); m_points=points; } diff --git a/src/basic/REDUCE_STRUCT-Seq.cpp b/src/basic/REDUCE_STRUCT-Seq.cpp index a3bd18b84..71fe7a471 100644 --- a/src/basic/REDUCE_STRUCT-Seq.cpp +++ b/src/basic/REDUCE_STRUCT-Seq.cpp @@ -112,12 +112,12 @@ void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune REDUCE_STRUCT_BODY_RAJA; }); - points.SetCenter(static_cast(xsum.get()/(points.N)), - static_cast(ysum.get()/(points.N))); - points.SetXMin(static_cast(xmin.get())); - points.SetXMax(static_cast(xmax.get())); - points.SetYMin(static_cast(ymin.get())); - points.SetYMax(static_cast(ymax.get())); + points.SetCenter(xsum.get()/(points.N), + ysum.get()/(points.N)); + points.SetXMin(xmin.get()); + points.SetXMax(xmax.get()); + points.SetYMin(ymin.get()); + points.SetYMax(ymax.get()); m_points=points; } From c25b96bd69f182c2d8e3cdd08a5251ece515b29b Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Tue, 19 Apr 2022 16:12:02 -0400 Subject: [PATCH 360/392] updating how kernel variables are initialization in test description --- src/basic/REDUCE_STRUCT.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/basic/REDUCE_STRUCT.hpp b/src/basic/REDUCE_STRUCT.hpp index 5efae03fd..b1d188ca1 100644 --- a/src/basic/REDUCE_STRUCT.hpp +++ b/src/basic/REDUCE_STRUCT.hpp @@ -9,9 +9,9 @@ /// /// REDUCE_STRUCT kernel reference implementation: /// -/// Real_type xsum; Real_type ysum; -/// Real_type xmin; Real_type ymin; -/// Real_type xmax; Real_type ymax; +/// Real_type xsum = m_sum_init; Real_type ysum = m_sum_init; +/// Real_type xmin = m_min_init; Real_type ymin = m_min_init; +/// Real_type xmax = m_max_init; Real_type ymax = m_max_init; /// /// for (Index_type i = ibegin; i < iend; ++i ) { From 7cc7511e4423678789283fad52f9e2ab4c775e69 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: 
Tue, 19 Apr 2022 14:23:39 -0700 Subject: [PATCH 361/392] Attempt to get ctest to run properly with disable gpu hooks lrun option. --- scripts/gitlab/build_and_test.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index fbd8df2aa..324f964ad 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -187,14 +187,14 @@ then if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} then echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" - echo "lrun -n1 --smpiargs='-disable_gpu_hooks' ctest --output-on-failure -T test 2>&1 | tee tests_output.txt" + echo "lrun -n1 ... ctest --output-on-failure -T test" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - lrun -n1 --smpiargs="-disable_gpu_hooks" ctest --output-on-failure -T test 2>&1 | tee tests_output.txt + lrun -n1 --smpiargs="-disable_gpu_hooks" ctest --output-on-failure -T test else echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~" - echo "lrun -n1 --smpiargs='-disable_gpu_hooks' ctest --output-on-failure -T test 2>&1 | tee tests_output.txt" + echo "lrun -n1 ... ctest --output-on-failure -T test" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - lrun -n1 --smpiargs="-disable_gpu_hooks" ctest --output-on-failure -T test 2>&1 | tee tests_output.txt + lrun -n1 --smpiargs="-disable_gpu_hooks" ctest --output-on-failure -T test fi else if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path} From 08e0456d4783ed893f2ca893bd6d4882bd6e65ec Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 19 Apr 2022 15:08:14 -0700 Subject: [PATCH 362/392] enable raja perf test in gitlab CI --- scripts/gitlab/build_and_test.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index 324f964ad..9dbdb7bc1 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -149,6 +149,7 @@ then cmake \ -C ${hostconfig_path} \ + -DRAJA_PERFSUITE_ENABLE_TESTS=On \ ${project_dir} if echo ${spec} | grep -q "intel" ; then cmake --build . -j 16 From 8419440772311e59b31b2c169102b84e4804dc52 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 19 Apr 2022 17:00:36 -0700 Subject: [PATCH 363/392] Update checksum Weigh each element in an array differently between 0.5 to 1.5. Use kahan sum to reduce error in checksum calculation. 
--- src/common/DataUtils.cpp | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index 2578661f8..6856d1f6c 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -337,14 +337,21 @@ long double calcChecksum(const Int_ptr ptr, int len, Real_type scale_factor) { long double tchk = 0.0; + long double ckahan = 0.0; for (Index_type j = 0; j < len; ++j) { - tchk += (j+1)*ptr[j]*scale_factor; + long double x = (std::abs(std::sin(j+1.0))+0.5) * ptr[j]; + long double y = x - ckahan; + volatile long double t = tchk + y; + volatile long double z = t - tchk; + ckahan = z - y; + tchk = t; #if 0 // RDH DEBUG if ( (j % 100) == 0 ) { getCout() << "j : tchk = " << j << " : " << tchk << std::endl; } #endif } + tchk *= scale_factor; return tchk; } @@ -352,14 +359,21 @@ long double calcChecksum(const Real_ptr ptr, int len, Real_type scale_factor) { long double tchk = 0.0; + long double ckahan = 0.0; for (Index_type j = 0; j < len; ++j) { - tchk += (j+1)*ptr[j]*scale_factor; + long double x = (std::abs(std::sin(j+1.0))+0.5) * ptr[j]; + long double y = x - ckahan; + volatile long double t = tchk + y; + volatile long double z = t - tchk; + ckahan = z - y; + tchk = t; #if 0 // RDH DEBUG if ( (j % 100) == 0 ) { getCout() << "j : tchk = " << j << " : " << tchk << std::endl; } #endif } + tchk *= scale_factor; return tchk; } @@ -367,14 +381,21 @@ long double calcChecksum(const Complex_ptr ptr, int len, Real_type scale_factor) { long double tchk = 0.0; + long double ckahan = 0.0; for (Index_type j = 0; j < len; ++j) { - tchk += (j+1)*(real(ptr[j])+imag(ptr[j]))*scale_factor; + long double x = (std::abs(std::sin(j+1.0))+0.5) * (real(ptr[j])+imag(ptr[j])); + long double y = x - ckahan; + volatile long double t = tchk + y; + volatile long double z = t - tchk; + ckahan = z - y; + tchk = t; #if 0 // RDH DEBUG if ( (j % 100) == 0 ) { getCout() << "j : tchk = " << j << " : " << tchk << std::endl; } #endif } + tchk *= scale_factor; return tchk; } From e3bb1c8f2829ff6bf3d0ef47f4f51655be36fefc Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 19 Apr 2022 17:01:48 -0700 Subject: [PATCH 364/392] Remove factor of problem size in SCAN checksum scale factor --- src/algorithm/SCAN.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/algorithm/SCAN.cpp b/src/algorithm/SCAN.cpp index 07734ceb4..7a4d9091c 100644 --- a/src/algorithm/SCAN.cpp +++ b/src/algorithm/SCAN.cpp @@ -31,11 +31,10 @@ SCAN::SCAN(const RunParams& params) setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() ); setFLOPsPerRep(1 * getActualProblemSize()); - Checksum_type actualProblemSize = getActualProblemSize(); checksum_scale_factor = 1e-2 * ( static_cast(getDefaultProblemSize()) / getActualProblemSize() ) / - ( actualProblemSize * (actualProblemSize + 1) / 2 ); + getActualProblemSize(); setUsesFeature(Scan); From db7e7feb1bdd29afe575e849e67fd9145efed8ca Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 20 Apr 2022 09:52:45 -0700 Subject: [PATCH 365/392] Try enabling tests in spack package. 
--- scripts/gitlab/build_and_test.sh | 1 - scripts/spack_packages/raja_perf/package.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index 9dbdb7bc1..324f964ad 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -149,7 +149,6 @@ then cmake \ -C ${hostconfig_path} \ - -DRAJA_PERFSUITE_ENABLE_TESTS=On \ ${project_dir} if echo ${spec} | grep -q "intel" ; then cmake --build . -j 16 diff --git a/scripts/spack_packages/raja_perf/package.py b/scripts/spack_packages/raja_perf/package.py index e7138a562..ce78bb9e9 100644 --- a/scripts/spack_packages/raja_perf/package.py +++ b/scripts/spack_packages/raja_perf/package.py @@ -341,7 +341,7 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): cfg.write(cmake_cache_option("ENABLE_OPENMP","+openmp" in spec)) cfg.write(cmake_cache_option("ENABLE_BENCHMARKS", 'tests=benchmarks' in spec)) - cfg.write(cmake_cache_option("ENABLE_TESTS", not 'tests=none' in spec or self.run_tests)) + cfg.write(cmake_cache_option("RAJA_PERFSUITE_ENABLE_TESTS", not 'tests=none' in spec or self.run_tests)) ####################### # Close and save From a29845fac3ed4afa6867e28716c0ea7fbbc95931 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 20 Apr 2022 10:33:45 -0700 Subject: [PATCH 366/392] Try eabling tests for CUDA and HIP Gitlab CI --- scripts/spack_packages/raja_perf/package.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/spack_packages/raja_perf/package.py b/scripts/spack_packages/raja_perf/package.py index ce78bb9e9..1fad534c3 100644 --- a/scripts/spack_packages/raja_perf/package.py +++ b/scripts/spack_packages/raja_perf/package.py @@ -245,7 +245,7 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): cudacompiler)) cfg.write(cmake_cache_string("BLT_CXX_STD", "c++14")) - cfg.write(cmake_cache_option("ENABLE_TESTS", False)) + cfg.write(cmake_cache_option("RAJA_PERFSUITE_ENABLE_TESTS", not 'tests=none' in spec or self.run_tests)) if ("xl" in cpp_compiler): cfg.write(cmake_cache_entry("CMAKE_CUDA_FLAGS", "-Xcompiler -O3 -Xcompiler -qxlcompatmacros -Xcompiler -qalias=noansi " + @@ -282,7 +282,7 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): cfg.write("#------------------{0}\n\n".format("-" * 60)) cfg.write(cmake_cache_option("ENABLE_HIP", True)) - cfg.write(cmake_cache_option("ENABLE_TESTS", False)) + cfg.write(cmake_cache_option("RAJA_PERFSUITE_ENABLE_TESTS", not 'tests=none' in spec or self.run_tests)) hip_root = spec['hip'].prefix rocm_root = hip_root + "/.." 
From 35f2ce834adc383d03083c590545d3c0cc01ba27 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 20 Apr 2022 10:50:08 -0700 Subject: [PATCH 367/392] enable tests in spack package and change logic in to-level cmakelists file --- CMakeLists.txt | 9 +++------ scripts/spack_packages/raja_perf/package.py | 6 +++--- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fe768a8e5..b504bfc14 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,20 +33,17 @@ include(blt/SetupBLT.cmake) #cmake_dependent_option(RAJA_PERFSUITE_ENABLE_TESTS "Enable RAJA Perf Suite Tests" On "ENABLE_TESTS" Off) -if (RAJA_PERFSUITE_ENABLE_TESTS) +if (ENABLE_TESTS) - set(ENABLE_TESTS On CACHE BOOL "Enable testing for RAJA Perf Suite") + set(RAJA_PERFSUITE_ENABLE_TESTS On CACHE BOOL "Enable testing for RAJA Perf Suite") set(RAJA_ENABLE_TESTS Off CACHE BOOL "") set(CAMP_ENABLE_TESTS Off CACHE BOOL "") set(BLT_ENABLE_TESTS Off CACHE BOOL "") -else() +endif() -set(ENABLE_TESTS Off CACHE BOOL "Enable BLT, camp, and RAJA tests") set(ENABLE_EXAMPLES Off CACHE BOOL "Enable RAJA examples") -endif() - cmake_dependent_option(RAJA_PERFSUITE_ENABLE_MPI "Build with MPI" On "ENABLE_MPI" Off) cmake_dependent_option(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN "Build OpenMP scan variants" Off "ENABLE_OPENMP" Off) diff --git a/scripts/spack_packages/raja_perf/package.py b/scripts/spack_packages/raja_perf/package.py index 1fad534c3..7c7aa8ac7 100644 --- a/scripts/spack_packages/raja_perf/package.py +++ b/scripts/spack_packages/raja_perf/package.py @@ -245,7 +245,7 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): cudacompiler)) cfg.write(cmake_cache_string("BLT_CXX_STD", "c++14")) - cfg.write(cmake_cache_option("RAJA_PERFSUITE_ENABLE_TESTS", not 'tests=none' in spec or self.run_tests)) + cfg.write(cmake_cache_option("ENABLE_TESTS", not 'tests=none' in spec or self.run_tests)) if ("xl" in cpp_compiler): cfg.write(cmake_cache_entry("CMAKE_CUDA_FLAGS", "-Xcompiler -O3 -Xcompiler -qxlcompatmacros -Xcompiler -qalias=noansi " + @@ -282,7 +282,7 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): cfg.write("#------------------{0}\n\n".format("-" * 60)) cfg.write(cmake_cache_option("ENABLE_HIP", True)) - cfg.write(cmake_cache_option("RAJA_PERFSUITE_ENABLE_TESTS", not 'tests=none' in spec or self.run_tests)) + cfg.write(cmake_cache_option("ENABLE_TESTS", not 'tests=none' in spec or self.run_tests)) hip_root = spec['hip'].prefix rocm_root = hip_root + "/.." 
@@ -341,7 +341,7 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): cfg.write(cmake_cache_option("ENABLE_OPENMP","+openmp" in spec)) cfg.write(cmake_cache_option("ENABLE_BENCHMARKS", 'tests=benchmarks' in spec)) - cfg.write(cmake_cache_option("RAJA_PERFSUITE_ENABLE_TESTS", not 'tests=none' in spec or self.run_tests)) + cfg.write(cmake_cache_option("ENABLE_TESTS", not 'tests=none' in spec or self.run_tests)) ####################### # Close and save From a800bb4d932fd655e29e9d49077ea5b37d10c78a Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 20 Apr 2022 11:14:52 -0700 Subject: [PATCH 368/392] remove line to disable blt tests --- CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b504bfc14..6d9de710d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -38,7 +38,6 @@ if (ENABLE_TESTS) set(RAJA_PERFSUITE_ENABLE_TESTS On CACHE BOOL "Enable testing for RAJA Perf Suite") set(RAJA_ENABLE_TESTS Off CACHE BOOL "") set(CAMP_ENABLE_TESTS Off CACHE BOOL "") - set(BLT_ENABLE_TESTS Off CACHE BOOL "") endif() From adb404546d2d1d09e167e9433acb017fdf0406b3 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 20 Apr 2022 14:25:57 -0700 Subject: [PATCH 369/392] Try fixing the test dependency issues, not that other things are cleaner --- Dockerfile | 2 +- test/CMakeLists.txt | 28 +++++++++++++++++----------- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/Dockerfile b/Dockerfile index 35f08e0a2..4740fabb9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,7 +9,7 @@ FROM ghcr.io/rse-ops/gcc-ubuntu-20.04:gcc-7.3.0 AS gcc7 ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DENABLE_OPENMP=On -DRAJA_PERFSUITE_ENABLE_TESTS=On .. && \ +RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DENABLE_OPENMP=On .. 
&& \ make -j 6 &&\ ctest -T test --output-on-failure diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 84306a20e..df570873e 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -8,19 +8,25 @@ include_directories(../src) -set(RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS - common - apps - basic - lcals - polybench - stream - algorithm) -#list(APPEND RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS ${RAJA_PERFSUITE_DEPENDS}) - +#set(RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS +# common +# apps +# basic +# lcals +# polybench +# stream +# algorithm) +##list(APPEND RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS ${RAJA_PERFSUITE_DEPENDS}) +# +#raja_add_test( +# NAME test-raja-perf-suite +# SOURCES test-raja-perf-suite.cpp +# DEPENDS_ON ${RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS} +# ) + raja_add_test( NAME test-raja-perf-suite SOURCES test-raja-perf-suite.cpp - DEPENDS_ON ${RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS} + DEPENDS_ON ${RAJA_PERFSUITE_DEPENDS} ) From ae7e1da9ff75b3868047c99cb87cd2a2d11c3645 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 20 Apr 2022 15:10:21 -0700 Subject: [PATCH 370/392] Correcting test dependencies --- test/CMakeLists.txt | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index df570873e..b0bd0884d 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -8,25 +8,18 @@ include_directories(../src) -#set(RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS -# common -# apps -# basic -# lcals -# polybench -# stream -# algorithm) -##list(APPEND RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS ${RAJA_PERFSUITE_DEPENDS}) -# -#raja_add_test( -# NAME test-raja-perf-suite -# SOURCES test-raja-perf-suite.cpp -# DEPENDS_ON ${RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS} -# ) +set(RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS + common + apps + basic + lcals + polybench + stream + algorithm) +list(APPEND RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS ${RAJA_PERFSUITE_DEPENDS}) raja_add_test( NAME test-raja-perf-suite SOURCES test-raja-perf-suite.cpp - DEPENDS_ON ${RAJA_PERFSUITE_DEPENDS} + DEPENDS_ON ${RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS} ) - From c693d6eead22728c3f23b4bcc269601618d99e4d Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 20 Apr 2022 16:11:37 -0700 Subject: [PATCH 371/392] Remove cmake arg to enable tests, no longer needed. --- Dockerfile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index 4740fabb9..037f59b60 100644 --- a/Dockerfile +++ b/Dockerfile @@ -17,7 +17,7 @@ FROM ghcr.io/rse-ops/gcc-ubuntu-20.04:gcc-8.1.0 AS gcc8 ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_WARNINGS_AS_ERRORS=On -DENABLE_OPENMP=On -DRAJA_PERFSUITE_ENABLE_TESTS=On .. && \ +RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_WARNINGS_AS_ERRORS=On -DENABLE_OPENMP=On .. && \ make -j 6 &&\ ctest -T test --output-on-failure @@ -25,7 +25,7 @@ FROM ghcr.io/rse-ops/gcc-ubuntu-20.04:gcc-9.4.0 AS gcc9 ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DENABLE_OPENMP=On -DRAJA_PERFSUITE_ENABLE_TESTS=On .. && \ +RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DENABLE_OPENMP=On .. && \ make -j 6 &&\ ctest -T test --output-on-failure @@ -33,7 +33,7 @@ FROM ghcr.io/rse-ops/gcc-ubuntu-20.04:gcc-11.2.0 AS gcc11 ENV GTEST_COLOR=1 COPY . 
/home/raja/workspace WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DENABLE_OPENMP=On -DRAJA_PERFSUITE_ENABLE_TESTS=On .. && \ +RUN cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DENABLE_OPENMP=On .. && \ make -j 6 &&\ ctest -T test --output-on-failure @@ -42,7 +42,7 @@ ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN . /opt/spack/share/spack/setup-env.sh && spack load llvm && \ - cmake -DCMAKE_CXX_COMPILER=clang++ -DENABLE_OPENMP=On -DRAJA_PERFSUITE_ENABLE_TESTS=On .. && \ + cmake -DCMAKE_CXX_COMPILER=clang++ -DENABLE_OPENMP=On .. && \ make -j 6 &&\ ctest -T test --output-on-failure @@ -59,7 +59,7 @@ ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN . /opt/spack/share/spack/setup-env.sh && spack load llvm && \ - cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release -DENABLE_OPENMP=On -DRAJA_PERFSUITE_ENABLE_TESTS=On .. && \ + cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release -DENABLE_OPENMP=On .. && \ make -j 6 &&\ ctest -T test --output-on-failure From 14be5cec090d678c1bd688cbf50b6f1b4ddb7755 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 21 Apr 2022 09:55:08 -0700 Subject: [PATCH 372/392] Try fix to RAJA cmake (to turn off openmp). --- test/CMakeLists.txt | 2 +- tpl/RAJA | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index b0bd0884d..e71f5cb10 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -18,7 +18,7 @@ set(RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS algorithm) list(APPEND RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS ${RAJA_PERFSUITE_DEPENDS}) -raja_add_test( +blt_add_executable(( NAME test-raja-perf-suite SOURCES test-raja-perf-suite.cpp DEPENDS_ON ${RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS} diff --git a/tpl/RAJA b/tpl/RAJA index c21ba6316..bf31ae830 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit c21ba63167d977365137f35250d6653390ac19df +Subproject commit bf31ae830f92dc7d4f182ef6c9f479a249accd1a From 2874e958f134c21674c8d83017d25a0cfd6848c7 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 21 Apr 2022 09:58:31 -0700 Subject: [PATCH 373/392] Remove extra left-paren --- test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e71f5cb10..801f74ddc 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -18,7 +18,7 @@ set(RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS algorithm) list(APPEND RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS ${RAJA_PERFSUITE_DEPENDS}) -blt_add_executable(( +blt_add_executable( NAME test-raja-perf-suite SOURCES test-raja-perf-suite.cpp DEPENDS_ON ${RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS} From 516c54c7bdd16e72f0628171666b980c054e4fe2 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 21 Apr 2022 10:11:08 -0700 Subject: [PATCH 374/392] Fix macro call to generate test --- test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 801f74ddc..b0bd0884d 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -18,7 +18,7 @@ set(RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS algorithm) list(APPEND RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS ${RAJA_PERFSUITE_DEPENDS}) -blt_add_executable( +raja_add_test( NAME test-raja-perf-suite SOURCES test-raja-perf-suite.cpp DEPENDS_ON ${RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS} From 
23baf531c81c399813772fdff7217ddecf7d96f4 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 21 Apr 2022 12:12:40 -0700 Subject: [PATCH 375/392] add -qstrict option to xl builds to see if this fixes kernel breakage --- .gitlab/lassen-jobs.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab/lassen-jobs.yml b/.gitlab/lassen-jobs.yml index 8f2ff686d..17fd98a24 100644 --- a/.gitlab/lassen-jobs.yml +++ b/.gitlab/lassen-jobs.yml @@ -31,13 +31,13 @@ gcc_8_3_1: xl_16_1_1_11: variables: - SPEC: "%xl@16.1.1.11 cxxflags='-qthreaded -std=c++14 -O3 -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036'" + SPEC: "%xl@16.1.1.11 cxxflags='-qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036'" DEFAULT_TIME: 50 extends: .build_and_test_on_lassen xl_16_1_1_11_gcc_8_3_1: variables: - SPEC: "%xl@16.1.1.11 cxxflags='--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O3 -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036' cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1" + SPEC: "%xl@16.1.1.11 cxxflags='--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036' cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1" DEFAULT_TIME: 50 extends: .build_and_test_on_lassen From 7dfa1ed37810f612e671e417e9f86ab1c4dcd295 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 21 Apr 2022 13:27:47 -0700 Subject: [PATCH 376/392] Exclude halo exchange fused kernel when building for HIP. 
--- test/test-raja-perf-suite.cpp | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/test/test-raja-perf-suite.cpp b/test/test-raja-perf-suite.cpp index 693f7ef64..77c0ab29a 100644 --- a/test/test-raja-perf-suite.cpp +++ b/test/test-raja-perf-suite.cpp @@ -18,19 +18,24 @@ TEST(ShortSuiteTest, Basic) { - // Assemble command line args for basic test - constexpr int nargs = 4; - int argc = nargs; - - std::vector< std::string > sargv(nargs); - sargv[0] = std::string("dummy"); // for executable name - sargv[1] = std::string("--checkrun"); - sargv[2] = std::string("5"); - sargv[3] = std::string("-sp"); - - char** argv = new char* [nargs]; - for (int is = 0; is < nargs; ++is) { +// Assemble command line args for basic test + int argc = 4; +#if defined(RAJA_ENABLE_HIP) + argc = 6; +#endif + std::vector< std::string > sargv(argc); + sargv[0] = std::string("dummy "); // for executable name + sargv[1] = std::string("--checkrun "); + sargv[2] = std::string("5 "); + sargv[3] = std::string("-show-progress "); +#if defined(RAJA_ENABLE_HIP) + sargv[4] = std::string("--exclude-kernels "); + sargv[5] = std::string("HALOEXCHANGE_FUSED"); +#endif + + char** argv = new char* [argc]; + for (int is = 0; is < argc; ++is) { argv[is] = const_cast(sargv[is].c_str()); } From bad2b3b8752da528d96a709460a25a9458d87f4f Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 22 Apr 2022 10:52:33 -0700 Subject: [PATCH 377/392] enable openmp for tests --- .gitlab/lassen-jobs.yml | 22 +++++++++++----------- .gitlab/ruby-jobs.yml | 8 ++++---- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/.gitlab/lassen-jobs.yml b/.gitlab/lassen-jobs.yml index 17fd98a24..9b957797a 100644 --- a/.gitlab/lassen-jobs.yml +++ b/.gitlab/lassen-jobs.yml @@ -16,7 +16,7 @@ clang_11_0_0: variables: - SPEC: "%clang@11.0.0" + SPEC: "+openmp %clang@11.0.0" extends: .build_and_test_on_lassen #ibm_clang_9_gcc_8: @@ -26,18 +26,18 @@ clang_11_0_0: gcc_8_3_1: variables: - SPEC: "%gcc@8.3.1 cxxflags='-finline-functions -finline-limit=20000' cflags='-finline-functions -finline-limit=20000'" + SPEC: "+openmp %gcc@8.3.1 cxxflags='-finline-functions -finline-limit=20000' cflags='-finline-functions -finline-limit=20000'" extends: .build_and_test_on_lassen xl_16_1_1_11: variables: - SPEC: "%xl@16.1.1.11 cxxflags='-qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036'" + SPEC: "+openmp %xl@16.1.1.11 cxxflags='-qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036'" DEFAULT_TIME: 50 extends: .build_and_test_on_lassen xl_16_1_1_11_gcc_8_3_1: variables: - SPEC: "%xl@16.1.1.11 cxxflags='--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036' cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1" + SPEC: "+openmp %xl@16.1.1.11 cxxflags='--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036' cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1" DEFAULT_TIME: 50 extends: .build_and_test_on_lassen @@ -52,29 +52,29 @@ xl_16_1_1_11_gcc_8_3_1: clang_11_cuda: variables: - SPEC: "+cuda cuda_arch=70 %clang@11.0.0 ^cuda@10.1.168" + SPEC: "+openmp 
+cuda cuda_arch=70 %clang@11.0.0 ^cuda@10.1.168" extends: .build_and_test_on_lassen gcc_8_3_1_cuda: variables: - SPEC: "+cuda %gcc@8.3.1 cuda_arch=70 ^cuda@10.1.168" + SPEC: "+openmp +cuda %gcc@8.3.1 cuda_arch=70 ^cuda@10.1.168" extends: .build_and_test_on_lassen gcc_8_3_1_cuda_ats_disabled: variables: - SPEC: "+cuda %gcc@8.3.1 cuda_arch=70 ^cuda@10.1.168" + SPEC: "+openmp +cuda %gcc@8.3.1 cuda_arch=70 ^cuda@10.1.168" extends: .build_and_test_on_lassen_ats_disabled xl_16_1_1_7_cuda: variables: - SPEC: "+cuda %xl@16.1.1.7 cuda_arch=70 ^cuda@10.1.168 ^cmake@3.14.5" + SPEC: "+openmp +cuda %xl@16.1.1.7 cuda_arch=70 ^cuda@10.1.168 ^cmake@3.14.5" DEFAULT_TIME: 60 allow_failure: true extends: .build_and_test_on_lassen xl_16_1_1_7_gcc_8_3_1_cuda_11: variables: - SPEC: "+cuda %xl@16.1.1.7 cuda_arch=70 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 ^cuda@11.0.2 ^cmake@3.14.5" + SPEC: "+openmp +cuda %xl@16.1.1.7 cuda_arch=70 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 ^cuda@11.0.2 ^cmake@3.14.5" DEFAULT_TIME: 60 allow_failure: true extends: .build_and_test_on_lassen @@ -85,12 +85,12 @@ xl_16_1_1_7_gcc_8_3_1_cuda_11: clang_9_0_0_libcpp (build and test on lassen): variables: - SPEC: "%clang@9.0.0+libcpp" + SPEC: "+openmp %clang@9.0.0+libcpp" extends: .build_and_test_on_lassen clang_9_0_0_memleak (build and test on lassen): variables: - SPEC: "%clang@9.0.0 cxxflags=-fsanitize=address" + SPEC: "+openmp %clang@9.0.0 cxxflags=-fsanitize=address" ASAN_OPTIONS: "detect_leaks=1" extends: .build_and_test_on_lassen diff --git a/.gitlab/ruby-jobs.yml b/.gitlab/ruby-jobs.yml index ca9a8cb45..cb4214c6a 100644 --- a/.gitlab/ruby-jobs.yml +++ b/.gitlab/ruby-jobs.yml @@ -7,17 +7,17 @@ clang_10: variables: - SPEC: "%clang@10.0.1" + SPEC: "+openmp %clang@10.0.1" extends: .build_and_test_on_ruby clang_9: variables: - SPEC: "%clang@9.0.0" + SPEC: "+openmp %clang@9.0.0" extends: .build_and_test_on_ruby gcc_8_1_0: variables: - SPEC: "%gcc@8.1.0" + SPEC: "+openmp %gcc@8.1.0" DEFAULT_TIME: 60 extends: .build_and_test_on_ruby @@ -35,7 +35,7 @@ gcc_8_1_0: icpc_19_1_0: variables: - SPEC: "%intel@19.1.0" + SPEC: "+openmp %intel@19.1.0" DEFAULT_TIME: 40 extends: .build_and_test_on_ruby From dc4bca673d4ac70a9a4da136cace6cc745f28979 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 22 Apr 2022 11:34:42 -0700 Subject: [PATCH 378/392] Fix broken command line args so test actually runs --- test/test-raja-perf-suite.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/test-raja-perf-suite.cpp b/test/test-raja-perf-suite.cpp index 77c0ab29a..488ffe5d7 100644 --- a/test/test-raja-perf-suite.cpp +++ b/test/test-raja-perf-suite.cpp @@ -26,11 +26,11 @@ TEST(ShortSuiteTest, Basic) #endif std::vector< std::string > sargv(argc); sargv[0] = std::string("dummy "); // for executable name - sargv[1] = std::string("--checkrun "); - sargv[2] = std::string("5 "); - sargv[3] = std::string("-show-progress "); + sargv[1] = std::string("--checkrun"); + sargv[2] = std::string("5"); + sargv[3] = std::string("--show-progress"); #if defined(RAJA_ENABLE_HIP) - sargv[4] = std::string("--exclude-kernels "); + sargv[4] = std::string("--exclude-kernels"); sargv[5] = std::string("HALOEXCHANGE_FUSED"); #endif From f6eb1c966464f729a7592820d61ef6e31a075fb3 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 22 Apr 2022 13:43:18 -0700 Subject: [PATCH 379/392] update xl compiler for lassen ci. 
--- .gitlab/lassen-jobs.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.gitlab/lassen-jobs.yml b/.gitlab/lassen-jobs.yml index 9b957797a..305109c7d 100644 --- a/.gitlab/lassen-jobs.yml +++ b/.gitlab/lassen-jobs.yml @@ -65,16 +65,16 @@ gcc_8_3_1_cuda_ats_disabled: SPEC: "+openmp +cuda %gcc@8.3.1 cuda_arch=70 ^cuda@10.1.168" extends: .build_and_test_on_lassen_ats_disabled -xl_16_1_1_7_cuda: +xl_16_1_1_11_cuda: variables: - SPEC: "+openmp +cuda %xl@16.1.1.7 cuda_arch=70 ^cuda@10.1.168 ^cmake@3.14.5" + SPEC: "+openmp +cuda %xl@16.1.1.11 cxxflags='--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036' cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cuda_arch=70 ^cuda@10.1.168 ^cmake@3.14.5" DEFAULT_TIME: 60 allow_failure: true extends: .build_and_test_on_lassen -xl_16_1_1_7_gcc_8_3_1_cuda_11: +xl_16_1_1_11_gcc_8_3_1_cuda_11: variables: - SPEC: "+openmp +cuda %xl@16.1.1.7 cuda_arch=70 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 ^cuda@11.0.2 ^cmake@3.14.5" + SPEC: "+openmp +cuda %xl@16.1.1.11 cuda_arch=70 cxxflags'=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036' cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 ^cuda@11.0.2 ^cmake@3.14.5" DEFAULT_TIME: 60 allow_failure: true extends: .build_and_test_on_lassen From 783116b984dd94d0d9023baa1c244c3aafb50ced Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 22 Apr 2022 13:43:40 -0700 Subject: [PATCH 380/392] more precisely turn off kernels for known CI testing failures based on compiler versions --- test/test-raja-perf-suite.cpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/test/test-raja-perf-suite.cpp b/test/test-raja-perf-suite.cpp index 488ffe5d7..41d12e3e1 100644 --- a/test/test-raja-perf-suite.cpp +++ b/test/test-raja-perf-suite.cpp @@ -21,7 +21,10 @@ TEST(ShortSuiteTest, Basic) // Assemble command line args for basic test int argc = 4; -#if defined(RAJA_ENABLE_HIP) +#if ( (defined(RAJA_ENABLE_HIP) && \ + (HIP_VERSION_MAJOR < 5 || \ + (HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR < 1)) ) \ + || (defined(RAJA_COMPILER_CLANG) && __clang_major__ == 11) ) argc = 6; #endif std::vector< std::string > sargv(argc); @@ -29,10 +32,16 @@ TEST(ShortSuiteTest, Basic) sargv[1] = std::string("--checkrun"); sargv[2] = std::string("5"); sargv[3] = std::string("--show-progress"); -#if defined(RAJA_ENABLE_HIP) +#if defined(RAJA_ENABLE_HIP) && \ + (HIP_VERSION_MAJOR < 5 || \ + (HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR < 1)) sargv[4] = std::string("--exclude-kernels"); sargv[5] = std::string("HALOEXCHANGE_FUSED"); #endif +#if (defined(RAJA_COMPILER_CLANG) && __clang_major__ == 11) + sargv[4] = std::string("--exclude-kernels"); + sargv[5] = std::string("FIRST_MIN"); +#endif char** argv = new char* [argc]; for (int is = 0; is < argc; ++is) { @@ -106,7 +115,7 @@ TEST(ShortSuiteTest, Basic) << kernel->getVariantTuningName(vid, tune_idx) << std::endl; EXPECT_GT(rtime, 0.0); - EXPECT_LT(cksum_diff, 1e-5); + EXPECT_LT(cksum_diff, 1e-7); } } From 697b87e76dd153914a677bc231dffd466cfdf0a5 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 22 Apr 2022 15:06:36 -0700 Subject: [PATCH 381/392] Re-format and make clearer macro logic 
--- test/test-raja-perf-suite.cpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/test/test-raja-perf-suite.cpp b/test/test-raja-perf-suite.cpp index 41d12e3e1..60dbd7a29 100644 --- a/test/test-raja-perf-suite.cpp +++ b/test/test-raja-perf-suite.cpp @@ -21,23 +21,30 @@ TEST(ShortSuiteTest, Basic) // Assemble command line args for basic test int argc = 4; -#if ( (defined(RAJA_ENABLE_HIP) && \ - (HIP_VERSION_MAJOR < 5 || \ - (HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR < 1)) ) \ - || (defined(RAJA_COMPILER_CLANG) && __clang_major__ == 11) ) + +#if defined(RAJA_ENABLE_HIP) && \ + (HIP_VERSION_MAJOR < 5 || \ + (HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR < 1)) argc = 6; #endif + +#if (defined(RAJA_COMPILER_CLANG) && __clang_major__ == 11) + argc = 6; +#endif + std::vector< std::string > sargv(argc); sargv[0] = std::string("dummy "); // for executable name sargv[1] = std::string("--checkrun"); sargv[2] = std::string("5"); sargv[3] = std::string("--show-progress"); + #if defined(RAJA_ENABLE_HIP) && \ (HIP_VERSION_MAJOR < 5 || \ (HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR < 1)) sargv[4] = std::string("--exclude-kernels"); sargv[5] = std::string("HALOEXCHANGE_FUSED"); #endif + #if (defined(RAJA_COMPILER_CLANG) && __clang_major__ == 11) sargv[4] = std::string("--exclude-kernels"); sargv[5] = std::string("FIRST_MIN"); From fb777caec62dacd98c93b587796624aaa124d5e5 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 26 Apr 2022 14:07:32 -0700 Subject: [PATCH 382/392] Update submodule to RAJA develop --- tpl/RAJA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpl/RAJA b/tpl/RAJA index bf31ae830..3125bb70c 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit bf31ae830f92dc7d4f182ef6c9f479a249accd1a +Subproject commit 3125bb70c898b287b35d9d0f0694999e26c6dd93 From 98879efeef0470ee346916c6bf54ecf585d7c634 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 26 Apr 2022 15:06:44 -0700 Subject: [PATCH 383/392] Try backing off to -O2 for XL to see if tests pass. 
--- .gitlab/lassen-jobs.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.gitlab/lassen-jobs.yml b/.gitlab/lassen-jobs.yml index 305109c7d..ed573b27e 100644 --- a/.gitlab/lassen-jobs.yml +++ b/.gitlab/lassen-jobs.yml @@ -31,13 +31,13 @@ gcc_8_3_1: xl_16_1_1_11: variables: - SPEC: "+openmp %xl@16.1.1.11 cxxflags='-qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036'" + SPEC: "+openmp %xl@16.1.1.11 cxxflags='-qthreaded -std=c++14 -O2 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036'" DEFAULT_TIME: 50 extends: .build_and_test_on_lassen xl_16_1_1_11_gcc_8_3_1: variables: - SPEC: "+openmp %xl@16.1.1.11 cxxflags='--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036' cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1" + SPEC: "+openmp %xl@16.1.1.11 cxxflags='--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O2 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036' cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1" DEFAULT_TIME: 50 extends: .build_and_test_on_lassen @@ -67,14 +67,14 @@ gcc_8_3_1_cuda_ats_disabled: xl_16_1_1_11_cuda: variables: - SPEC: "+openmp +cuda %xl@16.1.1.11 cxxflags='--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036' cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cuda_arch=70 ^cuda@10.1.168 ^cmake@3.14.5" + SPEC: "+openmp +cuda %xl@16.1.1.11 cxxflags='--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O2 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036' cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cuda_arch=70 ^cuda@10.1.168 ^cmake@3.14.5" DEFAULT_TIME: 60 allow_failure: true extends: .build_and_test_on_lassen xl_16_1_1_11_gcc_8_3_1_cuda_11: variables: - SPEC: "+openmp +cuda %xl@16.1.1.11 cuda_arch=70 cxxflags'=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036' cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 ^cuda@11.0.2 ^cmake@3.14.5" + SPEC: "+openmp +cuda %xl@16.1.1.11 cuda_arch=70 cxxflags'=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O2 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036' cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 ^cuda@11.0.2 ^cmake@3.14.5" DEFAULT_TIME: 60 allow_failure: true extends: .build_and_test_on_lassen From 36485535dcc9d36fcf2482e261a29e1739cf9b42 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 26 Apr 2022 15:47:58 -0700 Subject: [PATCH 384/392] Try newer XL version --- .gitlab/lassen-jobs.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.gitlab/lassen-jobs.yml b/.gitlab/lassen-jobs.yml index ed573b27e..69b37d590 100644 --- a/.gitlab/lassen-jobs.yml +++ b/.gitlab/lassen-jobs.yml @@ -29,15 +29,15 @@ gcc_8_3_1: SPEC: "+openmp %gcc@8.3.1 
cxxflags='-finline-functions -finline-limit=20000' cflags='-finline-functions -finline-limit=20000'" extends: .build_and_test_on_lassen -xl_16_1_1_11: +xl_16_1_1_12: variables: - SPEC: "+openmp %xl@16.1.1.11 cxxflags='-qthreaded -std=c++14 -O2 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036'" + SPEC: "+openmp %xl@16.1.1.12 cxxflags='-qthreaded -std=c++14 -O2 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036'" DEFAULT_TIME: 50 extends: .build_and_test_on_lassen -xl_16_1_1_11_gcc_8_3_1: +xl_16_1_1_12_gcc_8_3_1: variables: - SPEC: "+openmp %xl@16.1.1.11 cxxflags='--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O2 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036' cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1" + SPEC: "+openmp %xl@16.1.1.12 cxxflags='--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O2 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036' cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1" DEFAULT_TIME: 50 extends: .build_and_test_on_lassen @@ -65,16 +65,16 @@ gcc_8_3_1_cuda_ats_disabled: SPEC: "+openmp +cuda %gcc@8.3.1 cuda_arch=70 ^cuda@10.1.168" extends: .build_and_test_on_lassen_ats_disabled -xl_16_1_1_11_cuda: +xl_16_1_1_12_cuda: variables: - SPEC: "+openmp +cuda %xl@16.1.1.11 cxxflags='--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O2 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036' cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cuda_arch=70 ^cuda@10.1.168 ^cmake@3.14.5" + SPEC: "+openmp +cuda %xl@16.1.1.12 cxxflags='--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O2 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036' cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cuda_arch=70 ^cuda@10.1.168 ^cmake@3.14.5" DEFAULT_TIME: 60 allow_failure: true extends: .build_and_test_on_lassen -xl_16_1_1_11_gcc_8_3_1_cuda_11: +xl_16_1_1_12_gcc_8_3_1_cuda_11: variables: - SPEC: "+openmp +cuda %xl@16.1.1.11 cuda_arch=70 cxxflags'=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O2 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036' cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 ^cuda@11.0.2 ^cmake@3.14.5" + SPEC: "+openmp +cuda %xl@16.1.1.12 cuda_arch=70 cxxflags'=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O2 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036' cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 ^cuda@11.0.2 ^cmake@3.14.5" DEFAULT_TIME: 60 allow_failure: true extends: .build_and_test_on_lassen From 441eeabe7fd89a4a1a6a99ee8a1fa0b02ef17c09 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 27 Apr 2022 08:17:27 -0700 Subject: [PATCH 385/392] remove gcc stuff from XL+CUDA build --- .gitlab/lassen-jobs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/lassen-jobs.yml b/.gitlab/lassen-jobs.yml index 69b37d590..97c06d529 100644 --- a/.gitlab/lassen-jobs.yml +++ b/.gitlab/lassen-jobs.yml @@ 
-67,7 +67,7 @@ gcc_8_3_1_cuda_ats_disabled: xl_16_1_1_12_cuda: variables: - SPEC: "+openmp +cuda %xl@16.1.1.12 cxxflags='--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O2 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036' cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cuda_arch=70 ^cuda@10.1.168 ^cmake@3.14.5" + SPEC: "+openmp +cuda %xl@16.1.1.12 cxxflags='-qthreaded -std=c++14 -O2 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036' cuda_arch=70 ^cuda@10.1.168 ^cmake@3.14.5" DEFAULT_TIME: 60 allow_failure: true extends: .build_and_test_on_lassen From ca450a53d1aca637e0e295eb7dd624053cf1ac0b Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 27 Apr 2022 09:00:32 -0700 Subject: [PATCH 386/392] Fix spec for HIP architectures --- scripts/spack_packages/raja_perf/package.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/spack_packages/raja_perf/package.py b/scripts/spack_packages/raja_perf/package.py index 7c7aa8ac7..3888407cc 100644 --- a/scripts/spack_packages/raja_perf/package.py +++ b/scripts/spack_packages/raja_perf/package.py @@ -292,7 +292,7 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): rocm_root)) cfg.write(cmake_cache_entry("HIP_PATH", rocm_root + '/llvm/bin')) - cfg.write(cmake_cache_entry("CMAKE_HIP_ARCHITECTURES", 'fx906')) + cfg.write(cmake_cache_entry("CMAKE_HIP_ARCHITECTURES", 'gfx906')) hipcc_flags = ['--amdgpu-target=gfx906'] From 36aaf97d8a30414ba39b5ed1e9849ba669d56c91 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 27 Apr 2022 11:35:32 -0700 Subject: [PATCH 387/392] CMake cleanup based on reviewer comments --- CMakeLists.txt | 5 +---- test/CMakeLists.txt | 4 ++-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6d9de710d..813d1e9b0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,18 +31,15 @@ include(blt/SetupBLT.cmake) # Define RAJA PERFSUITE settings... 
 #
-#cmake_dependent_option(RAJA_PERFSUITE_ENABLE_TESTS "Enable RAJA Perf Suite Tests" On "ENABLE_TESTS" Off)
+cmake_dependent_option(RAJA_PERFSUITE_ENABLE_TESTS "Enable RAJA Perf Suite Tests" On "ENABLE_TESTS" Off)
 
 if (ENABLE_TESTS)
-  set(RAJA_PERFSUITE_ENABLE_TESTS On CACHE BOOL "Enable testing for RAJA Perf Suite")
   set(RAJA_ENABLE_TESTS Off CACHE BOOL "")
   set(CAMP_ENABLE_TESTS Off CACHE BOOL "")
 endif()
 
-set(ENABLE_EXAMPLES Off CACHE BOOL "Enable RAJA examples")
-
 cmake_dependent_option(RAJA_PERFSUITE_ENABLE_MPI "Build with MPI" On "ENABLE_MPI" Off)
 
 cmake_dependent_option(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN "Build OpenMP scan variants" Off "ENABLE_OPENMP" Off)
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index b0bd0884d..1fb84321d 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -6,8 +6,6 @@
 # SPDX-License-Identifier: (BSD-3-Clause)
 ###############################################################################
 
-include_directories(../src)
-
 set(RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS
   common
   apps
@@ -23,3 +21,5 @@ raja_add_test(
   SOURCES test-raja-perf-suite.cpp
   DEPENDS_ON ${RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS}
   )
+
+target_include_directories(test-raja-perf-suite.exe PRIVATE ../src)

From 563ad5d843a127039dd1b78339826b8489a5bb5d Mon Sep 17 00:00:00 2001
From: Rich Hornung
Date: Wed, 27 Apr 2022 11:39:39 -0700
Subject: [PATCH 388/392] CMake change based on review comment

---
 test/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 1fb84321d..fe0b732f5 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -22,4 +22,4 @@ raja_add_test(
   DEPENDS_ON ${RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS}
   )
 
-target_include_directories(test-raja-perf-suite.exe PRIVATE ../src)
+target_include_directories(test-raja-perf-suite.exe PRIVATE ${PROJECT_SOURCE_DIR}/src)

From 9acb8b67b0b29b40496dc37d1077228ae2e57631 Mon Sep 17 00:00:00 2001
From: Rich Hornung
Date: Thu, 28 Apr 2022 09:19:12 -0700
Subject: [PATCH 389/392] Change spack package to use XL flags we need for tests to pass

Make Gitlab CI lassen specs consistent, make sure all build scripts enable C++14
---
 .gitlab/lassen-jobs.yml                         | 8 ++++----
 scripts/lc-builds/blueos_clang.sh               | 1 +
 scripts/lc-builds/blueos_clang_omptarget.sh     | 1 +
 scripts/lc-builds/blueos_gcc.sh                 | 1 +
 scripts/lc-builds/blueos_nvcc_clang.sh          | 1 +
 scripts/lc-builds/blueos_nvcc_xl.sh             | 1 +
 scripts/lc-builds/blueos_pgi.sh                 | 1 +
 scripts/lc-builds/blueos_spectrum_nvcc_clang.sh | 2 +-
 scripts/lc-builds/blueos_xl.sh                  | 1 +
 scripts/lc-builds/blueos_xl_omptarget.sh        | 1 +
 scripts/lc-builds/toss3_clang.sh                | 1 +
 scripts/lc-builds/toss3_gcc.sh                  | 1 +
 scripts/lc-builds/toss3_hipcc.sh                | 1 +
 scripts/lc-builds/toss3_icpc.sh                 | 1 +
 scripts/lc-builds/toss3_mvapich2_gcc.sh         | 1 +
 scripts/lc-builds/toss3_pgi.sh                  | 1 +
 scripts/lc-builds/toss4_amdclang.sh             | 1 +
 scripts/spack_packages/raja_perf/package.py     | 3 ++-
 18 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/.gitlab/lassen-jobs.yml b/.gitlab/lassen-jobs.yml
index 97c06d529..65a64aae5 100644
--- a/.gitlab/lassen-jobs.yml
+++ b/.gitlab/lassen-jobs.yml
@@ -31,13 +31,13 @@ gcc_8_3_1:
 
 xl_16_1_1_12:
   variables:
-    SPEC: "+openmp %xl@16.1.1.12 cxxflags='-qthreaded -std=c++14 -O2 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036'"
+    SPEC: "+openmp %xl@16.1.1.12 cxxflags='-qthreaded -std=c++14 -O2 -qstrict -qxlcompatmacros -qalias=noansi -qsmp=omp -qhot -qnoeh -qsuppress=1500-029 -qsuppress=1500-036'"
     DEFAULT_TIME: 50
   extends: .build_and_test_on_lassen
 
 xl_16_1_1_12_gcc_8_3_1:
   variables:
-    SPEC: "+openmp %xl@16.1.1.12 cxxflags='--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O2 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036' cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1"
+    SPEC: "+openmp %xl@16.1.1.12 cxxflags='--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O2 -qstrict -qxlcompatmacros -qalias=noansi -qsmp=omp -qhot -qnoeh -qsuppress=1500-029 -qsuppress=1500-036' cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1"
     DEFAULT_TIME: 50
   extends: .build_and_test_on_lassen
 
@@ -67,14 +67,14 @@ gcc_8_3_1_cuda_ats_disabled:
 
 xl_16_1_1_12_cuda:
   variables:
-    SPEC: "+openmp +cuda %xl@16.1.1.12 cxxflags='-qthreaded -std=c++14 -O2 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036' cuda_arch=70 ^cuda@10.1.168 ^cmake@3.14.5"
+    SPEC: "+openmp +cuda %xl@16.1.1.12 cxxflags='-qthreaded -std=c++14 -O2 -qstrict -qxlcompatmacros -qalias=noansi -qsmp=omp -qhot -qnoeh -qsuppress=1500-029 -qsuppress=1500-036' cuda_arch=70 ^cuda@10.1.168 ^cmake@3.14.5"
     DEFAULT_TIME: 60
   allow_failure: true
   extends: .build_and_test_on_lassen
 
 xl_16_1_1_12_gcc_8_3_1_cuda_11:
   variables:
-    SPEC: "+openmp +cuda %xl@16.1.1.12 cuda_arch=70 cxxflags'=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O2 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036' cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 ^cuda@11.0.2 ^cmake@3.14.5"
+    SPEC: "+openmp +cuda %xl@16.1.1.12 cuda_arch=70 cxxflags'=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O2 -qstrict -qxlcompatmacros -qalias=noansi -qsmp=omp -qhot -qnoeh -qsuppress=1500-029 -qsuppress=1500-036' cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 ^cuda@11.0.2 ^cmake@3.14.5"
     DEFAULT_TIME: 60
   allow_failure: true
   extends: .build_and_test_on_lassen
diff --git a/scripts/lc-builds/blueos_clang.sh b/scripts/lc-builds/blueos_clang.sh
index 8e071efb8..2329cca3c 100755
--- a/scripts/lc-builds/blueos_clang.sh
+++ b/scripts/lc-builds/blueos_clang.sh
@@ -36,6 +36,7 @@ module load cmake/3.14.5
 cmake \
   -DCMAKE_BUILD_TYPE=Release \
   -DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-${COMP_VER}/bin/clang++ \
+  -DBLT_CXX_STD=c++14 \
   -C ${RAJA_HOSTCONFIG} \
   -DENABLE_OPENMP=On \
   -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
diff --git a/scripts/lc-builds/blueos_clang_omptarget.sh b/scripts/lc-builds/blueos_clang_omptarget.sh
index e182afd54..e557c2dac 100755
--- a/scripts/lc-builds/blueos_clang_omptarget.sh
+++ b/scripts/lc-builds/blueos_clang_omptarget.sh
@@ -36,6 +36,7 @@ module load cmake/3.14.5
 cmake \
   -DCMAKE_BUILD_TYPE=Release \
   -DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-${COMP_VER}/bin/clang++ \
+  -DBLT_CXX_STD=c++14 \
   -C ${RAJA_HOSTCONFIG} \
   -DENABLE_OPENMP=On \
   -DENABLE_CUDA=Off \
diff --git a/scripts/lc-builds/blueos_gcc.sh b/scripts/lc-builds/blueos_gcc.sh
index e0316fe3e..b3ecbeb70 100755
--- a/scripts/lc-builds/blueos_gcc.sh
+++ b/scripts/lc-builds/blueos_gcc.sh
@@ -34,6 +34,7 @@ module load cmake/3.14.5
 cmake \
   -DCMAKE_BUILD_TYPE=Release \
   -DCMAKE_CXX_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_VER}/bin/g++ \
+  -DBLT_CXX_STD=c++14 \
   -C ${RAJA_HOSTCONFIG} \
   -DENABLE_OPENMP=On \
   -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
diff --git a/scripts/lc-builds/blueos_nvcc_clang.sh b/scripts/lc-builds/blueos_nvcc_clang.sh
index a9bd4bea1..105938283 100755
--- a/scripts/lc-builds/blueos_nvcc_clang.sh
+++ b/scripts/lc-builds/blueos_nvcc_clang.sh
@@ -41,6 +41,7 @@ module load cmake/3.14.5
 cmake \
   -DCMAKE_BUILD_TYPE=Release \
   -DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-${COMP_CLANG_VER}/bin/clang++ \
+  -DBLT_CXX_STD=c++14 \
   -C ${RAJA_HOSTCONFIG} \
   -DENABLE_OPENMP=On \
   -DENABLE_CUDA=On \
diff --git a/scripts/lc-builds/blueos_nvcc_xl.sh b/scripts/lc-builds/blueos_nvcc_xl.sh
index 0f088e38a..950505cfc 100755
--- a/scripts/lc-builds/blueos_nvcc_xl.sh
+++ b/scripts/lc-builds/blueos_nvcc_xl.sh
@@ -41,6 +41,7 @@ module load cmake/3.14.5
 cmake \
   -DCMAKE_BUILD_TYPE=Release \
   -DCMAKE_CXX_COMPILER=/usr/tce/packages/xl/xl-${COMP_XL_VER}/bin/xlc++_r \
+  -DBLT_CXX_STD=c++14 \
   -C ${RAJA_HOSTCONFIG} \
   -DENABLE_OPENMP=On \
   -DENABLE_CUDA=On \
diff --git a/scripts/lc-builds/blueos_pgi.sh b/scripts/lc-builds/blueos_pgi.sh
index ee1c708be..d6c915fb9 100755
--- a/scripts/lc-builds/blueos_pgi.sh
+++ b/scripts/lc-builds/blueos_pgi.sh
@@ -34,6 +34,7 @@ module load cmake/3.14.5
 cmake \
   -DCMAKE_BUILD_TYPE=Release \
   -DCMAKE_CXX_COMPILER=/usr/tce/packages/pgi/pgi-${COMP_VER}/bin/pgc++ \
+  -DBLT_CXX_STD=c++14 \
   -DCMAKE_C_COMPILER=/usr/tce/packages/pgi/pgi-${COMP_VER}/bin/pgcc \
   -C ${RAJA_HOST_CONFIG} \
   -DENABLE_OPENMP=On \
diff --git a/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh b/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh
index cfa5a902f..83bcb2903 100755
--- a/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh
+++ b/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh
@@ -44,7 +44,7 @@ cmake \
   -DCMAKE_BUILD_TYPE=Release \
   -DMPI_CXX_COMPILER=/usr/tce/packages/spectrum-mpi/spectrum-mpi-${COMP_MPI_VER}-clang-${COMP_CLANG_VER}/bin/mpiclang++ \
   -DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-${COMP_CLANG_VER}/bin/clang++ \
-  -DBLT_CXX_STD=c++11 \
+  -DBLT_CXX_STD=c++14 \
   -C ${RAJA_HOSTCONFIG} \
   -DENABLE_MPI=On \
   -DENABLE_OPENMP=On \
diff --git a/scripts/lc-builds/blueos_xl.sh b/scripts/lc-builds/blueos_xl.sh
index 3f2131871..8630e419d 100755
--- a/scripts/lc-builds/blueos_xl.sh
+++ b/scripts/lc-builds/blueos_xl.sh
@@ -34,6 +34,7 @@ module load cmake/3.14.5
 cmake \
   -DCMAKE_BUILD_TYPE=Release \
   -DCMAKE_CXX_COMPILER=/usr/tce/packages/xl/xl-${COMP_VER}/bin/xlc++_r \
+  -DBLT_CXX_STD=c++14 \
   -C ${RAJA_HOSTCONFIG} \
   -DENABLE_OPENMP=On \
   -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
diff --git a/scripts/lc-builds/blueos_xl_omptarget.sh b/scripts/lc-builds/blueos_xl_omptarget.sh
index 3a06872f9..9d18d4622 100755
--- a/scripts/lc-builds/blueos_xl_omptarget.sh
+++ b/scripts/lc-builds/blueos_xl_omptarget.sh
@@ -34,6 +34,7 @@ module load cmake/3.14.5
 cmake \
   -DCMAKE_BUILD_TYPE=Release \
   -DCMAKE_CXX_COMPILER=/usr/tce/packages/xl/xl-${COMP_VER}/bin/xlc++_r \
+  -DBLT_CXX_STD=c++14 \
   -C ${RAJA_HOSTCONFIG} \
   -DENABLE_OPENMP=On \
   -DENABLE_TARGET_OPENMP=On \
diff --git a/scripts/lc-builds/toss3_clang.sh b/scripts/lc-builds/toss3_clang.sh
index ce140f0ce..e3b51716d 100755
--- a/scripts/lc-builds/toss3_clang.sh
+++ b/scripts/lc-builds/toss3_clang.sh
@@ -34,6 +34,7 @@ module load cmake/3.14.5
 cmake \
   -DCMAKE_BUILD_TYPE=Release \
   -DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-${COMP_VER}/bin/clang++ \
+  -DBLT_CXX_STD=c++14 \
   -C ${RAJA_HOSTCONFIG} \
   -DENABLE_OPENMP=On \
   -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
diff --git a/scripts/lc-builds/toss3_gcc.sh b/scripts/lc-builds/toss3_gcc.sh
index 2ab5a987d..031b01a7b 100755
--- a/scripts/lc-builds/toss3_gcc.sh
+++ b/scripts/lc-builds/toss3_gcc.sh
@@ -34,6 +34,7 @@ module load cmake/3.14.5
 cmake \
   -DCMAKE_BUILD_TYPE=Release \
   -DCMAKE_CXX_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_VER}/bin/g++ \
+  -DBLT_CXX_STD=c++14 \
   -C ${RAJA_HOSTCONFIG} \
   -DENABLE_OPENMP=On \
   -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
diff --git a/scripts/lc-builds/toss3_hipcc.sh b/scripts/lc-builds/toss3_hipcc.sh
index c1cca3269..464c8390f 100755
--- a/scripts/lc-builds/toss3_hipcc.sh
+++ b/scripts/lc-builds/toss3_hipcc.sh
@@ -45,6 +45,7 @@ cmake \
   -DCMAKE_C_COMPILER=/opt/rocm-${COMP_VER}/llvm/bin/clang \
   -DCMAKE_CXX_COMPILER=/opt/rocm-${COMP_VER}/llvm/bin/clang++ \
   -DHIP_HIPCC_FLAGS=--offload-arch=${COMP_ARCH} \
+  -DBLT_CXX_STD=c++14 \
   -C ${RAJA_HOSTCONFIG} \
   -DENABLE_HIP=ON \
   -DENABLE_OPENMP=OFF \
diff --git a/scripts/lc-builds/toss3_icpc.sh b/scripts/lc-builds/toss3_icpc.sh
index 225b7d4ab..9c941742f 100755
--- a/scripts/lc-builds/toss3_icpc.sh
+++ b/scripts/lc-builds/toss3_icpc.sh
@@ -48,6 +48,7 @@ cmake \
   -DCMAKE_BUILD_TYPE=Release \
   -DCMAKE_CXX_COMPILER=/usr/tce/packages/intel/intel-${COMP_VER}/bin/icpc \
   -DCMAKE_C_COMPILER=/usr/tce/packages/intel/intel-${COMP_VER}/bin/icc \
+  -DBLT_CXX_STD=c++14 \
   -C ${RAJA_HOSTCONFIG} \
   -DENABLE_OPENMP=On \
   -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
diff --git a/scripts/lc-builds/toss3_mvapich2_gcc.sh b/scripts/lc-builds/toss3_mvapich2_gcc.sh
index 4ade9549c..654f9624f 100755
--- a/scripts/lc-builds/toss3_mvapich2_gcc.sh
+++ b/scripts/lc-builds/toss3_mvapich2_gcc.sh
@@ -36,6 +36,7 @@ cmake \
   -DCMAKE_BUILD_TYPE=Release \
   -DMPI_CXX_COMPILER=/usr/tce/packages/mvapich2/mvapich2-${MPI_VER}-gcc-${COMP_VER}/bin/mpic++ \
   -DCMAKE_CXX_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_VER}/bin/g++ \
+  -DBLT_CXX_STD=c++14 \
   -C ${RAJA_HOSTCONFIG} \
   -DENABLE_MPI=On \
   -DENABLE_OPENMP=On \
diff --git a/scripts/lc-builds/toss3_pgi.sh b/scripts/lc-builds/toss3_pgi.sh
index 0c0e33a81..cd778d5fe 100755
--- a/scripts/lc-builds/toss3_pgi.sh
+++ b/scripts/lc-builds/toss3_pgi.sh
@@ -35,6 +35,7 @@ cmake \
   -DCMAKE_BUILD_TYPE=Release \
   -DCMAKE_CXX_COMPILER=/usr/tce/packages/pgi/pgi-${COMP_VER}/bin/pgc++ \
   -DCMAKE_C_COMPILER=/usr/tce/packages/pgi/pgi-${COMP_VER}/bin/pgcc \
+  -DBLT_CXX_STD=c++14 \
   -C ${RAJA_HOSTCONFIG} \
   -DENABLE_OPENMP=On \
   -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
diff --git a/scripts/lc-builds/toss4_amdclang.sh b/scripts/lc-builds/toss4_amdclang.sh
index 655990fa1..4b063be04 100755
--- a/scripts/lc-builds/toss4_amdclang.sh
+++ b/scripts/lc-builds/toss4_amdclang.sh
@@ -64,6 +64,7 @@ cmake \
   -DCMAKE_C_COMPILER=/opt/rocm-${COMP_VER}/llvm/bin/amdclang \
   -DCMAKE_CXX_COMPILER=/opt/rocm-${COMP_VER}/llvm/bin/amdclang++ \
   -DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}" \
+  -DBLT_CXX_STD=c++14 \
   -C ${RAJA_HOSTCONFIG} \
   -DENABLE_HIP=ON \
   -DENABLE_OPENMP=OFF \
diff --git a/scripts/spack_packages/raja_perf/package.py b/scripts/spack_packages/raja_perf/package.py
index 3888407cc..c0ba13602 100644
--- a/scripts/spack_packages/raja_perf/package.py
+++ b/scripts/spack_packages/raja_perf/package.py
@@ -248,7 +248,8 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None):
         cfg.write(cmake_cache_option("ENABLE_TESTS", not 'tests=none' in spec or self.run_tests))
 
         if ("xl" in cpp_compiler):
-            cfg.write(cmake_cache_entry("CMAKE_CUDA_FLAGS", "-Xcompiler -O3 -Xcompiler -qxlcompatmacros -Xcompiler -qalias=noansi " +
+            cfg.write(cmake_cache_entry("CMAKE_CUDA_FLAGS", "-Xcompiler -O2 -Xcompiler -qstrict " +
+                                        "-Xcompiler -qxlcompatmacros -Xcompiler -qalias=noansi " +
                                         "-Xcompiler -qsmp=omp -Xcompiler -qhot -Xcompiler -qnoeh -Xcompiler -qsuppress=1500-029 " +
                                         "-Xcompiler -qsuppress=1500-036 -Xcompiler -qsuppress=1500-030"))
         cuda_release_flags = "-O3"

From 45c725d450e039e076581bed9b39069878467f77 Mon Sep 17 00:00:00 2001
From: Rich Hornung
Date: Fri, 29 Apr 2022 07:49:01 -0700
Subject: [PATCH 390/392] Update RAJA submodule to pull in new spack configs for hip 5.1.0

---
 tpl/RAJA | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tpl/RAJA b/tpl/RAJA
index 3125bb70c..87a5cac67 160000
--- a/tpl/RAJA
+++ b/tpl/RAJA
@@ -1 +1 @@
-Subproject commit 3125bb70c898b287b35d9d0f0694999e26c6dd93
+Subproject commit 87a5cac67214e5e96c941bd652b1c0981e9f2123

From 4bb6caa12add1fd2d08e64b6dc03654d2d9430cb Mon Sep 17 00:00:00 2001
From: Rich Hornung
Date: Fri, 29 Apr 2022 07:52:48 -0700
Subject: [PATCH 391/392] Switch corona CI to HIP 5.1.0

---
 .gitlab/corona-jobs.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.gitlab/corona-jobs.yml b/.gitlab/corona-jobs.yml
index 303141a6b..4b9428f3a 100644
--- a/.gitlab/corona-jobs.yml
+++ b/.gitlab/corona-jobs.yml
@@ -5,12 +5,12 @@
 # SPDX-License-Identifier: (BSD-3-Clause)
 #############################################################################
 
-hip_4_5_2_clang_13_0_0 (build and test on corona):
+hip_5.1.0_clang_13_0_0 (build and test on corona):
   variables:
-    SPEC: "+rocm~openmp amdgpu_target=gfx906 %clang@13.0.0 cxxflags=--offload-arch=gfx906 ^blt@develop ^hip@4.5.2"
+    SPEC: "+rocm~openmp amdgpu_target=gfx906 %clang@13.0.0 ^blt@develop ^hip@5.1.0"
   extends: .build_and_test_on_corona
 
-#hip_4_5_2_clang_13_0_0_desul_atomics (build and test on corona):
+#hip_5.1.0_clang_13_0_0_desul_atomics (build and test on corona):
 # variables:
-#   SPEC: "+rocm~openmp +desul amdgpu_target=gfx906 %clang@13.0.0 cxxflags=--offload-arch=gfx906 ^blt@develop ^hip@4.5.2"
+#   SPEC: "+rocm~openmp +desul amdgpu_target=gfx906 %clang@13.0.0 ^blt@develop ^hip@5.1.0"
 # extends: .build_and_test_on_corona

From f955bded3ebe512a21e10c7eadaa4d3ef9f283f4 Mon Sep 17 00:00:00 2001
From: Rich Hornung
Date: Fri, 29 Apr 2022 09:42:23 -0700
Subject: [PATCH 392/392] Now that test is passing, turn off allow failure for XL+CUDA

---
 .gitlab/lassen-jobs.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.gitlab/lassen-jobs.yml b/.gitlab/lassen-jobs.yml
index 65a64aae5..034de13eb 100644
--- a/.gitlab/lassen-jobs.yml
+++ b/.gitlab/lassen-jobs.yml
@@ -69,14 +69,12 @@ xl_16_1_1_12_cuda:
   variables:
     SPEC: "+openmp +cuda %xl@16.1.1.12 cxxflags='-qthreaded -std=c++14 -O2 -qstrict -qxlcompatmacros -qalias=noansi -qsmp=omp -qhot -qnoeh -qsuppress=1500-029 -qsuppress=1500-036' cuda_arch=70 ^cuda@10.1.168 ^cmake@3.14.5"
     DEFAULT_TIME: 60
-  allow_failure: true
   extends: .build_and_test_on_lassen
 
 xl_16_1_1_12_gcc_8_3_1_cuda_11:
   variables:
     SPEC: "+openmp +cuda %xl@16.1.1.12 cuda_arch=70 cxxflags'=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O2 -qstrict -qxlcompatmacros -qalias=noansi -qsmp=omp -qhot -qnoeh -qsuppress=1500-029 -qsuppress=1500-036' cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 ^cuda@11.0.2 ^cmake@3.14.5"
     DEFAULT_TIME: 60
-  allow_failure: true
   extends: .build_and_test_on_lassen
 
 ##########