From 9386133ec74ce3d8582165e9038c8d4d6df26297 Mon Sep 17 00:00:00 2001 From: Luis Altenkort Date: Thu, 7 Jul 2022 13:28:24 +0200 Subject: [PATCH 01/14] add log level RESULT --- src/base/IO/logging.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/base/IO/logging.h b/src/base/IO/logging.h index 75f78825..a5111f68 100644 --- a/src/base/IO/logging.h +++ b/src/base/IO/logging.h @@ -17,8 +17,8 @@ #include "stringFunctions.h" -enum LogLevel { ALL, ALLOC, TRACE, DEBUG, INFO, WARN, ERROR, FATAL, OFF }; -static const char *LogLevelStr[] = {"ALL", "ALLOC", "TRACE", "DEBUG", "INFO", +enum LogLevel { ALL, ALLOC, TRACE, DEBUG, INFO, RESULT, WARN, ERROR, FATAL, OFF }; +static const char *LogLevelStr[] = {"ALL", "ALLOC", "TRACE", "DEBUG", "INFO", "RESULT", "WARN", "ERROR", "FATAL", "OFF"}; class Logger { @@ -106,10 +106,17 @@ class Logger { template inline std::string debug(Args&&... args) { return message(std::forward(args)...); }; + + /// Something seems odd but the program will continue running fine nevertheless. template inline std::string warn(Args&&... args) { return message(std::forward(args)...); }; + /// Use this for test results + template inline std::string result(Args&&... args) { + return message(std::forward(args)...); + }; + /*! 
Use this when something goes wrong but the program can still continue * Example: a test gives the wrong results */ From 560f11fa056ed9322e1700f8069ffa524ba2bb67 Mon Sep 17 00:00:00 2001 From: Luis Altenkort Date: Thu, 7 Jul 2022 13:28:35 +0200 Subject: [PATCH 02/14] change loglevel for CheckConf and CheckRand --- src/applications/main_CheckConf.cpp | 8 ++++++-- src/applications/main_CheckRand.cpp | 10 +++++++--- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/applications/main_CheckConf.cpp b/src/applications/main_CheckConf.cpp index 0c753688..571cc95f 100644 --- a/src/applications/main_CheckConf.cpp +++ b/src/applications/main_CheckConf.cpp @@ -70,7 +70,8 @@ void CheckConf(CommunicationBase &commBase, const std::string& format, std::stri int main(int argc, char *argv[]) { try { - stdLogger.setVerbosity(INFO); + stdLogger.setVerbosity(RESULT); + rootLogger.setVerbosity(RESULT); const size_t HaloDepth = 0; CheckParams param; @@ -81,6 +82,9 @@ int main(int argc, char *argv[]) { commBase.init(param.nodeDim()); initIndexer(HaloDepth, param, commBase); + rootLogger.setVerbosity(INFO); + rootLogger.info("Checking Gaugefile ", param.GaugefileName()); + rootLogger.setVerbosity(RESULT); if (param.prec() == "single"){ CheckConf(commBase, param.format(), param.GaugefileName()); @@ -94,6 +98,6 @@ int main(int argc, char *argv[]) { catch (const std::runtime_error &error) { return 1; } - rootLogger.info("Gaugefile seems to be fine."); + rootLogger.result("Gaugefile OK! 
(readin, plaquette, unitarity)"); return 0; } diff --git a/src/applications/main_CheckRand.cpp b/src/applications/main_CheckRand.cpp index 501e5346..32eb6274 100644 --- a/src/applications/main_CheckRand.cpp +++ b/src/applications/main_CheckRand.cpp @@ -17,7 +17,9 @@ void CheckRand(CommunicationBase &commBase, const std::string& rand_file){ int main(int argc, char *argv[]) { try { - stdLogger.setVerbosity(INFO); + rootLogger.info("Checking Randfile..."); + stdLogger.setVerbosity(RESULT); + rootLogger.setVerbosity(RESULT); const size_t HaloDepth = 0; CheckParams param; @@ -27,12 +29,14 @@ int main(int argc, char *argv[]) { param.readfile(commBase, "../parameter/applications/CheckRand.param", argc, argv); commBase.init(param.nodeDim()); initIndexer(HaloDepth, param, commBase); - + rootLogger.setVerbosity(INFO); + rootLogger.info("Checking Randfile ", param.Randfile()); + rootLogger.setVerbosity(RESULT); CheckRand(commBase, param.Randfile()); } catch (const std::runtime_error &error) { return 1; } - rootLogger.info("Random state seems to be fine."); + rootLogger.result("Randfile OK!"); return 0; } From 2b49fb89567ea74458f2a00ff4836813575872d2 Mon Sep 17 00:00:00 2001 From: Luis Altenkort Date: Thu, 7 Jul 2022 17:43:54 +0200 Subject: [PATCH 03/14] add cmake error when using GPU_P2P without using GPU_AWARE_MPI --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index ec4aa1d0..0ab519a3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -114,6 +114,9 @@ if (USE_GPU_P2P) endif() add_definitions(-DUSE_GPU_P2P) endif() +if (USE_GPU_P2P AND NOT USE_GPU_AWARE_MPI) + message(FATAL_ERROR "USE_GPU_P2P only works with USE_GPU_AWARE_MPI") +endif() # Additional compiler flags From 91a52548cd3f155c2e7f9e264b25fc10fdb2ff3f Mon Sep 17 00:00:00 2001 From: Luis Altenkort Date: Thu, 7 Jul 2022 17:45:16 +0200 Subject: [PATCH 04/14] make CheckConf compile using hip --- src/applications/main_CheckConf.cpp | 14 +++++++------- 1 file 
changed, 7 insertions(+), 7 deletions(-) diff --git a/src/applications/main_CheckConf.cpp b/src/applications/main_CheckConf.cpp index 571cc95f..504b2ae4 100644 --- a/src/applications/main_CheckConf.cpp +++ b/src/applications/main_CheckConf.cpp @@ -8,10 +8,10 @@ struct CheckParams : LatticeParameters { } }; -template +template struct do_check_unitarity { - explicit do_check_unitarity(Gaugefield &gauge) : gAcc(gauge.getAccessor()) {}; + explicit do_check_unitarity(Gaugefield &gauge) : gAcc(gauge.getAccessor()) {}; gaugeAccessor gAcc; __device__ __host__ floatT operator()(gSite site){ typedef GIndexer GInd; @@ -25,13 +25,13 @@ struct do_check_unitarity } }; -template -void check_unitarity(Gaugefield &gauge) +template +void check_unitarity(Gaugefield &gauge) { - LatticeContainer unitarity(gauge.getComm()); + LatticeContainer unitarity(gauge.getComm()); const size_t elems = GIndexer::getLatData().vol4; unitarity.adjustSize(elems); - unitarity.template iterateOverBulk(do_check_unitarity(gauge)); + unitarity.template iterateOverBulk(do_check_unitarity(gauge)); floatT unit_norm; unitarity.reduce(unit_norm, elems); unit_norm /= static_cast(GIndexer::getLatData().globvol4); @@ -54,7 +54,7 @@ void CheckConf(CommunicationBase &commBase, const std::string& format, std::stri } else { throw (std::runtime_error(rootLogger.fatal("Invalid specification for format ", format))); } - check_unitarity(gauge); + check_unitarity(gauge); GaugeAction gaugeAction(gauge); floatT plaq = gaugeAction.plaquette(); From d1d6e158834645e90fccf1d2e71a7586a657db4c Mon Sep 17 00:00:00 2001 From: Luis Altenkort Date: Thu, 7 Jul 2022 18:08:00 +0200 Subject: [PATCH 05/14] make CheckConf actually compile using hip --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0ab519a3..05b3443d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -747,6 +747,7 @@ set_SIMULATeQCD_property(configConverter PROPERTIES RUNTIME_OUTPUT_DIRECTORY "ap 
SIMULATeQCD_target_compile_definitions(configConverter PRIVATE HALODEPTH_0=1 SINGLEPREC=1 DOUBLEPREC=1 COMP_R18=1 NSTACKS_1=1 LAYOUT_ALL=1) add_to_compound_SIMULATeQCD_target(applications configConverter) +set_SIMULATeQCD_gpu_backend(src/applications/main_CheckConf.cpp) add_SIMULATeQCD_executable(CheckConf src/applications/main_CheckConf.cpp) set_SIMULATeQCD_property(CheckConf PROPERTIES RUNTIME_OUTPUT_DIRECTORY "applications") SIMULATeQCD_target_compile_definitions(CheckConf PRIVATE HALODEPTH_0=1 COMP_R18=1 SINGLEPREC=1 DOUBLEPREC=1 CPUONLY=1) From d0c7522a248a68ba12d03d598171eb9b03d770a9 Mon Sep 17 00:00:00 2001 From: Luis Altenkort Date: Wed, 13 Jul 2022 10:50:16 +0200 Subject: [PATCH 06/14] gradientFlowTest: remove unused variables --- src/testing/main_gradientFlowTest.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/testing/main_gradientFlowTest.cpp b/src/testing/main_gradientFlowTest.cpp index a78ae211..bc1769fa 100644 --- a/src/testing/main_gradientFlowTest.cpp +++ b/src/testing/main_gradientFlowTest.cpp @@ -59,7 +59,6 @@ bool run(Gaugefield &gauge, //! initialize some values for the measurement floatT flow_time = 0; - floatT plaq, clov, topChar; std::stringstream logStream, logStream_ref; logStream << std::fixed << std::setprecision(std::numeric_limits::digits10 + 1); logStream_ref << std::fixed << std::setprecision(std::numeric_limits::digits10 + 1); From c2364e1890fcac453076f609bef66bd730a0f9ed Mon Sep 17 00:00:00 2001 From: Luis Altenkort Date: Tue, 23 Aug 2022 12:37:00 +0200 Subject: [PATCH 07/14] add first step to experimental support for -DBACKEND=cpu. only tested for CheckConf and CheckRand. note: iterateOverBulkAllMu needs some changes, I ignored that for now. 
--- CMakeLists.txt | 45 ++- src/applications/main_CheckConf.cpp | 2 +- src/base/LatticeContainer.cpp | 13 +- src/base/LatticeContainer.h | 45 ++- src/base/LatticeDimension.h | 49 +-- src/base/communication/HaloLoop.h | 28 +- .../communication/calcGSiteHalo_dynamic.h | 20 +- .../communication/communicationBase_mpi.cpp | 8 +- src/base/communication/deviceEvent.h | 3 + src/base/communication/deviceStream.h | 3 + src/base/communication/gpuIPC.h | 2 + src/base/communication/haloOffsetInfo.h | 39 +- src/base/communication/neighborInfo.h | 10 +- src/base/communication/siteComm.h | 35 +- src/base/gutils.cpp | 2 + src/base/gutils.h | 13 +- src/base/indexer/BulkIndexer.h | 267 +++++++------- src/base/indexer/HaloIndexer.h | 162 +++++---- src/base/indexer/initGPUIndexer.cpp | 3 + src/base/math/correlators.h | 58 +-- src/base/math/floatComparison.h | 4 +- src/base/math/gaugeAccessor.h | 28 +- src/base/math/gaugeConstructor.h | 52 +-- src/base/math/gcomplex.h | 344 +++++++++--------- src/base/math/generalAccessor.h | 6 +- src/base/math/grnd.cpp | 4 +- src/base/math/grnd.h | 17 +- src/base/math/gsu2.h | 62 ++-- src/base/math/gsu3.h | 331 +++++++++-------- src/base/math/gvect3.h | 191 +++++----- src/base/math/gvect3array.h | 22 +- src/base/math/matrix4x4.h | 22 +- src/base/math/operators.h | 38 +- src/base/math/simpleArray.h | 32 +- src/base/math/su3Exp.h | 4 +- src/base/memoryManagement.cpp | 4 + src/base/memoryManagement.h | 50 ++- src/base/runFunctors.h | 96 +++-- src/define.h | 36 +- src/gauge/GaugeAction.cpp | 10 +- src/gauge/GaugeAction.h | 10 +- src/gauge/constructs/PlaqConstructs.h | 8 +- src/gauge/constructs/derivative3link.h | 2 +- src/gauge/constructs/derivative5link.h | 42 +-- src/gauge/constructs/derivative7link.h | 2 +- src/gauge/constructs/derivativeLepagelink.h | 2 +- .../derivativeProjectU3Constructs.h | 2 +- src/gauge/constructs/fat7LinkConstructs.h | 10 +- src/gauge/constructs/gsvd.h | 4 +- src/gauge/constructs/hisqForceConstructs.h | 36 +- 
src/gauge/constructs/linkLepageConstructs.h | 4 +- src/gauge/constructs/linkStaple3Constructs.h | 4 +- src/gauge/constructs/linkStaple5Constructs.h | 2 +- src/gauge/constructs/linkStaple7Constructs.h | 2 +- src/gauge/constructs/naikConstructs.h | 4 +- .../constructs/naikDerivativeConstructs.h | 2 +- src/gauge/constructs/projectU3Constructs.h | 2 +- src/gauge/gaugeActionDeriv.h | 8 +- src/gauge/gauge_kernels.cpp | 18 +- src/gauge/gaugefield.h | 8 +- src/gauge/gaugefield_device.cpp | 16 +- src/modules/HISQ/staggeredPhases.h | 6 +- src/modules/observables/FieldStrengthTensor.h | 8 +- src/spinor/spinorfield.h | 14 +- 64 files changed, 1296 insertions(+), 1080 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c970fb2c..8f8a122e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,6 +34,10 @@ elseif (BACKEND STREQUAL "hip_amd") set(USE_HIP ON) set(USE_HIP_AMD ON) message(STATUS "Using HIP backend for AMD GPUs. (Experimental!)") +elseif (BACKEND STREQUAL "cpu") + message(STATUS "Using CPU only (no GPU backend). 
Experimental!") + add_definitions(-DUSE_CPU_ONLY) + #set(USE_CPU_ONLY ON) else() message(STATUS "Backend ${BACKEND} is not known!") endif() @@ -49,6 +53,8 @@ elseif (USE_HIP_NVIDIA) # This will hopefully work in the future # project(SIMULATeQCD LANGUAGES CXX HIP) project(SIMULATeQCD LANGUAGES CXX CUDA) +else() + project(SIMULATeQCD LANGUAGES CXX) endif() set(CMAKE_CXX_STANDARD 17) @@ -109,23 +115,28 @@ elseif (USE_HIP_AMD) set(CMAKE_HIP_ARCHITECTURES "${ARCHITECTURE}") endif() -if(NOT ARCHITECTURE) +if(NOT ARCHITECTURE AND NOT BACKEND MATCHES "cpu") message(FATAL_ERROR "No GPU architecture set!") endif() set(USE_GPU_AWARE_MPI OFF CACHE BOOL "Set to ON to build gpu-aware MPI code (default = OFF)") -if (USE_GPU_AWARE_MPI) - add_definitions(-DUSE_GPU_AWARE_MPI) -endif() set(USE_GPU_P2P ON CACHE BOOL "Set to ON to build with GPU Direct P2P (default = ON)") -if (USE_GPU_P2P) - if(USE_HIP) - message(FATAL_ERROR "GPU_P2P is not supported by HIP!") +if(NOT BACKEND MATCHES "cpu") + if (USE_GPU_AWARE_MPI) + add_definitions(-DUSE_GPU_AWARE_MPI) endif() - add_definitions(-DUSE_GPU_P2P) -endif() -if (USE_GPU_P2P AND NOT USE_GPU_AWARE_MPI) - message(FATAL_ERROR "USE_GPU_P2P only works with USE_GPU_AWARE_MPI") + if (USE_GPU_P2P) + if(USE_HIP) + message(FATAL_ERROR "GPU_P2P is not supported by HIP!") + endif() + add_definitions(-DUSE_GPU_P2P) + endif() + if (USE_GPU_P2P AND NOT USE_GPU_AWARE_MPI) + message(FATAL_ERROR "USE_GPU_P2P only works with USE_GPU_AWARE_MPI") + endif() +else() + set(USE_GPU_AWARE_MPI OFF) + set(USE_GPU_P2P OFF) endif() @@ -364,6 +375,14 @@ FUNCTION(add_SIMULATeQCD_executable TARGET) set_target_properties(${TARGET} _${TARGET} PROPERTIES COMPILE_FLAGS "${MPI_COMPILE_FLAGS}" LINK_FLAGS "${MPI_LINK_FLAGS}" HIP_SEPARABLE_COMPILATION ON CUDA_SEPARABLE_COMPILATION ON LINKER_LANGUAGE CUDA CUDA_RESOLVE_DEVICE_SYMBOLS ON) + else() + add_executable(${TARGET} ${ARGN} ${SOURCE_FILES_BASE}) # single target + add_executable(_${TARGET} ${ARGN}) # compound target (e.g. 
in "tests", "applications") + + target_link_libraries("_${TARGET}" CodeBase) + set_target_properties(${TARGET} _${TARGET} + PROPERTIES + COMPILE_FLAGS "${MPI_COMPILE_FLAGS}" LINK_FLAGS "${MPI_LINK_FLAGS}") endif() ENDFUNCTION() @@ -648,6 +667,10 @@ elseif (USE_HIP_NVIDIA) hip_add_executable(_SimpleFunctorTest src/testing/main_SimpleFunctorTest.cpp src/base/communication/communicationBase_mpi.cpp src/base/gutils.cpp) set_target_properties(SimpleFunctorTest _SimpleFunctorTest PROPERTIES COMPILE_FLAGS "${MPI_COMPILE_FLAGS}" LINK_FLAGS "${MPI_LINK_FLAGS}" HIP_SEPARABLE_COMPILATION ON CUDA_SEPARABLE_COMPILATION ON LINKER_LANGUAGE CUDA CUDA_RESOLVE_DEVICE_SYMBOLS ON RUNTIME_OUTPUT_DIRECTORY "testing") +else() + add_executable(SimpleFunctorTest src/testing/main_SimpleFunctorTest.cpp src/base/communication/communicationBase_mpi.cpp src/base/gutils.cpp) + add_executable(_SimpleFunctorTest src/testing/main_SimpleFunctorTest.cpp src/base/communication/communicationBase_mpi.cpp src/base/gutils.cpp) + set_target_properties(SimpleFunctorTest _SimpleFunctorTest PROPERTIES COMPILE_FLAGS "${MPI_COMPILE_FLAGS}" LINK_FLAGS "${MPI_LINK_FLAGS}" RUNTIME_OUTPUT_DIRECTORY "testing") endif() target_compile_definitions(SimpleFunctorTest PRIVATE HALODEPTH_0=1 DOUBLEPREC=1 SINGLEPREC=1 ARCHITECTURE=${ARCHITECTURE} GIT_HASH="${GIT_HASH}") target_compile_definitions(_SimpleFunctorTest PRIVATE HALODEPTH_0=1 DOUBLEPREC=1 SINGLEPREC=1 ARCHITECTURE=${ARCHITECTURE} GIT_HASH="${GIT_HASH}") diff --git a/src/applications/main_CheckConf.cpp b/src/applications/main_CheckConf.cpp index 504b2ae4..8c2655ba 100644 --- a/src/applications/main_CheckConf.cpp +++ b/src/applications/main_CheckConf.cpp @@ -13,7 +13,7 @@ struct do_check_unitarity { explicit do_check_unitarity(Gaugefield &gauge) : gAcc(gauge.getAccessor()) {}; gaugeAccessor gAcc; - __device__ __host__ floatT operator()(gSite site){ + HOST_DEVICE floatT operator()(gSite site){ typedef GIndexer GInd; floatT ret=0.0; for (size_t mu = 0; mu < 4; ++mu) 
diff --git a/src/base/LatticeContainer.cpp b/src/base/LatticeContainer.cpp index e97400cd..3a3f07a5 100644 --- a/src/base/LatticeContainer.cpp +++ b/src/base/LatticeContainer.cpp @@ -12,23 +12,23 @@ #define gpucub hipcub #endif - +#ifndef USE_CPU_ONLY template -gpuError_t CubReduce(void *helpArr, size_t *temp_storage_bytes, floatT *Arr, floatT *out, size_t size) { +GPUERROR_T CubReduce(void *helpArr, size_t *temp_storage_bytes, floatT *Arr, floatT *out, size_t size) { return gpucub::DeviceReduce::Sum(helpArr, *temp_storage_bytes, static_cast(Arr), out, size); } template -gpuError_t CubReduceMax(void *helpArr, size_t *temp_storage_bytes, void *Arr, floatT *out, size_t size) { +GPUERROR_T CubReduceMax(void *helpArr, size_t *temp_storage_bytes, void *Arr, floatT *out, size_t size) { return gpucub::DeviceReduce::Max(helpArr, *temp_storage_bytes, static_cast(Arr), out, size); } template -gpuError_t +GPUERROR_T CubReduceStacked(void *helpArr, size_t *temp_storage_bytes, void *Arr, void *out, int Nt, void *StackOffsets) { return gpucub::DeviceSegmentedReduce::Sum(helpArr, *temp_storage_bytes, static_cast(Arr), @@ -37,8 +37,8 @@ CubReduceStacked(void *helpArr, size_t *temp_storage_bytes, void *Arr, void *out } #define CLASS_INIT(floatT) \ -template gpuError_t CubReduce(void * helpArr, size_t *temp_storage_bytes, floatT* Arr, floatT* out, size_t size); \ -template gpuError_t CubReduceStacked(void * helpArr, size_t *temp_storage_bytes, void * Arr, void* out, int Nt, void *StackOffsets); \ +template GPUERROR_T CubReduce(void * helpArr, size_t *temp_storage_bytes, floatT* Arr, floatT* out, size_t size); \ +template GPUERROR_T CubReduceStacked(void * helpArr, size_t *temp_storage_bytes, void * Arr, void* out, int Nt, void *StackOffsets); \ CLASS_INIT(float) @@ -68,3 +68,4 @@ CLASS_INITMAX(double) CLASS_INITMAX(int) +#endif diff --git a/src/base/LatticeContainer.h b/src/base/LatticeContainer.h index 4da17dd8..013b9904 100644 --- a/src/base/LatticeContainer.h +++ 
b/src/base/LatticeContainer.h @@ -25,13 +25,13 @@ #include "math/operators.h" template -gpuError_t CubReduce(void *helpArr, size_t *temp_storage_bytes, floatT *Arr, floatT *out, size_t size); +GPUERROR_T CubReduce(void *helpArr, size_t *temp_storage_bytes, floatT *Arr, floatT *out, size_t size); template -gpuError_t CubReduceMax(void *helpArr, size_t *temp_storage_bytes, void *Arr, floatT *out, size_t size); +GPUERROR_T CubReduceMax(void *helpArr, size_t *temp_storage_bytes, void *Arr, floatT *out, size_t size); template -gpuError_t CubReduceStacked(void *helpArr, size_t *temp_storage_bytes, +GPUERROR_T CubReduceStacked(void *helpArr, size_t *temp_storage_bytes, void *Arr, void *out, int Nt, void *TimeSliceOffsets); @@ -47,30 +47,30 @@ class LatticeContainerAccessor : public MemoryAccessor { /// Set values. template - __device__ __host__ inline void setElement(const size_t isite, const floatT value) { + HOST_DEVICE inline void setElement(const size_t isite, const floatT value) { auto *arr = reinterpret_cast(Array); arr[isite] = value; } template - __device__ __host__ inline void setElement(const gSite& site, const floatT value) { + HOST_DEVICE inline void setElement(const gSite& site, const floatT value) { setValue(site.isite, value); } template - __device__ __host__ inline void setElement(const gSiteStack& site, const floatT value) { + HOST_DEVICE inline void setElement(const gSiteStack& site, const floatT value) { setValue(site.isiteStack, value); } /// Get values. 
template - __device__ __host__ floatT getElement(const gSite& site) { + HOST_DEVICE floatT getElement(const gSite& site) { return getElement(site.isite); } template - __device__ __host__ floatT getElement(const gSiteStack& site) { + HOST_DEVICE floatT getElement(const gSiteStack& site) { return getElement(site.isiteStack); } template - __device__ __host__ inline floatT getElement(const size_t isite) { + HOST_DEVICE inline floatT getElement(const size_t isite) { auto *arr = reinterpret_cast(Array); return arr[isite]; } @@ -169,6 +169,7 @@ class LatticeContainer : public RunFunctors values.resize(NStacks); } +#ifndef USE_CPU_ONLY if (onDevice) { ReductionResult->template adjustSize(NStacks); ReductionResultHost->template adjustSize(NStacks); @@ -178,7 +179,7 @@ class LatticeContainer : public RunFunctors for (size_t i = 0; i < NStacks; i++) { /// Determine temporary device storage requirements size_t temp_storage_bytes = 0; - gpuError_t gpuErr = CubReduce(NULL, &temp_storage_bytes, + GPUERROR_T gpuErr = CubReduce(NULL, &temp_storage_bytes, ContainerArray->template getPointer(i*stackSize), ReductionResult->template getPointer(i), stackSize); if (gpuErr) GpuError("LatticeContainer::reduceStackedLocal: gpucub::DeviceReduce::Sum (1)", gpuErr); @@ -205,7 +206,7 @@ class LatticeContainer : public RunFunctors StackOffsetsTemp->copyFrom(StackOffsetsHostTemp, sizeof(size_t)*(NStacks+1)); /// Determine temporary device storage requirements size_t temp_storage_bytes = 0; - gpuError_t gpuErr = CubReduceStacked(NULL, &temp_storage_bytes, + GPUERROR_T gpuErr = CubReduceStacked(NULL, &temp_storage_bytes, ContainerArray->getPointer(), ReductionResult->getPointer(), NStacks, StackOffsetsTemp->getPointer()); @@ -229,7 +230,9 @@ class LatticeContainer : public RunFunctors values[i] = tmp; } - } else { + } else +#endif + { LatticeContainerAccessor acc = getAccessor(); for (size_t stack = 0; stack < NStacks; stack++) { values[stack] = 0; @@ -250,12 +253,12 @@ class LatticeContainer : 
public RunFunctors void reduce(elemType &value, size_t size, bool rootToAll = false) { elemType result = 0; - +#ifndef USE_CPU_ONLY if (onDevice) { // Determine temporary device storage requirements size_t temp_storage_bytes = 0; - gpuError_t gpuErr = CubReduce(NULL, &temp_storage_bytes, ContainerArray->template getPointer(), + GPUERROR_T gpuErr = CubReduce(NULL, &temp_storage_bytes, ContainerArray->template getPointer(), d_out->template getPointer(), size); if (gpuErr) GpuError("LatticeContainer::reduce: gpucub::DeviceReduce::Sum (1)", gpuErr); @@ -269,7 +272,9 @@ class LatticeContainer : public RunFunctors if (gpuErr) GpuError("Reductionbase.h: Failed to copy data", gpuErr); - } else{ + } else +#endif + { LatticeContainerAccessor acc = getAccessor(); for (size_t i = 0; i < size; i++){ result += acc.getElement(i); @@ -281,14 +286,14 @@ class LatticeContainer : public RunFunctors void reduceMax(elemType &value, size_t size, bool rootToAll = false) { elemType result = 0; - +#ifndef USE_CPU_ONLY if (onDevice) { // elemType *d_out = NULL; // gpuMalloc((void **) &d_out, sizeof(elemType)); // Determine temporary device storage requirements size_t temp_storage_bytes = 0; - gpuError_t gpuErr = CubReduceMax(NULL, &temp_storage_bytes, ContainerArray->getPointer(), + GPUERROR_T gpuErr = CubReduceMax(NULL, &temp_storage_bytes, ContainerArray->getPointer(), d_out->template getPointer(), size); if (gpuErr) GpuError("LatticeContainer::reduceMax: gpucub::DeviceReduce::Max (1)", gpuErr); @@ -301,7 +306,9 @@ class LatticeContainer : public RunFunctors if (gpuErr) GpuError("Reductionbase.h: Failed to copy data", gpuErr); // gpuFree(d_out); - } else { + } else +#endif + { rootLogger.warn("Max Host reduction has not been properly tested. 
Check the results and remove this warning"); LatticeContainerAccessor acc = getAccessor(); for (size_t i = 0; i < size; i++){ @@ -387,7 +394,7 @@ void LatticeContainer::iterateOverBulkStacked(Functor op) { template struct WriteAtTimeSlices { - inline __host__ __device__ size_t operator()(const gSite &site) { + inline HOST_DEVICE size_t operator()(const gSite &site) { return GIndexer::siteTimeOrdered(site); } }; diff --git a/src/base/LatticeDimension.h b/src/base/LatticeDimension.h index 9c391808..59a9e16a 100644 --- a/src/base/LatticeDimension.h +++ b/src/base/LatticeDimension.h @@ -8,6 +8,7 @@ #ifndef JUST_INDEXER_LATTICEDIMENSION_H #define JUST_INDEXER_LATTICEDIMENSION_H + #include #include #include "wrapper/gpu_wrapper.h" @@ -19,14 +20,14 @@ private : public : //! Copy constructor - __host__ LatticeDimensions(const LatticeDimensions &lhs) { + HOST LatticeDimensions(const LatticeDimensions &lhs) { for (int i = 0; i < 4; i++) c[i] = lhs.c[i]; } LatticeDimensions& operator=(const LatticeDimensions& a) = default; //! Default constructor, initializes to (0,0,0,0) - __host__ LatticeDimensions() { + HOST LatticeDimensions() { c[0] = 0; c[1] = 0; c[2] = 0; @@ -34,7 +35,7 @@ public : } //! Construct from (x,y,z,t) - __host__ LatticeDimensions(const int x, const int y, const int z, const int t) { + HOST LatticeDimensions(const int x, const int y, const int z, const int t) { c[0] = x; c[1] = y; c[2] = z; @@ -42,39 +43,39 @@ public : } //! Construct from int* (also works with Parameter) - __host__ LatticeDimensions(const int *dim) { + HOST LatticeDimensions(const int *dim) { for (int i = 0; i < 4; i++) c[i] = dim[i]; } //! Cast to int* (for usage in c-style MPI functions) - __host__ operator int *() { return c; } + HOST operator int *() { return c; } //! same with const - __host__ operator const int *() const { return c; }; + HOST operator const int *() const { return c; }; //! 
[] operator for member access - __host__ int &operator[](int mu) { return c[mu]; }; + HOST int &operator[](int mu) { return c[mu]; }; //! const [] operator for r/o member access - __host__ const int &operator[](int mu) const { return c[mu]; }; + HOST const int &operator[](int mu) const { return c[mu]; }; //! Component-wise multiplication, (x1*x2, y1*y2, z1*z2, t1*t2) - __host__ LatticeDimensions operator*(const LatticeDimensions lhs) const { + HOST LatticeDimensions operator*(const LatticeDimensions lhs) const { LatticeDimensions ret; for (int i = 0; i < 4; i++) ret.c[i] = c[i] * lhs.c[i]; return ret; } //! Component-wise division, (x1/x2, y1/y2, z1/z2, t1/t2) - __host__ LatticeDimensions operator/(const LatticeDimensions lhs) const { + HOST LatticeDimensions operator/(const LatticeDimensions lhs) const { LatticeDimensions ret; for (int i = 0; i < 4; i++) ret.c[i] = c[i] / lhs.c[i]; return ret; } //! modulo operation that returns coordinates within 0<=x= 4)) throw std::runtime_error(stdLogger.fatal("Wrong mu in LatticeDimensions")); c[mu] += ((plus) ? (1) : (-1)); } //! Formatted (debug) output - __host__ friend std::ostream &operator<<(std::ostream &str, + HOST friend std::ostream &operator<<(std::ostream &str, const LatticeDimensions &in) { str << "( "; for (int i = 0; i < 4; i++) str << in.c[i] << " "; @@ -124,13 +125,13 @@ public : //! Return all four entries multiplied - __host__ long mult() const { + HOST long mult() const { long res = 1; for (int i = 0; i < 4; i++) res *= (long) c[i]; return res; } - __host__ long summed() const { + HOST long summed() const { long res = 0; for (int i = 0; i < 4; i++) res += (long) c[i]; return res; @@ -138,11 +139,11 @@ public : //! Return if x,y,z,t are 0<=x= c[i])) return false; return true; @@ -150,13 +151,13 @@ public : //! Return an offset matching given coordinates. With input x,y,z,t //! 
this returns x + y*LX + z*LX*LY + t*LX*LY*LZ - __host__ size_t offset(const LatticeDimensions &in) const { + HOST size_t offset(const LatticeDimensions &in) const { size_t ret = in[0] + c[0] * in[1] + c[0] * c[1] * in[2] + c[0] * c[1] * c[2] * in[3]; return ret; } //! Return the lowest entry - __host__ int lowest_value() const { + HOST int lowest_value() const { int res = c[1]; for (int i = 0; i < 4; i++) if (c[i] < res)res = c[i]; @@ -164,7 +165,7 @@ public : } //! Return the lowest entry - __host__ int lowest_spatial_value() const { + HOST int lowest_spatial_value() const { int res = c[1]; for (int i = 0; i < 3; i++) if (c[i] < res)res = c[i]; diff --git a/src/base/communication/HaloLoop.h b/src/base/communication/HaloLoop.h index 30ffe9ce..3bd86e9a 100644 --- a/src/base/communication/HaloLoop.h +++ b/src/base/communication/HaloLoop.h @@ -32,7 +32,7 @@ struct ExtractInnerHaloSeg { ExtractInnerHaloSeg(Accessor acc, Accessor hal_acc) : _acc(acc), _hal_acc(hal_acc) {} - inline __host__ __device__ void operator()(HaloSite site) { + inline HOST_DEVICE void operator()(HaloSite site) { for (size_t mu = 0; mu < ElemCount; mu++) { size_t index = _acc.template getIndexComm(site.LatticeIndex, mu); @@ -79,16 +79,21 @@ class extractLoop { ExtractInnerHaloSeg extractLeft(acc, hal_acc); #ifdef DYNAMIC_HALO_LOOP +#ifdef USE_CPU_ONLY + void* stream = nullptr; +#else + GPUSTREAM_T_ stream = segmentInfo.getDeviceStream(streamNo); +#endif iterateFunctorNoReturn(extractLeft, CalcInnerHaloSegIndexComm( hseg, subIndex), - length, 1, 1, segmentInfo.getDeviceStream(streamNo)); + length, 1, 1, stream); #else iterateFunctorNoReturn(extractLeft, CalcInnerHaloSegIndexComm(), - length, 1, 1, segmentInfo.getDeviceStream(streamNo)); + length, 1, 1, stream); #endif - +#ifndef USE_CPU_ONLY if (info.p2p && onDevice && commBase.useGpuP2P()) { deviceEventPair &p2pCopyEvent = HalInfo.getMyGpuEventPair(hseg, dir, leftRight); p2pCopyEvent.start.record(segmentInfo.getDeviceStream()); @@ -106,6 +111,7 @@ 
class extractLoop { deviceEventPair &p2pCopyEvent = HalInfo.getMyGpuEventPair(hseg, dir, leftRight); p2pCopyEvent.stop.record(segmentInfo.getDeviceStream()); } +#endif } } } @@ -142,7 +148,7 @@ struct InjectOuterHaloSeg { _acc(acc), _hal_acc(hal_acc) { } - inline __host__ __device__ void operator()(HaloSite site) { + inline HOST_DEVICE void operator()(HaloSite site) { for (size_t mu = 0; mu < ElemCount; mu++) { size_t index = _acc.template getIndexComm(site.LatticeIndex, mu); @@ -180,7 +186,7 @@ class injectLoop { if (size != 0) { int streamNo = 1; - +#ifndef USE_CPU_ONLY if (info.p2p && onDevice && commBase.useGpuP2P()) { deviceEvent &p2pCopyEvent = HalInfo.getGpuEventPair(hseg, dir, leftRight).stop; p2pCopyEvent.streamWaitForMe(segmentInfo.getDeviceStream(streamNo)); @@ -189,6 +195,7 @@ class injectLoop { if (onDevice && commBase.useGpuP2P() && info.sameRank) { segmentInfo.synchronizeStream(0); } +#endif if (!onDevice || (onDevice && !commBase.useGpuP2P())) { segmentInfo.synchronizeRequest(); } @@ -197,15 +204,20 @@ class injectLoop { Accessor(pointer, size)); #ifdef DYNAMIC_HALO_LOOP +#ifdef USE_CPU_ONLY + void* stream = nullptr; +#else + GPUSTREAM_T_ stream = segmentInfo.getDeviceStream(streamNo); +#endif iterateFunctorNoReturn(injectLeft, CalcOuterHaloSegIndexComm( hseg, subIndex), - length, 1, 1, segmentInfo.getDeviceStream(streamNo)); + length, 1, 1, stream); #else iterateFunctorNoReturn(injectLeft, CalcOuterHaloSegIndexComm(), - length, 1, 1, segmentInfo.getDeviceStream(streamNo)); + length, 1, 1, stream); #endif } } diff --git a/src/base/communication/calcGSiteHalo_dynamic.h b/src/base/communication/calcGSiteHalo_dynamic.h index 2d39053c..9b62e371 100644 --- a/src/base/communication/calcGSiteHalo_dynamic.h +++ b/src/base/communication/calcGSiteHalo_dynamic.h @@ -27,7 +27,7 @@ struct CalcOuterHaloIndexComm { typedef HaloIndexer HInd; typedef GIndexer GInd; - inline __host__ __device__ HaloSite + inline HOST_DEVICE HaloSite operator()(const dim3 &blockDim, 
const uint3 &blockIdx, const uint3 &threadIdx) { HaloSite site; @@ -46,7 +46,7 @@ struct CalcInnerHaloIndexComm { typedef HaloIndexer HInd; typedef GIndexer GInd; - inline __host__ __device__ HaloSite + inline HOST_DEVICE HaloSite operator()(const dim3 &blockDim, const uint3 &blockIdx, const uint3 &threadIdx) { HaloSite site; @@ -69,7 +69,7 @@ struct CalcOuterHaloSegCoord{ CalcOuterHaloSegCoord(HaloSegment hseg, short leftRight) : hseg(hseg), leftRight(leftRight){} - inline __host__ __device__ sitexyzt + inline HOST_DEVICE sitexyzt operator()(size_t LocHalIndex) { sitexyzt coord(0, 0, 0, 0); @@ -102,7 +102,7 @@ struct CalcOuterHaloSegIndexComm{ CalcOuterHaloSegIndexComm(HaloSegment hseg, short leftRight) : calcSegCoord(hseg,leftRight){} - inline __host__ __device__ HaloSite + inline HOST_DEVICE HaloSite operator()(const dim3 &blockDim, const uint3 &blockIdx, const uint3 &threadIdx) { HaloSite site; @@ -125,7 +125,7 @@ struct CalcInnerSegCoord{ CalcInnerSegCoord(HaloSegment hseg, short leftRight) : hseg(hseg), leftRight(leftRight){} - inline __host__ __device__ sitexyzt + inline HOST_DEVICE sitexyzt operator()(size_t LocHalIndex) { sitexyzt coord(0, 0, 0, 0); @@ -160,7 +160,7 @@ struct CalcInnerHaloSegCoord{ CalcInnerHaloSegCoord(HaloSegment hseg, short leftRight) : hseg(hseg), leftRight(leftRight){} - inline __host__ __device__ sitexyzt + inline HOST_DEVICE sitexyzt operator()(size_t LocHalIndex) { sitexyzt coord(0, 0, 0, 0); @@ -192,7 +192,7 @@ struct CalcInnerHaloSegIndexComm{ CalcInnerHaloSegIndexComm(HaloSegment hseg, short leftRight) : calcSegCoord(hseg,leftRight){} - inline __host__ __device__ HaloSite + inline HOST_DEVICE HaloSite operator()(const dim3 &blockDim, const uint3 &blockIdx, const uint3 &threadIdx) { HaloSite site; @@ -214,7 +214,7 @@ struct CalcGSiteHaloSeg { CalcGSiteHaloSeg(CalcIndexOp calcIndexOp, HaloSegment hseg, short leftRight) : calcIndexOp(calcIndexOp), calcSegCoord(hseg,leftRight) { } - inline __host__ __device__ auto + inline 
HOST_DEVICE auto operator()(size_t HaloIndex, size_t mu) { sitexyzt coord = calcSegCoord(HaloIndex); @@ -234,7 +234,7 @@ struct CalcGSiteInnerHalo { CalcGSiteInnerHalo(CalcIndexOp calcIndexOp) : calcIndexOp(calcIndexOp) { } - inline __host__ __device__ auto + inline HOST_DEVICE auto operator()(size_t HaloIndex, size_t mu) { sitexyzt coord = HInd::getInnerCoord(HaloIndex); auto site = calcIndexOp(GInd::getSite(coord.x, coord.y, coord.z, coord.t), mu); @@ -252,7 +252,7 @@ struct CalcGSiteCenter { CalcGSiteCenter(CalcIndexOp calcIndexOp) : calcIndexOp(calcIndexOp) { } - inline __host__ __device__ auto + inline HOST_DEVICE auto operator()(size_t HaloIndex, size_t mu) { sitexyzt coord = HInd::getCenterCoord(HaloIndex); auto site = calcIndexOp(GInd::getSite(coord.x, coord.y, coord.z, coord.t), mu); diff --git a/src/base/communication/communicationBase_mpi.cpp b/src/base/communication/communicationBase_mpi.cpp index 2950f3b9..75db47f8 100644 --- a/src/base/communication/communicationBase_mpi.cpp +++ b/src/base/communication/communicationBase_mpi.cpp @@ -551,8 +551,10 @@ int CommunicationBase::updateSegment(HaloSegment hseg, size_t direction, int rank = info.world_rank; - gpuError_t gpuErr; + if (seg.getLength() != 0) { +#ifndef USE_CPU_ONLY + gpuError_t gpuErr; if (onDevice && info.p2p && useGpuP2P()) { // communicate via GPUDirect uint8_t *sendBase = seg.getMyDeviceSourcePtr(); @@ -590,7 +592,9 @@ int CommunicationBase::updateSegment(HaloSegment hseg, size_t direction, MPI_Isend(sendBase, 1, seg.getMpiType(), rank, indexDest, cart_comm, &seg.getRequestSend()); MPI_Irecv(recvBase, 1, seg.getMpiType(), rank, index, cart_comm, &seg.getRequestRecv()); - } else { + } else +#endif + { uint8_t *sendBase = seg.getHostSendPtr(); uint8_t *recvBase = seg.getHostRecvPtr(); diff --git a/src/base/communication/deviceEvent.h b/src/base/communication/deviceEvent.h index 061a7e2c..f25bebfc 100644 --- a/src/base/communication/deviceEvent.h +++ b/src/base/communication/deviceEvent.h @@ 
-8,6 +8,8 @@ #ifndef DEVICEEVENT_H #define DEVICEEVENT_H +#ifndef USE_CPU_ONLY + #include "../../define.h" #include "../gutils.h" #include "../wrapper/gpu_wrapper.h" @@ -103,4 +105,5 @@ class deviceEvent { } }; +#endif #endif //DEVICEEVENT_H diff --git a/src/base/communication/deviceStream.h b/src/base/communication/deviceStream.h index f5c58588..c8264cab 100644 --- a/src/base/communication/deviceStream.h +++ b/src/base/communication/deviceStream.h @@ -8,6 +8,8 @@ #ifndef DEVICESTREAM_H #define DEVICESTREAM_H +#ifndef USE_CPU_ONLY + #include "../../define.h" #include "../gutils.h" @@ -71,4 +73,5 @@ class deviceStream { }; +#endif #endif //DEVICEEVENT_H diff --git a/src/base/communication/gpuIPC.h b/src/base/communication/gpuIPC.h index 10411ef2..8407885f 100644 --- a/src/base/communication/gpuIPC.h +++ b/src/base/communication/gpuIPC.h @@ -8,6 +8,7 @@ #ifndef GPUIPC_H #define GPUIPC_H +#ifndef USE_CPU_ONLY #include #include "../../define.h" #include @@ -434,4 +435,5 @@ class gpuIPCEvent { }; +#endif #endif //GPUIPC_H diff --git a/src/base/communication/haloOffsetInfo.h b/src/base/communication/haloOffsetInfo.h index f9bf6df9..e882bf2a 100644 --- a/src/base/communication/haloOffsetInfo.h +++ b/src/base/communication/haloOffsetInfo.h @@ -38,7 +38,11 @@ struct HaloSegmentInfo { bool recvReqUsed = false; MPI_Request hostRequestSend = 0; MPI_Request hostRequestRecv = 0; +#ifndef USE_CPU_ONLY std::vector deviceStream; +#else + std::vector deviceStream; +#endif std::vector streamUsed; public: @@ -146,13 +150,13 @@ struct HaloSegmentInfo { recvReqUsed = true; return hostRequestRecv; } - +#ifndef USE_CPU_ONLY gpuStream_t &getDeviceStream(int ind = 0) { streamUsed[ind] = true; return deviceStream[ind]; } - +#endif void init(__attribute__((unused)) int deviceStreamCount = 2) { #ifndef CPUONLY @@ -165,6 +169,7 @@ struct HaloSegmentInfo { hostRequestRecv = MPI_REQUEST_NULL; } +#ifndef USE_CPU_ONLY int addDeviceStream() { deviceStream.emplace_back(); @@ -176,12 +181,16 @@ struct 
HaloSegmentInfo { streamUsed[lastIndex] = false; return lastIndex; } +#endif void synchronizeAll() { +#ifndef USE_CPU_ONLY synchronizeStream(); +#endif synchronizeRequest(); } +#ifndef USE_CPU_ONLY void synchronizeStream(int streamIndex = -1) { if (streamIndex == -1){ for (unsigned int i = 0; i < deviceStream.size();i++){ @@ -200,6 +209,7 @@ struct HaloSegmentInfo { } } } +#endif void synchronizeRequest(int sendRecv = -1) { @@ -233,7 +243,7 @@ struct HaloSegmentInfo { uint8_t *getMyDeviceDestinationPtr() { return &destinationBase->template getPointer()[reverseOffset]; } - +#ifndef USE_CPU_ONLY uint8_t *getDeviceDestinationPtrP2P() { return destinationBase->getOppositeP2PPointer(oppositeP2PRank) + reverseOffset; } @@ -241,12 +251,15 @@ struct HaloSegmentInfo { uint8_t *getDeviceDestinationPtrGPUAwareMPI() { return &destinationBase->template getPointer()[offset]; } +#endif ~HaloSegmentInfo() { +#ifndef USE_CPU_ONLY for (auto & i : deviceStream) { gpuError_t gpuErr = gpuStreamDestroy(i); if (gpuErr != gpuSuccess) GpuError("haloOffsetInfo.h: gpuStreamDestroy", gpuErr); } +#endif if ((hostRequestSend != MPI_REQUEST_NULL) && (hostRequestSend != 0)) { MPI_Request_free(&hostRequestSend); @@ -280,7 +293,11 @@ class HaloOffsetInfo { std::array _stripeInfo; std::array _cornerInfo; +#ifndef USE_CPU_ONLY gpuIPCEvent _cIPCEvent; +#else + void* _cIPCEvent; +#endif public: @@ -299,8 +316,11 @@ class HaloOffsetInfo { sendBaseP2P(MemoryManagement::getMemAt("sendBaseP2P")), recvBaseP2P(MemoryManagement::getMemAt("recvBaseP2P")), _gpuAwareMPI(gpuAwareMPI), - _gpuP2P(gpuP2P), - _cIPCEvent(cart_comm, _myRank) { + _gpuP2P(gpuP2P) +#ifndef USE_CPU_ONLY + ,_cIPCEvent(cart_comm, _myRank) +#endif + { for (auto &HypPlane : HaloHypPlanes) { _HalSegMapLeft[HypPlane] = _hypPlaneInfo.data(); @@ -420,6 +440,7 @@ class HaloOffsetInfo { recvBaseP2P = ptr; } +#ifndef USE_CPU_ONLY void initP2P() { if (_gpuP2P && onDevice) { sendBaseP2P->initP2P(cart_comm, _myRank); @@ -427,6 +448,7 @@ class 
HaloOffsetInfo { // _cIPCEvent = gpuIPCEvent(cart_comm,_myRank); } } +#endif gMemoryPtr getRecvBaseP2P() { return recvBaseP2P; @@ -435,7 +457,7 @@ class HaloOffsetInfo { void setMemoryPointer(HaloSegmentInfo &segInfo, ProcessInfo &neighborInfo, int index, int oppositeIndex) { segInfo.setRecvBase(recvBase); segInfo.setSendBase(sendBase); - +#ifndef USE_CPU_ONLY if (onDevice) { segInfo.setSourceBase(sendBaseP2P); if (_gpuP2P || _gpuAwareMPI)segInfo.setDestinationBase(recvBaseP2P); @@ -453,8 +475,10 @@ class HaloOffsetInfo { _cIPCEvent.addP2PRank(index, oppositeIndex, neighborInfo.world_rank); } } +#endif } +#ifndef USE_CPU_ONLY void syncAndInitP2PRanks() { if (onDevice && _gpuP2P) { sendBaseP2P->syncAndInitP2PRanks(); @@ -462,6 +486,7 @@ class HaloOffsetInfo { _cIPCEvent.syncAndInitAllP2PRanks(); } } +#endif void exchangeHandles() { for (const HaloSegment &hseg : AllHaloSegments) { @@ -517,6 +542,7 @@ class HaloOffsetInfo { } } +#ifndef USE_CPU_ONLY deviceEventPair &getGpuEventPair(HaloSegment hseg, size_t direction, bool leftRight) { int oppositeIndex = haloSegmentCoordToIndex(hseg, direction, !leftRight); int rank = neighbor_info.getNeighborInfo(hseg, direction, leftRight).world_rank; @@ -528,6 +554,7 @@ class HaloOffsetInfo { int rank = neighbor_info.getNeighborInfo(hseg, direction, leftRight).world_rank; return _cIPCEvent.getMyEventPair(index, rank); } +#endif HaloSegmentInfo &get(HaloSegment hseg, size_t direction, bool leftRight) { size_t pos_l = getSegTypeOffset(hseg, direction); diff --git a/src/base/communication/neighborInfo.h b/src/base/communication/neighborInfo.h index 649f6580..b853d1e3 100644 --- a/src/base/communication/neighborInfo.h +++ b/src/base/communication/neighborInfo.h @@ -62,8 +62,11 @@ class NeighborInfo { ProcessInfo _XYZT[8][2]; ProcessInfo fail; - +#ifndef USE_CPU_ONLY gpuDeviceProp myProp; +#else + void* myProp; +#endif inline void _fill2DNeighbors(ProcessInfo array[][2], int mu, int nu); @@ -427,6 +430,7 @@ inline void 
NeighborInfo::exchangeProcessInfo() { rootLogger.debug("> Neighbor information collected!"); } +#ifndef USE_CPU_ONLY inline bool NeighborInfo::IsGPUCapableP2P() const { // This requires two processes accessing each device, so we need // to ensure exclusive or prohibited mode is not set @@ -437,9 +441,7 @@ inline bool NeighborInfo::IsGPUCapableP2P() const { } - inline void NeighborInfo::checkP2P() { - //! This checks for (const HaloSegment &hseg : AllHaloSegments) { for (int dir = 0; dir < HaloSegmentDirections(hseg); dir++) { @@ -476,5 +478,7 @@ inline void NeighborInfo::checkP2P() { , "UVA ", "Unknown (HIP does not support this!)"); #endif } +#endif + #endif //NEIGHBORINFO_H diff --git a/src/base/communication/siteComm.h b/src/base/communication/siteComm.h index 667422a2..7b4a3b48 100644 --- a/src/base/communication/siteComm.h +++ b/src/base/communication/siteComm.h @@ -26,7 +26,7 @@ #include "../stopWatch.h" #include #include "../runFunctors.h" -#ifndef USE_HIP_AMD +#if defined(USE_CUDA) || defined(USE_HIP_NVIDIA) #include "nvToolsExt.h" #endif #include "deviceEvent.h" @@ -93,16 +93,20 @@ class siteComm : public RunFunctors { HaloInfo.setSendBase(_haloBuffer_Host); HaloInfo.setRecvBase(_haloBuffer_Host_recv); +#ifndef USE_CPU_ONLY if (onDevice) { HaloInfo.setSendBaseP2P(_haloBuffer_Device); if (commB.gpuAwareMPIAvail() || commB.useGpuP2P()) { HaloInfo.setRecvBaseP2P(_haloBuffer_Device_recv); } } +#endif HaloData haldat = HInd::getHalData(); +#ifndef USE_CPU_ONLY if (commB.getNumberProcesses() > 1 && onDevice) HaloInfo.initP2P(); +#endif for (const auto &hypPlane : HaloHypPlanes) { size_t pos_l = 0, pos_r = 0; @@ -161,9 +165,10 @@ class siteComm : public RunFunctors { off_r, haldat.get_SubHaloSize(pos_r, LatLayout) * _halElementSize); } } - if (commB.getNumberProcesses() > 1 && onDevice) HaloInfo.syncAndInitP2PRanks(); #ifndef CPUONLY + if (commB.getNumberProcesses() > 1 && onDevice) HaloInfo.syncAndInitP2PRanks(); + gpuError_t gpuErr = gpuDeviceSynchronize(); if 
(gpuErr != gpuSuccess) { GpuError("siteComm.h: siteComm constructor, gpuDeviceSynchronize failed:", gpuErr); @@ -215,8 +220,10 @@ class siteComm : public RunFunctors { void updateAll(unsigned int param = AllTypes | COMM_BOTH) { - gpuError_t gpuErr; +#ifndef USE_CPU_ONLY + gpuError_t gpuErr; +#endif /// A check that we don't have multiGPU and halosize=0: if (_commBase.getNumberProcesses() != 1 && HaloDepth == 0) { throw std::runtime_error(stdLogger.fatal("Useless call of CommunicationBase.updateAll() with multiGPU and HaloDepth=0!")); @@ -234,7 +241,7 @@ class siteComm : public RunFunctors { } if (commtype & COMM_START) { - +#ifndef USE_CPU_ONLY if (onDevice) { if (!(_commBase.gpuAwareMPIAvail() || _commBase.useGpuP2P())) { _extractHalos(getAccessor(), _haloBuffer_Device->template getPointer()); @@ -249,7 +256,9 @@ class siteComm : public RunFunctors { _extractHalosSeg(getAccessor(), _haloBuffer_Device->template getPointer(), param); } - } else { + } else +#endif + { _extractHalos(getAccessor(), _haloBuffer_Host->template getPointer()); _commBase.updateAll(HaloInfo, COMM_START | haltype); } @@ -257,7 +266,7 @@ class siteComm : public RunFunctors { if (commtype & COMM_FINISH) { - +#ifndef USE_CPU_ONLY if (onDevice) { if (_commBase.gpuAwareMPIAvail() || _commBase.useGpuP2P()) { _injectHalosSeg(getAccessor(), _haloBuffer_Device_recv->template getPointer(), @@ -271,7 +280,9 @@ class siteComm : public RunFunctors { GpuError("_haloBuffer_Device: Failed to copy to device", gpuErr); _injectHalos(getAccessor(), _haloBuffer_Device->template getPointer()); } - } else { + } else +#endif + { _commBase.updateAll(HaloInfo, COMM_FINISH | haltype); _injectHalos(getAccessor(), _haloBuffer_Host_recv->template getPointer()); } @@ -299,7 +310,7 @@ struct ExtractInnerHalo { } } - inline __host__ __device__ void operator()(HaloSite site) { + inline HOST_DEVICE void operator()(HaloSite site) { Accessor _hal_acc(pointer[site.HalNumber], size[site.HalNumber]); @@ -343,7 +354,7 @@ struct 
InjectOuterHalo { } } - inline __host__ __device__ void operator()(HaloSite site) { + inline HOST_DEVICE void operator()(HaloSite site) { Accessor _hal_acc(pointer[site.HalNumber], size[site.HalNumber]); for (size_t mu = 0; mu < ElemCount; mu++) { @@ -377,12 +388,12 @@ void siteComm loop(acc, _commBase, HaloInfo, HaloBuffer, param); @@ -399,10 +410,12 @@ void siteComm(ceilf(static_cast(elems) / static_cast(blockDim.x))); @@ -36,6 +36,7 @@ __host__ void inline compute_dim3(dim3 &blockDim, dim3 &gridDim, /** * Utility class to report errors in GPU code. */ +#ifndef USE_CPU_ONLY class GpuError { public: explicit GpuError(gpuError_t err); @@ -49,13 +50,11 @@ class GpuError { private: gpuError_t gpuErr; }; - +#endif /** * Utility method for speedy testing of whether a number is odd */ -__device__ __host__ inline bool isOdd(int cand) { return (cand & 0x1); } - - +HOST_DEVICE inline bool isOdd(int cand) { return (cand & 0x1); } #endif /* UTIL_H */ diff --git a/src/base/indexer/BulkIndexer.h b/src/base/indexer/BulkIndexer.h index 684cb45d..c9f32ccf 100644 --- a/src/base/indexer/BulkIndexer.h +++ b/src/base/indexer/BulkIndexer.h @@ -16,6 +16,7 @@ #ifndef INDEXERDEVICE #define INDEXERDEVICE + #include "../../define.h" #include "../gutils.h" #include @@ -29,8 +30,8 @@ struct sitexyzt { int y; int z; int t; - __device__ __host__ sitexyzt(int x, int y, int z, int t) : x(x), y(y), z(z), t(t) {}; - __device__ __host__ inline int& operator[](const int i) { + HOST_DEVICE sitexyzt(int x, int y, int z, int t) : x(x), y(y), z(z), t(t) {}; + HOST_DEVICE inline int& operator[](const int i) { if(i == 0) return x; if(i == 1) return y; if(i == 2) return z; @@ -57,20 +58,20 @@ struct gSite { sitexyzt coord, coordFull; // These constructors should only be called from GIndexer. 
- __device__ __host__ inline gSite() : isite(0), isiteFull(0), coord(0, 0, 0, 0), coordFull(0, 0, 0, 0) {} + HOST_DEVICE inline gSite() : isite(0), isiteFull(0), coord(0, 0, 0, 0), coordFull(0, 0, 0, 0) {} - __device__ __host__ inline gSite(size_t isite, size_t isiteFull, sitexyzt coord, sitexyzt coordFull) : + HOST_DEVICE inline gSite(size_t isite, size_t isiteFull, sitexyzt coord, sitexyzt coordFull) : isite(isite), isiteFull(isiteFull), coord(coord), coordFull(coordFull) {}; - __host__ friend inline std::ostream &operator << (std::ostream &s, const gSite &site) { + HOST friend inline std::ostream &operator << (std::ostream &s, const gSite &site) { s << "gSite: coord: " << site.coord.x << " " << site.coord.y << " " << site.coord.z << " " << site.coord.t << " " << "coordFull: " << site.coordFull.x << " " << site.coordFull.y << " " << site.coordFull.z << " " << site.coordFull.t << " " << "isite: " << site.isite << " isiteFull: " << site.isiteFull; return s; } - __host__ inline std::string getStr() { + HOST inline std::string getStr() { std::ostringstream s; s << "gSite: coord: " << coord.x << " " << coord.y << " " << coord.z << " " << coord.t << " " << "coordFull: " << coordFull.x << " " << coordFull.y << " " << coordFull.z << " " << coordFull.t << " " @@ -84,18 +85,18 @@ struct gSiteStack : public gSite { size_t isiteStackFull; size_t stack; - __device__ __host__ gSiteStack() : gSite(), isiteStack(0), isiteStackFull(0), stack(0){} + HOST_DEVICE gSiteStack() : gSite(), isiteStack(0), isiteStackFull(0), stack(0){} - __device__ __host__ gSiteStack(size_t isite, size_t isiteFull, sitexyzt coord, + HOST_DEVICE gSiteStack(size_t isite, size_t isiteFull, sitexyzt coord, sitexyzt coordFull, size_t isiteStack, size_t isiteStackFull, size_t stack) : gSite(isite, isiteFull, coord, coordFull), isiteStack(isiteStack), isiteStackFull(isiteStackFull), stack(stack){} - __device__ __host__ gSiteStack(gSite site, size_t isiteStack, size_t isiteStackFull, size_t stack) : + 
HOST_DEVICE gSiteStack(gSite site, size_t isiteStack, size_t isiteStackFull, size_t stack) : gSite(site), isiteStack(isiteStack), isiteStackFull(isiteStackFull), stack(stack){} gSiteStack(const gSite) = delete; - __host__ friend inline std::ostream &operator << (std::ostream &s, const gSiteStack &site) { + HOST friend inline std::ostream &operator << (std::ostream &s, const gSiteStack &site) { s << "gSiteStack: coord: " << site.coord.x << " " << site.coord.y << " " << site.coord.z << " " << site.coord.t << " " << " coordFull: " << site.coordFull.x << " " << site.coordFull.y << " " << site.coordFull.z << " " << site.coordFull.t << " " << " isite: " << site.isite << " isiteFull: " << site.isiteFull << " stack: " << site.stack @@ -109,19 +110,19 @@ struct gSiteMu : public gSite { // Link direction. uint8_t mu; - __device__ __host__ gSiteMu() : gSite(), indexMuFull(0), mu(0){} + HOST_DEVICE gSiteMu() : gSite(), indexMuFull(0), mu(0){} - __device__ __host__ gSiteMu(size_t isite, size_t isiteFull, sitexyzt coord, + HOST_DEVICE gSiteMu(size_t isite, size_t isiteFull, sitexyzt coord, sitexyzt coordFull, size_t indexMuFull, uint8_t mu) : gSite(isite, isiteFull, coord, coordFull), indexMuFull(indexMuFull), mu(mu){} - __device__ __host__ gSiteMu(gSite site, size_t indexMuFull, uint8_t mu) + HOST_DEVICE gSiteMu(gSite site, size_t indexMuFull, uint8_t mu) : gSite(site), indexMuFull(indexMuFull), mu(mu) {} gSiteMu(const gSite) = delete; gSiteMu(const gSiteStack) = delete; - __host__ friend inline std::ostream &operator << (std::ostream &s, const gSiteMu &site) { + HOST friend inline std::ostream &operator << (std::ostream &s, const gSiteMu &site) { s << "gSite: coord: " << site.coord.x << " " << site.coord.y << " " << site.coord.z << " " << site.coord.t << " " << "coordFull: " << site.coordFull.x << " " << site.coordFull.y << " " << site.coordFull.z << " " << site.coordFull.t << " " << "isite: " << site.isite @@ -133,14 +134,14 @@ struct gSiteMu : public gSite { }; //! 
you can use these print functions for debugging, but in production code they are unused: -__attribute__((unused)) void __host__ __device__ inline printGSite(const gSite& site) { +__attribute__((unused)) void HOST_DEVICE inline printGSite(const gSite& site) { printf("Coord: %d %d %d %d, coordFull: %d %d %d %d, isite: %lu, isiteFull %lu\n", site.coord.x, site.coord.y, site.coord.z, site.coord.t, site.coordFull.x, site.coordFull.y, site.coordFull.z, site.coordFull.t, site.isite, site.isiteFull); } -__attribute__((unused)) void __host__ __device__ inline printGSiteStack(const gSiteStack& site) { +__attribute__((unused)) void HOST_DEVICE inline printGSiteStack(const gSiteStack& site) { printf("Coord: %d %d %d %d, coordFull: %d %d %d %d, isite: %lu, isiteFull %lu, stack: %lu, isiteStack: %lu, isiteStackFull %lu\n", site.coord.x, site.coord.y, site.coord.z, site.coord.t, site.coordFull.x, site.coordFull.y, site.coordFull.z, site.coordFull.t, @@ -150,7 +151,7 @@ __attribute__((unused)) void __host__ __device__ inline printGSiteStack(const g site.isiteStack, site.isiteStackFull); } -__attribute__((unused)) void __host__ __device__ inline printGSiteStack(const gSiteMu& site){ +__attribute__((unused)) void HOST_DEVICE inline printGSiteStack(const gSiteMu& site){ printf("Coord: %d %d %d %d, coordFull: %d %d %d %d, isite: %lu, isiteFull %lu, mu: %d, indexMu_Full: %lu\n", site.coord.x, site.coord.y, site.coord.z, site.coord.t, site.coordFull.x, site.coordFull.y, site.coordFull.z, site.coordFull.t, @@ -181,7 +182,7 @@ struct LatticeData { LatticeData() {} - __host__ __device__ LatticeData(size_t _lx, size_t _ly, size_t _lz, size_t _lt, size_t _HaloDepth, unsigned int _Nodes[4], + HOST_DEVICE LatticeData(size_t _lx, size_t _ly, size_t _lz, size_t _lt, size_t _HaloDepth, unsigned int _Nodes[4], size_t _globX, size_t _globY, size_t _globZ, size_t _globT, size_t _gPosX, size_t _gPosY, size_t _gPosZ, size_t _gPosT) : @@ -224,7 +225,7 @@ struct LatticeData { gPosZ(_gPosZ), 
gPosT(_gPosT) {} - __device__ __host__ sitexyzt globalPos(sitexyzt n) { + HOST_DEVICE sitexyzt globalPos(sitexyzt n) { sitexyzt coord = sitexyzt(gPosX + n.x,gPosY + n.y,gPosZ + n.z,gPosT + n.t); @@ -236,7 +237,7 @@ struct LatticeData { return coord; } - __device__ __host__ bool isLocal(sitexyzt globalsite){ + HOST_DEVICE bool isLocal(sitexyzt globalsite){ //! make sure globalsite is valid, i.e. not negative or greater than lattice extents! // consider lattice 20 20 20 20 with split 2 2 1 1 @@ -264,7 +265,7 @@ struct LatticeData { return false; } - __host__ LatticeDimensions globalPos(LatticeDimensions n) { + HOST LatticeDimensions globalPos(LatticeDimensions n) { LatticeDimensions coord = LatticeDimensions(gPosX,gPosY,gPosZ,gPosT) + n; @@ -276,25 +277,27 @@ struct LatticeData { return coord; } - __host__ LatticeDimensions globalLattice() { + HOST LatticeDimensions globalLattice() { return LatticeDimensions(globLX,globLY,globLZ,globLT); } - __host__ LatticeDimensions localLattice() { + HOST LatticeDimensions localLattice() { return LatticeDimensions(lx,ly,lz,lt); } - __device__ __host__ sitexyzt globalLatticeXYZT() { + HOST_DEVICE sitexyzt globalLatticeXYZT() { return sitexyzt(globLX,globLY,globLZ,globLT); } }; -extern __device__ __constant__ struct LatticeData globLatDataGPU[MAXHALO + 1]; + +extern DEVICE CONSTANT struct LatticeData globLatDataGPU[MAXHALO + 1]; extern struct LatticeData globLatDataCPU[MAXHALO + 1]; /// --------------------------------------------------------------------------------------------- INDEXER INITIALIZATION + void initGPUBulkIndexer(size_t lx, size_t ly, size_t lz, size_t lt, sitexyzt globCoord, sitexyzt globPos, unsigned int Nodes[4]); void initCPUBulkIndexer(size_t lx, size_t ly, size_t lz, size_t lt, sitexyzt globCoord, sitexyzt globPos, unsigned int Nodes[4]); void initGPUHaloIndexer(size_t lx, size_t ly, size_t lz, size_t lt, unsigned int Nodes[4], unsigned int Halos[4]); @@ -310,8 +313,8 @@ void initIndexer(const size_t HaloDepth, 
const LatticeParameters ¶m, Communi template class GIndexer { public: - __device__ __host__ GIndexer() = default; - __device__ __host__ inline static LatticeData getLatData() { + HOST_DEVICE GIndexer() = default; + HOST_DEVICE inline static LatticeData getLatData() { #ifdef __GPU_ARCH__ return globLatDataGPU[HaloDepth]; @@ -322,7 +325,7 @@ class GIndexer { /// ---------------------------------------------------------------------------------------------------- getSite* /// BULK (NO HALOS) - __device__ __host__ inline static gSite getSite(size_t isite) { + HOST_DEVICE inline static gSite getSite(size_t isite) { sitexyzt coord(0, 0, 0, 0); sitexyzt coordFull(0, 0, 0, 0); size_t isiteFull = 0; @@ -341,10 +344,10 @@ class GIndexer { } return gSite(isite, isiteFull, coord, coordFull); } - __device__ __host__ inline static gSite getSite(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx) { + HOST_DEVICE inline static gSite getSite(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx) { return getSite(_blockDim.x * _blockIdx.x + _threadIdx.x); } - __device__ __host__ inline static gSite getSite(int x, int y, int z, int t) { + HOST_DEVICE inline static gSite getSite(int x, int y, int z, int t) { sitexyzt coord = sitexyzt(x, y, z, t); sitexyzt coordFull = coordToFullCoord(coord); size_t isite = 0; @@ -359,7 +362,7 @@ class GIndexer { } return gSite(isite, isiteFull, coord, coordFull); } - __device__ __host__ inline static gSite getSite(sitexyzt coord) { + HOST_DEVICE inline static gSite getSite(sitexyzt coord) { return getSite(coord.x,coord.y,coord.z,coord.t); } @@ -367,7 +370,7 @@ class GIndexer { happen whenever you call a kernel running over spacelike indices only. All coordinates will be of the form (x, y, z, 0). The indices isite and isiteFull will by bounded by their respective 3-volumes. 
The indexing needs to change, because there are fewer sites than with the full bulk.*/ - __device__ __host__ inline static gSite getSiteSpatial(size_t isite) { + HOST_DEVICE inline static gSite getSiteSpatial(size_t isite) { sitexyzt coord(0, 0, 0, 0); sitexyzt coordFull(0, 0, 0, 0); size_t isiteFull = 0; @@ -386,10 +389,11 @@ class GIndexer { } return gSite(isite, isiteFull, coord, coordFull); } - __device__ __host__ inline static gSite getSiteSpatial(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx) { + + HOST_DEVICE inline static gSite getSiteSpatial(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx) { return getSiteSpatial(_blockDim.x * _blockIdx.x + _threadIdx.x); } - __device__ __host__ inline static gSite getSiteSpatial(int x, int y, int z, int t) { + HOST_DEVICE inline static gSite getSiteSpatial(int x, int y, int z, int t) { // There is probably a way to allow t>0. My worry right now is that there is that if you allow // t>0, there is no longer a one-to-one correspondence between isite and coord. 
sitexyzt coord = sitexyzt(x, y, z, t); @@ -407,7 +411,7 @@ class GIndexer { } /// FULL (WITH HALOS) - __device__ __host__ inline static gSite getSiteFull(size_t isiteFull) { + HOST_DEVICE inline static gSite getSiteFull(size_t isiteFull) { sitexyzt coord(0, 0, 0, 0); sitexyzt coordFull(0, 0, 0, 0); size_t isite = 0; @@ -426,10 +430,12 @@ class GIndexer { } return gSite(isite, isiteFull, coord, coordFull); } - __device__ __host__ inline static gSite getSiteFull(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx) { + + HOST_DEVICE inline static gSite getSiteFull(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx) { return getSiteFull(_blockDim.x * _blockIdx.x + _threadIdx.x); } - __device__ __host__ inline static gSite getSiteFull(int x, int y, int z, int t) { + + HOST_DEVICE inline static gSite getSiteFull(int x, int y, int z, int t) { sitexyzt coordFull = sitexyzt(x, y, z, t); sitexyzt coord = fullCoordToCoord(coordFull); size_t isite = 0; @@ -443,11 +449,11 @@ class GIndexer { } return gSite(isite, isiteFull, coord, coordFull); } - __device__ __host__ inline static gSite getSiteFull(sitexyzt coordfull) { + HOST_DEVICE inline static gSite getSiteFull(sitexyzt coordfull) { return getSiteFull(coordfull.x,coordfull.y,coordfull.z,coordfull.t); } - __device__ __host__ inline static gSite getSiteSpatialFull(size_t isiteFull) { + HOST_DEVICE inline static gSite getSiteSpatialFull(size_t isiteFull) { sitexyzt coord(0, 0, 0, 0); sitexyzt coordFull(0, 0, 0, 0); size_t isite = 0; @@ -466,10 +472,11 @@ class GIndexer { } return gSite(isite, isiteFull, coord, coordFull); } - __device__ __host__ inline static gSite getSiteSpatialFull(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx) { + + HOST_DEVICE inline static gSite getSiteSpatialFull(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx) { return getSiteSpatialFull(_blockDim.x * _blockIdx.x + _threadIdx.x); } - __device__ __host__ inline static 
gSite getSiteSpatialFull(int x, int y, int z, int t) { + HOST_DEVICE inline static gSite getSiteSpatialFull(int x, int y, int z, int t) { sitexyzt coordFull = sitexyzt(x, y, z, t); sitexyzt coord = fullCoordToCoord(coordFull); size_t isite = 0; @@ -488,62 +495,62 @@ class GIndexer { /// BULK (NO HALOS) //! two helper functions for getSiteMu* - __device__ __host__ inline static size_t coordMuToIndexMu_Full(const int x, const int y, const int z, const int t, const int mu) { + HOST_DEVICE inline static size_t coordMuToIndexMu_Full(const int x, const int y, const int z, const int t, const int mu) { return (((x + y*getLatData().vol1Full + z*getLatData().vol2Full + t*getLatData().vol3Full) >> 0x1) // integer division by two +getLatData().sizehFull*((x + y + z + t) & 0x1) // 0 if x+y+z+t is even, 1 if it is odd + mu*getLatData().vol4Full); } - __device__ __host__ inline static size_t indexMu_Full(const gSite site, const int mu) { + HOST_DEVICE inline static size_t indexMu_Full(const gSite site, const int mu) { return coordMuToIndexMu_Full(site.coordFull.x, site.coordFull.y, site.coordFull.z, site.coordFull.t, mu); } - __device__ __host__ inline static gSiteMu getSiteMu(size_t isite, size_t mu) { + HOST_DEVICE inline static gSiteMu getSiteMu(size_t isite, size_t mu) { gSite site(getSite(isite)); size_t indexmufull = indexMu_Full(site, mu); return gSiteMu(site, indexmufull, mu); } - __device__ __host__ inline static gSiteMu getSiteMu(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx, size_t mu){ + HOST_DEVICE inline static gSiteMu getSiteMu(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx, size_t mu){ return getSiteMu(_blockDim.x * _blockIdx.x + _threadIdx.x, mu); } - __device__ __host__ inline static gSiteMu getSiteMu(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx){ + HOST_DEVICE inline static gSiteMu getSiteMu(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx){ //! 
It gets the mu index from the y direction of the block. return getSiteMu(_blockDim.x * _blockIdx.x + _threadIdx.x, _threadIdx.y); } - __device__ __host__ inline static gSiteMu getSiteMu(gSite site, size_t mu) { + HOST_DEVICE inline static gSiteMu getSiteMu(gSite site, size_t mu) { size_t indexmufull = indexMu_Full(site, mu); return gSiteMu(site, indexmufull, mu); } - __device__ __host__ inline static gSiteMu getSiteMu(int x, int y, int z, int t, size_t mu){ + HOST_DEVICE inline static gSiteMu getSiteMu(int x, int y, int z, int t, size_t mu){ return getSiteMu(getSite(x, y, z, t), mu); } - __device__ __host__ inline static gSiteMu getSiteSpatialMu(size_t isite, size_t mu) { + HOST_DEVICE inline static gSiteMu getSiteSpatialMu(size_t isite, size_t mu) { gSite site(getSiteSpatial(isite)); size_t indexmufull = indexMu_Full(site, mu); return gSiteMu(site, indexmufull, mu); } /// FULL (WITH HALOS) - __device__ __host__ inline static gSiteMu getSiteMuFull(size_t isiteFull, size_t mu) { + HOST_DEVICE inline static gSiteMu getSiteMuFull(size_t isiteFull, size_t mu) { gSite site(getSiteFull(isiteFull)); size_t indexmufull = indexMu_Full(site, mu); return gSiteMu(site, indexmufull, mu); } - __device__ __host__ inline static gSiteMu getSiteMuFull(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx, size_t mu){ + HOST_DEVICE inline static gSiteMu getSiteMuFull(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx, size_t mu){ return getSiteMuFull(_blockDim.x * _blockIdx.x + _threadIdx.x, mu); } - __device__ __host__ inline static gSiteMu getSiteMuFull(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx){ + HOST_DEVICE inline static gSiteMu getSiteMuFull(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx){ //!get the mu index from the y direction of the block. 
return getSiteMuFull(_blockDim.x * _blockIdx.x + _threadIdx.x, _threadIdx.y); } - __device__ __host__ inline static gSiteMu getSiteMuFull(int x, int y, int z, int t, size_t mu){ + HOST_DEVICE inline static gSiteMu getSiteMuFull(int x, int y, int z, int t, size_t mu){ return getSiteMu(getSiteFull(x, y, z, t), mu); } /// --------------------------------------------------------------------------------------------------- getSiteStack /// BULK (NO HALOS) - __device__ __host__ inline static gSiteStack getSiteStack(const gSite& site, const size_t stack){ + HOST_DEVICE inline static gSiteStack getSiteStack(const gSite& site, const size_t stack){ size_t isiteStack; size_t isiteStackFull; if (LatLayout == All) { @@ -556,23 +563,24 @@ class GIndexer { gSiteStack ret(site, isiteStack, isiteStackFull, stack); return ret; } - __device__ __host__ inline static gSiteStack getSiteStack(const size_t isite, const size_t stack){ + HOST_DEVICE inline static gSiteStack getSiteStack(const size_t isite, const size_t stack){ return getSiteStack(getSite(isite), stack); } - __device__ __host__ inline static gSiteStack getSiteStack(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx){ + + HOST_DEVICE inline static gSiteStack getSiteStack(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx){ return getSiteStack(_blockDim.x * _blockIdx.x + _threadIdx.x, _threadIdx.y); } - __device__ __host__ inline static gSiteStack getSiteStack(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx, const size_t stack){ + HOST_DEVICE inline static gSiteStack getSiteStack(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx, const size_t stack){ return getSiteStack(_blockDim.x * _blockIdx.x + _threadIdx.x, stack); } - __device__ __host__ inline static gSiteStack getSiteStack(int x, int y, int z, int t, int stack) { + HOST_DEVICE inline static gSiteStack getSiteStack(int x, int y, int z, int t, int stack) { return getSiteStack(getSite(x, y, 
z, t), stack); } - __device__ __host__ inline static gSiteStack getSiteStack(sitexyzt coord, int stack) { + HOST_DEVICE inline static gSiteStack getSiteStack(sitexyzt coord, int stack) { return getSiteStack(getSite(coord.x, coord.y, coord.z, coord.t), stack); } - __device__ __host__ inline static gSiteStack getSiteStackOdd(const gSite& site, const size_t stack){ + HOST_DEVICE inline static gSiteStack getSiteStackOdd(const gSite& site, const size_t stack){ size_t isiteStack; size_t isiteStackFull; if (LatLayout == All) { @@ -585,59 +593,60 @@ class GIndexer { gSiteStack ret(site, isiteStack, isiteStackFull, stack); return ret; } - __device__ __host__ inline static gSiteStack getSiteStackOdd(const size_t isite, const size_t stack){ + HOST_DEVICE inline static gSiteStack getSiteStackOdd(const size_t isite, const size_t stack){ return getSiteStackOdd(getSite(isite), stack); } - __device__ __host__ inline static gSiteStack getSiteStackOdd(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx){ + + HOST_DEVICE inline static gSiteStack getSiteStackOdd(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx){ return getSiteStackOdd(_blockDim.x * _blockIdx.x + _threadIdx.x, _threadIdx.y); } - __device__ __host__ inline static gSiteStack getSiteStackOdd(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx, const size_t stack){ + HOST_DEVICE inline static gSiteStack getSiteStackOdd(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx, const size_t stack){ return getSiteStackOdd(_blockDim.x * _blockIdx.x + _threadIdx.x, stack); } /// FULL (WITH HALOS) - __device__ __host__ inline static gSiteStack getSiteStackFull(const size_t isiteFull, const size_t stack){ + HOST_DEVICE inline static gSiteStack getSiteStackFull(const size_t isiteFull, const size_t stack){ return getSiteStack(getSiteFull(isiteFull), stack); } - __device__ __host__ inline static gSiteStack getSiteStackFull(const dim3& _blockDim, const uint3& 
_blockIdx, const uint3& _threadIdx){ + HOST_DEVICE inline static gSiteStack getSiteStackFull(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx){ gSiteStack ret = getSiteStackFull(_blockDim.x * _blockIdx.x + _threadIdx.x, _threadIdx.y); return ret; } - __device__ __host__ inline static gSiteStack getSiteStackFull(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx, const size_t stack){ + HOST_DEVICE inline static gSiteStack getSiteStackFull(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx, const size_t stack){ gSiteStack ret = getSiteStackFull(_blockDim.x * _blockIdx.x + _threadIdx.x, stack); return ret; } - __device__ __host__ inline static gSiteStack getSiteStackFull(int x, int y, int z, int t, int stack) { + HOST_DEVICE inline static gSiteStack getSiteStackFull(int x, int y, int z, int t, int stack) { return getSiteStack(getSiteFull(x, y, z, t), stack); } /// ----------------------------------------------------------------------------------- CONVERT BETWEEN EVEN AND ODD - template __device__ __host__ inline static gSite convertSite(const gSite& site){ + template HOST_DEVICE inline static gSite convertSite(const gSite& site){ return GIndexer::getSite(site.coord.x, site.coord.y, site.coord.z, site.coord.t); } - template __device__ __host__ inline static gSiteMu convertSite(const gSiteMu& site){ + template HOST_DEVICE inline static gSiteMu convertSite(const gSiteMu& site){ return GIndexer::getSiteMu(site.coord.x, site.coord.y, site.coord.z, site.coord.t, site.mu); } - template __device__ __host__ inline static gSiteStack convertSite(const gSiteStack& site){ + template HOST_DEVICE inline static gSiteStack convertSite(const gSiteStack& site){ return GIndexer::getSiteStack(site.coord.x, site.coord.y, site.coord.z, site.coord.t, site.stack); } //! Given an Even/Odd gSite object, this returns an All gSite object. 
- __device__ __host__ inline static gSite convertToAll(gSite& site) { + HOST_DEVICE inline static gSite convertToAll(gSite& site) { size_t isite = site.isite + (LatLayout == Odd)*getLatData().sizeh; size_t isiteFull = site.isiteFull + (LatLayout == Odd)*getLatData().sizehFull; return gSite(isite, isiteFull, site.coord, site.coordFull); } /// ------------------------------------------------ CONVERT BETWEEN BULK SPACETIME COORDINATES AND FULL COORDINATES - __device__ __host__ inline static sitexyzt coordToFullCoord(sitexyzt coord) { + HOST_DEVICE inline static sitexyzt coordToFullCoord(sitexyzt coord) { coord.x += getLatData().HaloDepth[0]; coord.y += getLatData().HaloDepth[1]; coord.z += getLatData().HaloDepth[2]; coord.t += getLatData().HaloDepth[3]; return coord; } - __device__ __host__ inline static sitexyzt fullCoordToCoord(sitexyzt fullCoord) { + HOST_DEVICE inline static sitexyzt fullCoordToCoord(sitexyzt fullCoord) { fullCoord.x -= getLatData().HaloDepth[0]; fullCoord.y -= getLatData().HaloDepth[1]; fullCoord.z -= getLatData().HaloDepth[2]; @@ -645,7 +654,7 @@ class GIndexer { return fullCoord; } - __device__ __host__ inline static sitexyzt globalCoordToLocalCoord(sitexyzt coord) { + HOST_DEVICE inline static sitexyzt globalCoordToLocalCoord(sitexyzt coord) { coord.x -= getLatData().gPosX; coord.y -= getLatData().gPosY; coord.z -= getLatData().gPosZ; @@ -655,47 +664,47 @@ class GIndexer { /// -------------------------------------------------------------------- CONVERT SPACETIME COORDINATES TO DATA INDEX /// BULK (NO HALOS) - __device__ __host__ inline static size_t coordToIndex_Bulk(const sitexyzt coord) { + HOST_DEVICE inline static size_t coordToIndex_Bulk(const sitexyzt coord) { return (((coord.x + coord.y*getLatData().vol1 + coord.z*getLatData().vol2 + coord.t*getLatData().vol3) >> 0x1) // integer division by two +getLatData().sizeh * ((coord.x + coord.y + coord.z + coord.t) & 0x1)); // 0 if x+y+z+t is even, 1 if it is odd } - __device__ __host__ inline 
static size_t coordToIndex_Bulk_eo(const sitexyzt coord) { + HOST_DEVICE inline static size_t coordToIndex_Bulk_eo(const sitexyzt coord) { return ((coord.x + coord.y*getLatData().vol1 + coord.z*getLatData().vol2 + coord.t*getLatData().vol3) >> 0x1); } - __device__ __host__ inline static size_t coordToIndex_SpatialBulk(const sitexyzt coord) { + HOST_DEVICE inline static size_t coordToIndex_SpatialBulk(const sitexyzt coord) { return (((coord.x + coord.y*getLatData().vol1 + coord.z*getLatData().vol2) >> 0x1) + getLatData().vol3h*((coord.x + coord.y + coord.z) & 0x1)); } - __device__ __host__ inline static size_t coordToIndex_SpatialBulk_eo(const sitexyzt coord) { + HOST_DEVICE inline static size_t coordToIndex_SpatialBulk_eo(const sitexyzt coord) { return ((coord.x + coord.y*getLatData().vol1 + coord.z*getLatData().vol2) >> 0x1); } /// FULL (WITH HALOS) - __device__ __host__ inline static size_t coordToIndex_Full(const sitexyzt coordFull) { + HOST_DEVICE inline static size_t coordToIndex_Full(const sitexyzt coordFull) { return (((coordFull.x + coordFull.y*getLatData().vol1Full + coordFull.z*getLatData().vol2Full + coordFull.t*getLatData().vol3Full) >> 0x1) + getLatData().sizehFull*((coordFull.x + coordFull.y + coordFull.z + coordFull.t) & 0x1)); } - __device__ __host__ inline static size_t coordToIndex_Full_eo(const sitexyzt coordFull) { + HOST_DEVICE inline static size_t coordToIndex_Full_eo(const sitexyzt coordFull) { return ((coordFull.x + coordFull.y * getLatData().vol1Full + coordFull.z * getLatData().vol2Full + coordFull.t * getLatData().vol3Full) >> 0x1); } - __device__ __host__ inline static size_t coordToIndex_SpatialFull(const sitexyzt coordFull) { + HOST_DEVICE inline static size_t coordToIndex_SpatialFull(const sitexyzt coordFull) { return (((coordFull.x + coordFull.y*getLatData().vol1Full + coordFull.z*getLatData().vol2Full) >> 0x1) + getLatData().vol3hFull*((coordFull.x + coordFull.y + coordFull.z) & 0x1)); } - __device__ __host__ inline static size_t 
coordToIndex_SpatialFull_eo(const sitexyzt coordFull) { + HOST_DEVICE inline static size_t coordToIndex_SpatialFull_eo(const sitexyzt coordFull) { return ((coordFull.x + coordFull.y*getLatData().vol1Full + coordFull.z*getLatData().vol2Full) >> 0x1); } - __host__ inline static size_t localCoordToGlobalIndex(LatticeDimensions coord) { + HOST inline static size_t localCoordToGlobalIndex(LatticeDimensions coord) { LatticeData lat = GIndexer::getLatData(); LatticeDimensions globCoord = lat.globalPos(coord); return (globCoord[0] + globCoord[1] * lat.globLX + globCoord[2] * lat.globLX * lat.globLY + @@ -704,7 +713,7 @@ class GIndexer { /// -------------------------------------------------------------------- CONVERT DATA INDEX TO SPACETIME COORDINATES /// BULK (NO HALOS) - __device__ __host__ inline static sitexyzt indexToCoord(const size_t site) { + HOST_DEVICE inline static sitexyzt indexToCoord(const size_t site) { int x, y, z, t; int par, normInd, tmp; @@ -742,7 +751,7 @@ class GIndexer { return sitexyzt(x, y, z, t); } - __device__ __host__ inline static sitexyzt indexToCoord_eo(const size_t site, int par) { + HOST_DEVICE inline static sitexyzt indexToCoord_eo(const size_t site, int par) { int x, y, z, t; int tmp; // double site @@ -760,7 +769,7 @@ class GIndexer { return sitexyzt(x, y, z, t); } - __device__ __host__ inline static sitexyzt indexToCoord_Spatial(const size_t site) { + HOST_DEVICE inline static sitexyzt indexToCoord_Spatial(const size_t site) { int x, y, z, t; int par, normInd, tmp; @@ -779,7 +788,7 @@ class GIndexer { return sitexyzt(x,y,z,t); } - __device__ __host__ inline static sitexyzt indexToCoord_Spatial_eo(const size_t site, int par) { + HOST_DEVICE inline static sitexyzt indexToCoord_Spatial_eo(const size_t site, int par) { int x, y, z, t; int tmp; size_t sited = site << 0x1; @@ -797,7 +806,7 @@ class GIndexer { return sitexyzt(x, y, z, t); } /// FULL (WITH HALOS) - __device__ __host__ inline static sitexyzt indexToCoord_Full(const size_t 
siteFull) { + HOST_DEVICE inline static sitexyzt indexToCoord_Full(const size_t siteFull) { int x, y, z, t; int par, normInd, tmp; @@ -817,7 +826,7 @@ class GIndexer { return sitexyzt(x, y, z, t); } - __device__ __host__ inline static sitexyzt indexToCoord_SpatialFull(const size_t siteFull) { + HOST_DEVICE inline static sitexyzt indexToCoord_SpatialFull(const size_t siteFull) { int x, y, z, t; int par, normInd, tmp; @@ -837,7 +846,7 @@ class GIndexer { return sitexyzt(x, y, z, t); } - __device__ __host__ inline static sitexyzt indexToCoord_Full_eo(const size_t siteFull, int par) { + HOST_DEVICE inline static sitexyzt indexToCoord_Full_eo(const size_t siteFull, int par) { int x, y, z, t; int tmp; @@ -854,7 +863,7 @@ class GIndexer { return sitexyzt(x, y, z, t); } - __device__ __host__ inline static sitexyzt indexToCoord_SpatialFull_eo(const size_t siteFull, int par) { + HOST_DEVICE inline static sitexyzt indexToCoord_SpatialFull_eo(const size_t siteFull, int par) { int x, y, z, t; int tmp; @@ -875,7 +884,7 @@ class GIndexer { //! This function is needed when one wants to have the sites time ordered. For example if one wants to reduce only //! values on each timeslice. - __device__ __host__ inline static size_t siteTimeOrdered(const gSite &site) { + HOST_DEVICE inline static size_t siteTimeOrdered(const gSite &site) { sitexyzt c = site.coord; return c.x + c.y*getLatData().vol1 + c.z*getLatData().vol2 + c.t*getLatData().vol3; } @@ -887,19 +896,19 @@ class GIndexer { //! time, this means you cannot pass these functions a dynamic argument. 
/// --------------------------------------------------------------------------------------- site_move: ONE DIRECTION - template __device__ __host__ inline static gSite site_move(const gSite &s, const int mu) { + template HOST_DEVICE inline static gSite site_move(const gSite &s, const int mu) { sitexyzt tmp = site_move(s.coordFull, mu); return getSiteFull(tmp.x, tmp.y, tmp.z, tmp.t); } - template __device__ __host__ inline static gSiteMu site_move(const gSiteMu &s, const int mu) { + template HOST_DEVICE inline static gSiteMu site_move(const gSiteMu &s, const int mu) { sitexyzt tmp = site_move(s.coordFull, mu); return getSiteMuFull(tmp.x, tmp.y, tmp.z, tmp.t, s.mu); } - template __device__ __host__ inline static gSiteStack site_move(const gSiteStack &s, const int mu) { + template HOST_DEVICE inline static gSiteStack site_move(const gSiteStack &s, const int mu) { sitexyzt tmp = site_move(s.coordFull, mu); return getSiteStackFull(tmp.x, tmp.y, tmp.z, tmp.t, s.stack); } - template __device__ __host__ inline static sitexyzt site_move(sitexyzt s, const int mu) { + template HOST_DEVICE inline static sitexyzt site_move(sitexyzt s, const int mu) { int x = s.x; int y = s.y; @@ -966,19 +975,19 @@ class GIndexer { } /// -------------------------------------------------------------------------------------- site_move: TWO DIRECTIONS - template __device__ __host__ inline static gSite site_move(const gSite &s, const int mu, const int nu) { + template HOST_DEVICE inline static gSite site_move(const gSite &s, const int mu, const int nu) { sitexyzt tmp = site_move(s.coordFull, mu, nu); return getSiteFull(tmp.x, tmp.y, tmp.z, tmp.t); } - template __device__ __host__ inline static gSiteMu site_move(const gSiteMu &s, const int mu, const int nu) { + template HOST_DEVICE inline static gSiteMu site_move(const gSiteMu &s, const int mu, const int nu) { sitexyzt tmp = site_move(s.coordFull, mu, nu); return getSiteMuFull(tmp.x, tmp.y, tmp.z, tmp.t, s.mu); } - template __device__ __host__ inline 
static gSiteStack site_move(const gSiteStack &s, const int mu, const int nu) { + template HOST_DEVICE inline static gSiteStack site_move(const gSiteStack &s, const int mu, const int nu) { sitexyzt tmp = site_move(s.coordFull, mu, nu); return getSiteStackFull(tmp.x, tmp.y, tmp.z, tmp.t, s.stack); } - template __device__ __host__ inline static sitexyzt site_move(const sitexyzt &s, const int mu, const int nu) { + template HOST_DEVICE inline static sitexyzt site_move(const sitexyzt &s, const int mu, const int nu) { int x = s.x; int y = s.y; int z = s.z; @@ -1101,22 +1110,22 @@ class GIndexer { /// ------------------------------------------------------------------------------------ site_move: THREE DIRECTIONS template - __device__ __host__ inline static gSite site_move(const gSite &s, const int mu, const int nu, const int rho) { + HOST_DEVICE inline static gSite site_move(const gSite &s, const int mu, const int nu, const int rho) { sitexyzt tmp = site_move(s.coordFull, mu, nu, rho); return getSiteFull(tmp.x, tmp.y, tmp.z, tmp.t); } template - __device__ __host__ inline static gSiteMu site_move(const gSiteMu &s, const int mu, const int nu, const int rho) { + HOST_DEVICE inline static gSiteMu site_move(const gSiteMu &s, const int mu, const int nu, const int rho) { sitexyzt tmp = site_move(s.coordFull, mu, nu, rho); return getSiteMuFull(tmp.x, tmp.y, tmp.z, tmp.t, s.mu); } template - __device__ __host__ inline static gSiteStack site_move(const gSiteStack &s, const int mu, const int nu, const int rho) { + HOST_DEVICE inline static gSiteStack site_move(const gSiteStack &s, const int mu, const int nu, const int rho) { sitexyzt tmp = site_move(s.coordFull, mu, nu, rho); return getSiteStackFull(tmp.x, tmp.y, tmp.z, tmp.t, s.stack); } template - __device__ __host__ inline static sitexyzt site_move(const sitexyzt &s, const int mu, const int nu, const int rho) { + HOST_DEVICE inline static sitexyzt site_move(const sitexyzt &s, const int mu, const int nu, const int rho) { int x = 
s.x; int y = s.y; int z = s.z; @@ -1295,22 +1304,22 @@ class GIndexer { /// ------------------------------------------------------------------------------------- site_move: FOUR DIRECTIONS template - __device__ __host__ inline static gSite site_move(const gSite &s, const int mu, const int nu, const int rho, const int sig) { + HOST_DEVICE inline static gSite site_move(const gSite &s, const int mu, const int nu, const int rho, const int sig) { sitexyzt tmp = site_move(s.coordFull, mu, nu, rho, sig); return getSiteFull(tmp.x, tmp.y, tmp.z, tmp.t); } template - __device__ __host__ inline static gSiteMu site_move(const gSiteMu &s, const int mu, const int nu, const int rho, const int sig) { + HOST_DEVICE inline static gSiteMu site_move(const gSiteMu &s, const int mu, const int nu, const int rho, const int sig) { sitexyzt tmp = site_move(s.coordFull, mu, nu, rho, sig); return getSiteMuFull(tmp.x, tmp.y, tmp.z, tmp.t, s.mu); } template - __device__ __host__ inline static gSiteStack site_move(const gSiteStack &s, const int mu, const int nu, const int rho, const int sig) { + HOST_DEVICE inline static gSiteStack site_move(const gSiteStack &s, const int mu, const int nu, const int rho, const int sig) { sitexyzt tmp = site_move(s.coordFull, mu, nu, rho, sig); return getSiteStackFull(tmp.x, tmp.y, tmp.z, tmp.t, s.stack); } template - __device__ __host__ inline static sitexyzt site_move(const sitexyzt &s, const int mu, const int nu, const int rho, const int sig) { + HOST_DEVICE inline static sitexyzt site_move(const sitexyzt &s, const int mu, const int nu, const int rho, const int sig) { int x = s.x; int y = s.y; int z = s.z; @@ -1544,55 +1553,55 @@ class GIndexer { } /// ------------------------------------------------------------------------------------------------ site_up and site_dn - template __device__ __host__ inline static T site_up(const T &s, const int mu) { + template HOST_DEVICE inline static T site_up(const T &s, const int mu) { return site_move<1>(s, mu); } - 
template __device__ __host__ inline static T site_dn(const T &s, const int mu) { + template HOST_DEVICE inline static T site_dn(const T &s, const int mu) { return site_move<-1>(s, mu); } - template __device__ __host__ inline static T site_up_up(const T &s, const int mu, const int nu) { + template HOST_DEVICE inline static T site_up_up(const T &s, const int mu, const int nu) { #ifdef __GPU_ARCH__ return site_move<1, 1>(s, mu, nu); #else return site_up(site_up(s, mu), nu); #endif } - template __device__ __host__ inline static T site_up_dn(const T &s, const int mu, const int nu) { + template HOST_DEVICE inline static T site_up_dn(const T &s, const int mu, const int nu) { #ifdef __GPU_ARCH__ return site_move<1, -1>(s, mu, nu); #else return site_dn(site_up(s, mu), nu); #endif } - template __device__ __host__ inline static T site_dn_dn(const T &s, const int mu, const int nu) { + template HOST_DEVICE inline static T site_dn_dn(const T &s, const int mu, const int nu) { #ifdef __GPU_ARCH__ return site_move<-1, -1>(s, mu, nu); #else return site_dn(site_dn(s, mu), nu); #endif } - template __device__ __host__ inline static T site_up_up_up(const T &s, const int mu, const int nu, const int rho) { + template HOST_DEVICE inline static T site_up_up_up(const T &s, const int mu, const int nu, const int rho) { #ifdef __GPU_ARCH__ return site_move<1, 1, 1>(s, mu, nu, rho); #else return site_up(site_up_up(s, mu, nu), rho); #endif } - template __device__ __host__ inline static T site_up_up_dn(const T &s, const int mu, const int nu, const int rho) { + template HOST_DEVICE inline static T site_up_up_dn(const T &s, const int mu, const int nu, const int rho) { #ifdef __GPU_ARCH__ return site_move<1, 1, -1>(s, mu, nu, rho); #else return site_dn(site_up_up(s, mu, nu), rho); #endif } - template __device__ __host__ inline static T site_up_dn_dn(const T &s, const int mu, const int nu, const int rho) { + template HOST_DEVICE inline static T site_up_dn_dn(const T &s, const int mu, const int nu, 
const int rho) { #ifdef __GPU_ARCH__ return site_move<1, -1, -1>(s, mu, nu, rho); #else return site_dn(site_up_dn(s, mu, nu), rho); #endif } - template __device__ __host__ inline static T site_dn_dn_dn(const T &s, const int mu, const int nu, const int rho) { + template HOST_DEVICE inline static T site_dn_dn_dn(const T &s, const int mu, const int nu, const int rho) { #ifdef __GPU_ARCH__ return site_move<-1, -1, -1>(s, mu, nu, rho); #else @@ -1600,70 +1609,70 @@ class GIndexer { #endif } //! The following are currently unused but can be commented in if needed: - template __device__ __host__ inline static T site_up_up_up_up(const T &s, const int mu, const int nu, const int rho, const int sig) { + template HOST_DEVICE inline static T site_up_up_up_up(const T &s, const int mu, const int nu, const int rho, const int sig) { #ifdef __GPU_ARCH__ return site_move<1, 1, 1, 1>(s, mu, nu, rho, sig); #else return site_up(site_up_up_up(s, mu, nu, rho), sig); #endif } - template __device__ __host__ inline static T site_up_up_up_dn(const T &s, const int mu, const int nu, const int rho, const int sig) { + template HOST_DEVICE inline static T site_up_up_up_dn(const T &s, const int mu, const int nu, const int rho, const int sig) { #ifdef __GPU_ARCH__ return site_move<1, 1, 1, -1>(s, mu, nu, rho, sig); #else return site_dn(site_up_up_up(s, mu, nu, rho), sig); #endif } - template __device__ __host__ inline static T site_up_up_dn_dn(const T &s, const int mu, const int nu, const int rho, const int sig) { + template HOST_DEVICE inline static T site_up_up_dn_dn(const T &s, const int mu, const int nu, const int rho, const int sig) { #ifdef __GPU_ARCH__ return site_move<1, 1, -1, -1>(s, mu, nu, rho, sig); #else return site_dn(site_up_up_dn(s, mu, nu, rho), sig); #endif } - template __device__ __host__ inline static T site_up_dn_dn_dn(const T &s, const int mu, const int nu, const int rho, const int sig) { + template HOST_DEVICE inline static T site_up_dn_dn_dn(const T &s, const int mu, const 
int nu, const int rho, const int sig) { #ifdef __GPU_ARCH__ return site_move<1, -1, -1, -1>(s, mu, nu, rho, sig); #else return site_dn(site_up_dn_dn(s, mu, nu, rho), sig); #endif } - template __device__ __host__ inline static T site_dn_dn_dn_dn(const T &s, const int mu, const int nu, const int rho, const int sig) { + template HOST_DEVICE inline static T site_dn_dn_dn_dn(const T &s, const int mu, const int nu, const int rho, const int sig) { #ifdef __GPU_ARCH__ return site_move<-1, -1, -1, -1>(s, mu, nu, rho, sig); #else return site_dn(site_dn_dn_dn(s, mu, nu, rho), sig); #endif } - template __device__ __host__ inline static T site_2up_up(const T &s, const int mu, const int nu) { + template HOST_DEVICE inline static T site_2up_up(const T &s, const int mu, const int nu) { #ifdef __GPU_ARCH__ return site_move<2, 1>(s, mu, nu); #else return site_up_up_up(s, mu, mu, nu); #endif } - template __device__ __host__ inline static T site_2up_dn(const T &s, const int mu, const int nu) { + template HOST_DEVICE inline static T site_2up_dn(const T &s, const int mu, const int nu) { #ifdef __GPU_ARCH__ return site_move<2, -1>(s, mu, nu); #else return site_up_up_dn(s, mu, mu, nu); #endif } - template __device__ __host__ inline static T site_up_2dn(const T &s, const int mu, const int nu) { + template HOST_DEVICE inline static T site_up_2dn(const T &s, const int mu, const int nu) { #ifdef __GPU_ARCH__ return site_move<1, -2>(s, mu, nu); #else return site_up_dn_dn(s, mu, mu, nu); #endif } - template __device__ __host__ inline static T site_2up(const T &s, const int mu) { + template HOST_DEVICE inline static T site_2up(const T &s, const int mu) { #ifdef __GPU_ARCH__ return site_move<2>(s, mu); #else return site_up_up(s, mu, mu); #endif } - template __device__ __host__ inline static T site_2dn(const T &s, const int mu) { + template HOST_DEVICE inline static T site_2dn(const T &s, const int mu) { #ifdef __GPU_ARCH__ return site_move<-2>(s, mu); #else @@ -1675,7 +1684,7 @@ class GIndexer { 
//! Unlike the above implementation of site_move, this can be used in a for loop. Presumably it is slower? //! Currently unused but can be commented in if needed: - __device__ __host__ inline static sitexyzt dynamic_move(sitexyzt s, const int mu, int mu_steps) { + HOST_DEVICE inline static sitexyzt dynamic_move(sitexyzt s, const int mu, int mu_steps) { int x = s.x; int y = s.y; int z = s.z; @@ -1738,7 +1747,7 @@ class GIndexer { } return sitexyzt(x, y, z, t); } - __attribute__((unused)) __device__ __host__ inline static gSite dynamic_move(const gSite &s, const int mu, int mu_steps) { + __attribute__((unused)) HOST_DEVICE inline static gSite dynamic_move(const gSite &s, const int mu, int mu_steps) { sitexyzt tmp = dynamic_move(s.coordFull, mu, mu_steps); return getSiteFull(tmp.x, tmp.y, tmp.z, tmp.t); } diff --git a/src/base/indexer/HaloIndexer.h b/src/base/indexer/HaloIndexer.h index df900d86..b0346706 100644 --- a/src/base/indexer/HaloIndexer.h +++ b/src/base/indexer/HaloIndexer.h @@ -8,6 +8,7 @@ #ifndef HALOINDEXER_H #define HALOINDEXER_H + #include "BulkIndexer.h" #include #include @@ -98,9 +99,9 @@ struct HaloData { size_t h_offsetsHalf[80]; - __device__ __host__ HaloData() {} + HOST_DEVICE HaloData() {} - __device__ __host__ HaloData(size_t lx, size_t ly, size_t lz, size_t lt, size_t halo_depth, unsigned int Nodes[4]) { + HOST_DEVICE HaloData(size_t lx, size_t ly, size_t lz, size_t lt, size_t halo_depth, unsigned int Nodes[4]) { h_HaloDepth[0] = Nodes[0] != 1 ? halo_depth : 0; @@ -207,7 +208,7 @@ struct HaloData { } - __device__ __host__ size_t getBufferSize(Layout LatLayout) { + HOST_DEVICE size_t getBufferSize(Layout LatLayout) { if (LatLayout == All)return h_summed_buffer[15]; else return h_summed_bufferHalf[15]; } @@ -217,7 +218,7 @@ struct HaloData { /// This function returns the size of these sub_Halos. /// The argument is the number of the Sub-Halo! 
- __device__ __host__ inline size_t get_SubHaloSize(const short number, Layout LatLayout) const { + HOST_DEVICE inline size_t get_SubHaloSize(const short number, Layout LatLayout) const { size_t EvenFactor = 1; if (LatLayout != All) EvenFactor = 2; @@ -244,7 +245,7 @@ struct HaloData { private: /// The argument is the number of the Halo Type! It returns the size of an All Halo Type! - __device__ __host__ inline size_t get_SubHaloSizeFromType(const short number) const { + HOST_DEVICE inline size_t get_SubHaloSizeFromType(const short number) const { if (number == 0) return h_YZTH; if (number == 1) return h_XZTH; if (number == 2) return h_XYTH; @@ -269,14 +270,15 @@ struct HaloData { }; - +#ifndef USE_CPU_ONLY extern __device__ __constant__ struct HaloData globHalDataGPU[MAXHALO + 1]; extern __device__ __constant__ struct HaloData globHalDataGPUReduced[MAXHALO + 1]; +#endif + extern struct HaloData globHalDataCPU[MAXHALO + 1]; extern struct HaloData globHalDataCPUReduced[MAXHALO + 1]; void initGPUHaloIndexer(size_t lx, size_t ly, size_t lz, size_t lt); - void initCPUHaloIndexer(size_t lx, size_t ly, size_t lz, size_t lt); template @@ -284,7 +286,7 @@ class HaloIndexer { private: - __device__ __host__ inline static size_t _getHaloNumber(size_t index, size_t *LocHalIndex) { + HOST_DEVICE inline static size_t _getHaloNumber(size_t index, size_t *LocHalIndex) { if (LatLayout == All) { for (int i = 1; i < 80; i++) { if (getHalData().h_offsets[i] > index) { @@ -312,7 +314,7 @@ class HaloIndexer { return 0; }; - __device__ __host__ inline static size_t _getHaloNumberReduced(size_t index, size_t *LocHalIndex) { + HOST_DEVICE inline static size_t _getHaloNumberReduced(size_t index, size_t *LocHalIndex) { if (LatLayout == All) { for (int i = 1; i < 80; i++) { if (getHalDataReduced().h_offsets[i] > index) { @@ -341,51 +343,51 @@ class HaloIndexer { }; public: - __device__ __host__ HaloIndexer(); + HOST_DEVICE HaloIndexer(); - __device__ __host__ ~HaloIndexer() {}; + HOST_DEVICE 
~HaloIndexer() {}; - __device__ __host__ inline static HaloData getHalData() { -#ifdef __GPU_ARCH__ + HOST_DEVICE inline static HaloData getHalData() { +#if defined(__GPU_ARCH__) return globHalDataGPU[HaloDepth]; #else return globHalDataCPU[HaloDepth]; #endif } - __device__ __host__ inline static HaloData getHalDataReduced() { -#ifdef __GPU_ARCH__ + HOST_DEVICE inline static HaloData getHalDataReduced() { +#if defined(__GPU_ARCH__) return globHalDataGPUReduced[HaloDepth]; #else return globHalDataCPUReduced[HaloDepth]; #endif } - __device__ __host__ inline static size_t getBufferSize() { + HOST_DEVICE inline static size_t getBufferSize() { if (LatLayout == All)return getHalData().h_summed_buffer[15]; else return getHalData().h_summed_bufferHalf[15]; } - __device__ __host__ inline static size_t get_SubHaloOffset(const short number) { + HOST_DEVICE inline static size_t get_SubHaloOffset(const short number) { if (LatLayout == All)return getHalData().h_offsets[number]; else return getHalData().h_offsetsHalf[number]; } - __device__ __host__ inline static size_t get_SubHaloSize(const short number) { + HOST_DEVICE inline static size_t get_SubHaloSize(const short number) { return getHalData().get_SubHaloSize(number, LatLayout); } - __device__ __host__ inline static size_t get_ReducedSubHaloSize(const short number) { + HOST_DEVICE inline static size_t get_ReducedSubHaloSize(const short number) { return getHalDataReduced().get_SubHaloSize(number, LatLayout); } - __device__ __host__ inline static void getCoord_eo(size_t &x, size_t &y, size_t &z, size_t &t, + HOST_DEVICE inline static void getCoord_eo(size_t &x, size_t &y, size_t &z, size_t &t, const size_t index, const size_t vol1, const size_t vol2, const size_t vol3, const bool par) { @@ -405,7 +407,7 @@ class HaloIndexer { ++x; } - __device__ __host__ inline static void getCoord(size_t &x, size_t &y, size_t &z, size_t &t, + HOST_DEVICE inline static void getCoord(size_t &x, size_t &y, size_t &z, size_t &t, const size_t 
index, const size_t vol1, const size_t vol2, const size_t vol3) { @@ -424,20 +426,20 @@ class HaloIndexer { } - __device__ __host__ inline static void + HOST_DEVICE inline static void getHypPlanePos(size_t number, size_t &pos_a, size_t &pos_b) { pos_a = number * 2; pos_b = number * 2 + 1; } - __device__ __host__ inline static void + HOST_DEVICE inline static void getPlanePos(size_t number, size_t dir, size_t &pos_a, size_t &pos_b) { number -= 4; pos_a = 8 + number * 4 + dir; pos_b = 8 + number * 4 + dir + (3 - 2 * dir); } - __device__ __host__ inline static void + HOST_DEVICE inline static void getStripePos(size_t number, size_t dir, size_t &pos_a, size_t &pos_b) { number -= 10; @@ -445,7 +447,7 @@ class HaloIndexer { pos_b = 32 + number * 8 + dir + (7 - 2 * dir); } - __device__ __host__ inline static void + HOST_DEVICE inline static void getCornerPos(size_t number, size_t dir, size_t &pos_a, size_t &pos_b) { number -= 14; @@ -454,7 +456,7 @@ class HaloIndexer { } - __device__ __host__ inline static HaloSegment mapIntToHSeg(int bits) { + HOST_DEVICE inline static HaloSegment mapIntToHSeg(int bits) { if (bits == 1) return X; if (bits == 2) return Y; if (bits == 4) return Z; @@ -478,7 +480,7 @@ class HaloIndexer { return X; } - __device__ __host__ inline static HaloSegment getHSeg(sitexyzt coord) { + HOST_DEVICE inline static HaloSegment getHSeg(sitexyzt coord) { int bits = 0; @@ -497,7 +499,7 @@ class HaloIndexer { return mapIntToHSeg(bits); } - __device__ __host__ inline static short getlr(sitexyzt coord) { + HOST_DEVICE inline static short getlr(sitexyzt coord) { short lr = 0; HaloSegment hseg = getHSeg(coord); @@ -558,15 +560,15 @@ class HaloIndexer { } - __device__ __host__ inline static size_t getOuterHaloSize() { + HOST_DEVICE inline static size_t getOuterHaloSize() { return getHalData().getBufferSize(LatLayout); } - __device__ __host__ inline static size_t getInnerHaloSize() { + HOST_DEVICE inline static size_t getInnerHaloSize() { return 
getHalDataReduced().getBufferSize(LatLayout); } - __device__ __host__ inline static size_t getCenterSize() { + HOST_DEVICE inline static size_t getCenterSize() { return GIndexer::getLatData().vol4 - getInnerHaloSize(); } @@ -588,7 +590,7 @@ class HaloIndexer { /// |______________| /// - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getInnerHaloCoord(size_t HalIndex, size_t &HalNumber, size_t &LocHalIndex) { @@ -683,7 +685,7 @@ class HaloIndexer { /// |______________| /// - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getOuterHaloCoord(size_t HalIndex, size_t &HalNumber, size_t &LocHalIndex) { HalNumber = _getHaloNumber(HalIndex, &LocHalIndex); @@ -780,7 +782,7 @@ class HaloIndexer { /// However if one does that by templating it, the compiler is not smart enough to optimize it away, /// so that this indexer become slower... - __device__ __host__ inline static sitexyzt getInnerCoord(size_t HalIndex) { + HOST_DEVICE inline static sitexyzt getInnerCoord(size_t HalIndex) { size_t HalNumber = 0, LocHalIndex = 0; HalNumber = _getHaloNumberReduced(HalIndex, &LocHalIndex); @@ -876,7 +878,7 @@ class HaloIndexer { /// +++++++++++++++++++++++++++ 8 HYPERPLANES +++++++++++++++++++++++++++++++ /// lr = 0,1 - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getInnerHaloCoord_Hyperplane_X(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -890,7 +892,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getInnerHaloCoord_Hyperplane_Y(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -904,7 +906,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getInnerHaloCoord_Hyperplane_Z(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -918,7 +920,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __device__ 
__host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getInnerHaloCoord_Hyperplane_T(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -934,7 +936,7 @@ class HaloIndexer { /// +++++++++++++++++++++++++++++ 24 PLANES +++++++++++++++++++++++++++++++++ /// lr = 0-3 - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getInnerHaloCoord_Plane_XY(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(z, t, x, y, LocHalIndex, getHalData().h_LZi, getHalData().h_ZT, getHalData().h_ZTH); @@ -949,7 +951,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getInnerHaloCoord_Plane_XZ(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(y, t, x, z, LocHalIndex, getHalData().h_LYi, getHalData().h_YT, getHalData().h_YTH); @@ -963,7 +965,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getInnerHaloCoord_Plane_XT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(y, z, x, t, LocHalIndex, getHalData().h_LYi, getHalData().h_YZ, getHalData().h_YZH); @@ -977,7 +979,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getInnerHaloCoord_Plane_YZ(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(x, t, y, z, LocHalIndex, getHalData().h_LXi, getHalData().h_XT, getHalData().h_XTH); @@ -991,7 +993,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getInnerHaloCoord_Plane_YT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(x, z, y, t, LocHalIndex, getHalData().h_LXi, getHalData().h_XZ, getHalData().h_XZH); @@ -1005,7 +1007,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt 
getInnerHaloCoord_Plane_ZT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(x, y, z, t, LocHalIndex, getHalData().h_LXi, getHalData().h_XY, getHalData().h_XYH); @@ -1021,7 +1023,7 @@ class HaloIndexer { /// ++++++++++++++++++++++++++++ 32 STRIPES +++++++++++++++++++++++++++++++++ /// lr = 0-7 - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getInnerHaloCoord_Stripe_XYZ(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(t, x, y, z, LocHalIndex, getHalData().h_LTi, getHalData().h_TH, getHalData().h_THH); @@ -1036,7 +1038,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getInnerHaloCoord_Stripe_XYT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(z, x, y, t, LocHalIndex, getHalData().h_LZi, getHalData().h_ZH, getHalData().h_ZHH); @@ -1051,7 +1053,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getInnerHaloCoord_Stripe_XZT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(y, x, z, t, LocHalIndex, getHalData().h_LYi, getHalData().h_YH, getHalData().h_YHH); @@ -1066,7 +1068,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getInnerHaloCoord_Stripe_YZT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(x, y, z, t, LocHalIndex, getHalData().h_LXi, getHalData().h_XH, getHalData().h_XHH); @@ -1083,7 +1085,7 @@ class HaloIndexer { /// ++++++++++++++++++++++++++++ 16 CORNERS +++++++++++++++++++++++++++++++++ /// lr = 0-15 - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getInnerHaloCoord_Corner(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -1122,7 +1124,7 @@ class HaloIndexer { /// +++++++++++++++++++++++++++ 8 HYPERPLANES +++++++++++++++++++++++++++++++ /// lr = 0,1 - __device__ __host__ 
inline static sitexyzt + HOST_DEVICE inline static sitexyzt getOuterHaloCoord_Hyperplane_X(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -1135,7 +1137,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getOuterHaloCoord_Hyperplane_Y(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -1148,7 +1150,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getOuterHaloCoord_Hyperplane_Z(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -1161,7 +1163,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getOuterHaloCoord_Hyperplane_T(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -1176,7 +1178,7 @@ class HaloIndexer { /// +++++++++++++++++++++++++++++ 24 PLANES +++++++++++++++++++++++++++++++++ /// lr = 0-3 - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getOuterHaloCoord_Plane_XY(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(z, t, x, y, LocHalIndex, getHalData().h_LZi, getHalData().h_ZT, getHalData().h_ZTH); @@ -1189,7 +1191,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getOuterHaloCoord_Plane_XZ(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(y, t, x, z, LocHalIndex, getHalData().h_LYi, getHalData().h_YT, getHalData().h_YTH); @@ -1201,7 +1203,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getOuterHaloCoord_Plane_XT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(y, z, x, t, LocHalIndex, getHalData().h_LYi, getHalData().h_YZ, getHalData().h_YZH); @@ -1213,7 +1215,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __device__ __host__ inline 
static sitexyzt + HOST_DEVICE inline static sitexyzt getOuterHaloCoord_Plane_YZ(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(x, t, y, z, LocHalIndex, getHalData().h_LXi, getHalData().h_XT, getHalData().h_XTH); @@ -1225,7 +1227,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getOuterHaloCoord_Plane_YT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(x, z, y, t, LocHalIndex, getHalData().h_LXi, getHalData().h_XZ, getHalData().h_XZH); @@ -1237,7 +1239,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getOuterHaloCoord_Plane_ZT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(x, y, z, t, LocHalIndex, getHalData().h_LXi, getHalData().h_XY, getHalData().h_XYH); @@ -1251,7 +1253,7 @@ class HaloIndexer { /// ++++++++++++++++++++++++++++ 32 STRIPES +++++++++++++++++++++++++++++++++ /// lr = 0-7 - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getOuterHaloCoord_Stripe_XYZ(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(t, x, y, z, LocHalIndex, getHalData().h_LTi, getHalData().h_TH, getHalData().h_THH); @@ -1263,7 +1265,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getOuterHaloCoord_Stripe_XYT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(z, x, y, t, LocHalIndex, getHalData().h_LZi, getHalData().h_ZH, getHalData().h_ZHH); @@ -1275,7 +1277,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getOuterHaloCoord_Stripe_XZT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(y, x, z, t, LocHalIndex, getHalData().h_LYi, getHalData().h_YH, getHalData().h_YHH); @@ -1287,7 +1289,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } 
- __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getOuterHaloCoord_Stripe_YZT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(x, y, z, t, LocHalIndex, getHalData().h_LXi, getHalData().h_XH, getHalData().h_XHH); @@ -1301,7 +1303,7 @@ class HaloIndexer { /// ++++++++++++++++++++++++++++ 16 CORNERS +++++++++++++++++++++++++++++++++ /// lr = 0-15 - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getOuterHaloCoord_Corner(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -1338,7 +1340,7 @@ class HaloIndexer { /// +++++++++++++++++++++++++++ 8 HYPERPLANES +++++++++++++++++++++++++++++++ /// lr = 0,1 - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getInnerCoord_Hyperplane_X(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -1352,7 +1354,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getInnerCoord_Hyperplane_Y(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -1366,7 +1368,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getInnerCoord_Hyperplane_Z(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -1380,7 +1382,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getInnerCoord_Hyperplane_T(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -1396,7 +1398,7 @@ class HaloIndexer { /// +++++++++++++++++++++++++++++ 24 PLANES +++++++++++++++++++++++++++++++++ /// lr = 0-3 - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getInnerCoord_Plane_XY(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(z, t, x, y, LocHalIndex, getHalDataReduced().h_LZi, getHalDataReduced().h_ZT, @@ -1409,7 +1411,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - 
__device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getInnerCoord_Plane_XZ(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(y, t, x, z, LocHalIndex, getHalDataReduced().h_LYi, getHalDataReduced().h_YT, @@ -1422,7 +1424,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getInnerCoord_Plane_XT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(y, z, x, t, LocHalIndex, getHalDataReduced().h_LYi, getHalDataReduced().h_YZ, @@ -1435,7 +1437,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getInnerCoord_Plane_YZ(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(x, t, y, z, LocHalIndex, getHalDataReduced().h_LXi, getHalDataReduced().h_XT, @@ -1448,7 +1450,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getInnerCoord_Plane_YT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(x, z, y, t, LocHalIndex, getHalDataReduced().h_LXi, getHalDataReduced().h_XZ, @@ -1461,7 +1463,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getInnerCoord_Plane_ZT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(x, y, z, t, LocHalIndex, getHalDataReduced().h_LXi, getHalDataReduced().h_XY, @@ -1476,7 +1478,7 @@ class HaloIndexer { /// ++++++++++++++++++++++++++++ 32 STRIPES +++++++++++++++++++++++++++++++++ /// lr = 0-7 - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getInnerCoord_Stripe_XYZ(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(t, x, y, z, LocHalIndex, getHalDataReduced().h_LTi, getHalDataReduced().h_TH, @@ -1489,7 +1491,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __device__ __host__ inline static 
sitexyzt + HOST_DEVICE inline static sitexyzt getInnerCoord_Stripe_XYT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(z, x, y, t, LocHalIndex, getHalDataReduced().h_LZi, getHalDataReduced().h_ZH, @@ -1502,7 +1504,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getInnerCoord_Stripe_XZT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(y, x, z, t, LocHalIndex, getHalDataReduced().h_LYi, getHalDataReduced().h_YH, @@ -1515,7 +1517,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getInnerCoord_Stripe_YZT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(x, y, z, t, LocHalIndex, getHalDataReduced().h_LXi, getHalDataReduced().h_XH, @@ -1530,7 +1532,7 @@ class HaloIndexer { /// ++++++++++++++++++++++++++++ 16 CORNERS +++++++++++++++++++++++++++++++++ /// lr = 0-15 - __device__ __host__ inline static sitexyzt + HOST_DEVICE inline static sitexyzt getInnerCoord_Corner(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -1564,7 +1566,7 @@ class HaloIndexer { /// - __device__ __host__ inline static sitexyzt getCenterCoord(size_t CenterIndex) { + HOST_DEVICE inline static sitexyzt getCenterCoord(size_t CenterIndex) { size_t x = 0, y = 0, z = 0, t = 0; diff --git a/src/base/indexer/initGPUIndexer.cpp b/src/base/indexer/initGPUIndexer.cpp index 05ae7d25..c69db474 100644 --- a/src/base/indexer/initGPUIndexer.cpp +++ b/src/base/indexer/initGPUIndexer.cpp @@ -9,6 +9,7 @@ #include "BulkIndexer.h" #include "../indexer/HaloIndexer.h" +#ifndef USE_CPU_ONLY __device__ __constant__ struct LatticeData globLatDataGPU[MAXHALO+1]; @@ -62,3 +63,5 @@ void initGPUHaloIndexer(size_t lx, size_t ly, size_t lz, size_t lt, unsigned int if (gpuErr) GpuError("initGPUHaloIndexer: gpuDeviceSynchronize failed (2)", gpuErr); } + +#endif \ No newline at end of file diff --git 
a/src/base/math/correlators.h b/src/base/math/correlators.h index 3d854056..4743261f 100644 --- a/src/base/math/correlators.h +++ b/src/base/math/correlators.h @@ -35,37 +35,37 @@ /// Initialize the correlator to zero, regardless of type. ---------------------------------- FUNCTIONS FOR CORRELATIONS template -__host__ __device__ void inline initCorrToZero(int &corr) { +HOST_DEVICE void inline initCorrToZero(int &corr) { corr = 0; } template -__host__ __device__ void inline initCorrToZero(floatT &corr) { +HOST_DEVICE void inline initCorrToZero(floatT &corr) { corr = 0.; } template -__host__ __device__ void inline initCorrToZero(GSU3 &corr) { +HOST_DEVICE void inline initCorrToZero(GSU3 &corr) { corr = gsu3_zero(); } template -__host__ __device__ void inline initCorrToZero(GCOMPLEX(floatT) &corr) { +HOST_DEVICE void inline initCorrToZero(GCOMPLEX(floatT) &corr) { corr = GPUcomplex(0., 0.); } /// Initialize the correlator to one, regardless of type. template -__host__ __device__ void inline initCorrToOne(int &corr) { +HOST_DEVICE void inline initCorrToOne(int &corr) { corr = 1; } template -__host__ __device__ void inline initCorrToOne(floatT &corr) { +HOST_DEVICE void inline initCorrToOne(floatT &corr) { corr = 1.; } template -__host__ __device__ void inline initCorrToOne(GSU3 &corr) { +HOST_DEVICE void inline initCorrToOne(GSU3 &corr) { corr = gsu3_one(); } template -__host__ __device__ void inline initCorrToOne(GCOMPLEX(floatT) &corr) { +HOST_DEVICE void inline initCorrToOne(GCOMPLEX(floatT) &corr) { corr = GPUcomplex(1., 0.); } @@ -229,14 +229,14 @@ class CorrelatorTools { void readNorm(std::string domain, Correlator &normalization, std::string normFileDir); /// Displacement vector de-indexing. 
- inline __host__ __device__ void indexToSpaceTimeDisplacement(size_t dindex, int &dx, int &dy, int &dz, int &dt) { + inline HOST_DEVICE void indexToSpaceTimeDisplacement(size_t dindex, int &dx, int &dy, int &dz, int &dt) { int rem2, rem1; divmod(dindex,svol3,dt,rem2); divmod(rem2 ,svol2,dz,rem1); divmod(rem1 ,svol1,dy,dx); } - inline __host__ __device__ void indexToSpatialDisplacement(size_t dindex, int &dx, int &dy, int &dz) { + inline HOST_DEVICE void indexToSpatialDisplacement(size_t dindex, int &dx, int &dy, int &dz) { int rem; divmod(dindex,svol2,dz,rem); divmod(rem ,svol1,dy,dx); @@ -375,7 +375,7 @@ class CorrelationDegeneracies : public LatticeContainer, public /// Trivial read index, in case you need/want to do indexing inside the Kernel. TODO: Probably should be in indexer? struct PassIndex { - inline __host__ __device__ size_t operator()(const dim3& blockDim, const uint3& blockIdx, const uint3& threadIdx) { + inline HOST_DEVICE size_t operator()(const dim3& blockDim, const uint3& blockIdx, const uint3& threadIdx) { return blockDim.x * blockIdx.x + threadIdx.x; } }; @@ -383,7 +383,7 @@ struct PassIndex { /// For fields that depend on x. template struct ReadIndexSpacetime { - inline __host__ __device__ gSite operator()(const dim3& blockDim, const uint3& blockIdx, const uint3& threadIdx) { + inline HOST_DEVICE gSite operator()(const dim3& blockDim, const uint3& blockIdx, const uint3& threadIdx) { size_t i = blockDim.x * blockIdx.x + threadIdx.x; typedef GIndexer GInd; gSite site = GInd::getSite(i); @@ -394,7 +394,7 @@ struct ReadIndexSpacetime { /// For fields that depend on spatial x. 
template struct ReadIndexSpatial { - inline __host__ __device__ gSite operator()(const dim3& blockDim, const uint3& blockIdx, const uint3& threadIdx) { + inline HOST_DEVICE gSite operator()(const dim3& blockDim, const uint3& blockIdx, const uint3& threadIdx) { size_t i = blockDim.x * blockIdx.x + threadIdx.x; typedef GIndexer GInd; gSite site = GInd::getSiteSpatial(i); @@ -416,13 +416,13 @@ struct ReadIndexSpatial { template class AxB { public: - __host__ __device__ floatT inline orrelate(floatT A, floatT B) { + HOST_DEVICE floatT inline orrelate(floatT A, floatT B) { return A*B; } - __host__ __device__ GCOMPLEX(floatT) inline orrelate(GCOMPLEX(floatT) A, GCOMPLEX(floatT) B) { + HOST_DEVICE GCOMPLEX(floatT) inline orrelate(GCOMPLEX(floatT) A, GCOMPLEX(floatT) B) { return A*B; } - __host__ __device__ GCOMPLEX(floatT) inline orrelate(GSU3 A, GSU3 B) { + HOST_DEVICE GCOMPLEX(floatT) inline orrelate(GSU3 A, GSU3 B) { return tr_c(A*B); } }; @@ -430,7 +430,7 @@ class AxB { template class trAxtrBt { public: - __host__ __device__ GCOMPLEX(floatT) inline orrelate(GSU3 A, GSU3 B) { + HOST_DEVICE GCOMPLEX(floatT) inline orrelate(GSU3 A, GSU3 B) { return tr_c(A)*tr_c(dagger(B)); } }; @@ -438,7 +438,7 @@ class trAxtrBt { template class trReAxtrReB { public: - __host__ __device__ floatT inline orrelate(GSU3 A, GSU3 B) { + HOST_DEVICE floatT inline orrelate(GSU3 A, GSU3 B) { return tr_d(A)*tr_d(B); } }; @@ -446,7 +446,7 @@ class trReAxtrReB { template class trImAxtrImB { public: - __host__ __device__ floatT inline orrelate(GSU3 A, GSU3 B) { + HOST_DEVICE floatT inline orrelate(GSU3 A, GSU3 B) { return tr_i(A)*tr_i(B); } }; @@ -454,7 +454,7 @@ class trImAxtrImB { template class trAxBt { public: - __host__ __device__ GCOMPLEX(floatT) inline orrelate(GSU3 A, GSU3 B) { + HOST_DEVICE GCOMPLEX(floatT) inline orrelate(GSU3 A, GSU3 B) { return tr_c(A*dagger(B)); } }; @@ -462,21 +462,21 @@ class trAxBt { template class polCorrAVG { public: - __host__ __device__ floatT inline orrelate(GSU3 
A, GSU3 B) { + HOST_DEVICE floatT inline orrelate(GSU3 A, GSU3 B) { return real(tr_c(A)*tr_c(dagger(B)))/9.; } }; template class polCorrSIN { public: - __host__ __device__ floatT inline orrelate(GSU3 A, GSU3 B) { + HOST_DEVICE floatT inline orrelate(GSU3 A, GSU3 B) { return tr_d(A,dagger(B))/3.; } }; template class polCorrOCT { public: - __host__ __device__ floatT inline orrelate(GSU3 A, GSU3 B) { + HOST_DEVICE floatT inline orrelate(GSU3 A, GSU3 B) { floatT avg = real(tr_c(A)*tr_c(dagger(B))); floatT sin = tr_d(A,dagger(B)); return (0.125*avg - 0.04166666666*sin); @@ -499,7 +499,7 @@ struct SpacetimePairKernelSymm : CorrelatorTools { SpacetimePairKernelSymm(LatticeContainerAccessor field1, LatticeContainerAccessor field2, LatticeContainerAccessor field1Xfield2, size_t dindex) : _field1(field1), _field2(field2), _field1Xfield2(field1Xfield2), _dindex(dindex), CorrelatorTools() {} - __device__ __host__ void operator()(gSite site) { + HOST_DEVICE void operator()(gSite site) { typedef GIndexer GInd; size_t m, n; fieldType field1m, field2n; @@ -568,7 +568,7 @@ struct SpacetimePairKernel : CorrelatorTools { SpacetimePairKernel(LatticeContainerAccessor field1, LatticeContainerAccessor field2, LatticeContainerAccessor field1Xfield2, size_t dindex) : _field1(field1), _field2(field2), _field1Xfield2(field1Xfield2), _dindex(dindex), CorrelatorTools() {} - __device__ __host__ void operator()(gSite site) { + HOST_DEVICE void operator()(gSite site) { typedef GIndexer GInd; size_t m, n; fieldType field1m, field2n; @@ -670,7 +670,7 @@ struct SpatialPairKernelSymm : CorrelatorTools { SpatialPairKernelSymm(LatticeContainerAccessor field1, LatticeContainerAccessor field2, LatticeContainerAccessor field1Xfield2, size_t dindex) : _field1(field1), _field2(field2), _field1Xfield2(field1Xfield2), _dindex(dindex), CorrelatorTools() {} - __device__ __host__ void operator()(gSite site) { + HOST_DEVICE void operator()(gSite site) { typedef GIndexer GInd; size_t m, n; fieldType field1m, 
field2n; @@ -722,7 +722,7 @@ struct SpatialPairKernel : CorrelatorTools { SpatialPairKernel(LatticeContainerAccessor field1, LatticeContainerAccessor field2, LatticeContainerAccessor field1Xfield2, size_t dindex) : _field1(field1), _field2(field2), _field1Xfield2(field1Xfield2), _dindex(dindex), CorrelatorTools() {} - __device__ __host__ void operator()(gSite site) { + HOST_DEVICE void operator()(gSite site) { typedef GIndexer GInd; size_t m, n; fieldType field1m, field2n; @@ -797,7 +797,7 @@ struct RestrictedOffAxisKernel : CorrelatorTools { /// direction. (Backward correlations will be counted from the forward correlation of some other m.) A possible /// displacement is (1,0,0); therefore some on-axis correlations are computed already in the off-axis kernel. This /// is taken into account in the on-axis kernel. - __device__ __host__ void operator()(size_t dindex) { /// dindex indexes displacement vector + HOST_DEVICE void operator()(size_t dindex) { /// dindex indexes displacement vector typedef GIndexer GInd; size_t m,n1,n2,n3,n4; @@ -855,7 +855,7 @@ struct RestrictedOnAxisKernel : CorrelatorTools { : _field1(field1), _field2(field2), _field1Xfield2off(field1Xfield2off), _field1Xfield2on(field1Xfield2on), CorrelatorTools() {} - __device__ __host__ void operator()(size_t dx){ /// Now dx corresponds to a separation, rather than a displacement + HOST_DEVICE void operator()(size_t dx){ /// Now dx corresponds to a separation, rather than a displacement typedef GIndexer GInd; size_t m,n1,n2,n3; diff --git a/src/base/math/floatComparison.h b/src/base/math/floatComparison.h index a57e9096..69a3703a 100644 --- a/src/base/math/floatComparison.h +++ b/src/base/math/floatComparison.h @@ -15,7 +15,7 @@ /// This can be used on the GPU. 
template -__host__ __device__ bool cmp_rel(const T a, const T b, const double rel, const double prec) { +HOST_DEVICE bool cmp_rel(const T a, const T b, const double rel, const double prec) { if (abs(a-b) / abs(a+b) < rel && abs(a-b) < prec) { return true; } @@ -25,7 +25,7 @@ __host__ __device__ bool cmp_rel(const T a, const T b, const double rel, const d /// Implements relative method - do not use for comparing with zero. Use this most of the time, tolerance needs to /// be meaningful in your context. template -__host__ __device__ static bool isApproximatelyEqual(const TReal a, const TReal b, const TReal tolerance = std::numeric_limits::epsilon()) +HOST_DEVICE static bool isApproximatelyEqual(const TReal a, const TReal b, const TReal tolerance = std::numeric_limits::epsilon()) { TReal diff = std::fabs(a - b); if (diff <= tolerance) diff --git a/src/base/math/gaugeAccessor.h b/src/base/math/gaugeAccessor.h index 3cae434c..bf291048 100644 --- a/src/base/math/gaugeAccessor.h +++ b/src/base/math/gaugeAccessor.h @@ -9,7 +9,7 @@ #define BACKWARD_CONST 16 -__host__ __device__ inline int Back(const int i) { +HOST_DEVICE inline int Back(const int i) { return i + BACKWARD_CONST; } @@ -23,62 +23,62 @@ class gaugeAccessor : public GaugeConstructor { : GaugeConstructor(elements) {} /// Constructor for one memory chunk, where all entries are separated by object_count - __host__ __device__ explicit gaugeAccessor(GCOMPLEX(floatT_memory) *elementsBase, size_t object_count) + HOST_DEVICE explicit gaugeAccessor(GCOMPLEX(floatT_memory) *elementsBase, size_t object_count) : GaugeConstructor(elementsBase, object_count) {} explicit gaugeAccessor() : GaugeConstructor() {} template - __host__ __device__ inline GSU3 getElement(const gSiteMu &siteMu) const { + HOST_DEVICE inline GSU3 getElement(const gSiteMu &siteMu) const { return static_cast>(this->reconstruct(siteMu)); } template - __host__ __device__ inline size_t getIndexComm(size_t isiteFull, size_t mu) const { + HOST_DEVICE inline 
size_t getIndexComm(size_t isiteFull, size_t mu) const { gSiteMu siteMu = GIndexer::getSiteMuFull(isiteFull, mu); return siteMu.indexMuFull; } template - __host__ __device__ inline GSU3 getElementComm(size_t isiteFull, size_t mu) const { + HOST_DEVICE inline GSU3 getElementComm(size_t isiteFull, size_t mu) const { gSiteMu siteMu = GIndexer::getSiteMuFull(isiteFull, mu); return getElement(siteMu); } template - __host__ __device__ inline void setElementComm(size_t isiteFull, const GSU3& mat) { + HOST_DEVICE inline void setElementComm(size_t isiteFull, const GSU3& mat) { gSiteMu siteMu; siteMu.indexMuFull = isiteFull; setElement(siteMu, mat); } template - __host__ __device__ inline void setElement(const gSiteMu &siteMu, const GSU3 &mat) { + HOST_DEVICE inline void setElement(const gSiteMu &siteMu, const GSU3 &mat) { this->construct(siteMu, static_cast>(mat)); } template - __host__ __device__ inline GSU3 getLink(const gSiteMu &siteMu) const { + HOST_DEVICE inline GSU3 getLink(const gSiteMu &siteMu) const { return static_cast>(this->reconstruct(siteMu)); } template - __host__ __device__ inline GSU3 getLinkDagger(const gSiteMu &siteMu) const { + HOST_DEVICE inline GSU3 getLinkDagger(const gSiteMu &siteMu) const { return static_cast>(this->reconstructDagger(siteMu)); } template - __host__ __device__ inline void setLink(const gSiteMu &siteMu, GSU3 mat) { + HOST_DEVICE inline void setLink(const gSiteMu &siteMu, GSU3 mat) { this->construct(siteMu, static_cast>(mat)); } template - __host__ __device__ inline GSU3 operator()(const gSiteMu &siteMu) const { + HOST_DEVICE inline GSU3 operator()(const gSiteMu &siteMu) const { return static_cast>(this->reconstruct(siteMu)); } template - __host__ __device__ inline GSU3 getLinkPath(gSite &site, int dir) const { + HOST_DEVICE inline GSU3 getLinkPath(gSite &site, int dir) const { typedef GIndexer GInd; @@ -103,7 +103,7 @@ class gaugeAccessor : public GaugeConstructor { template - __host__ __device__ inline GSU3 getLinkPath(gSite &site, 
int dir, Args... args) const { + HOST_DEVICE inline GSU3 getLinkPath(gSite &site, int dir, Args... args) const { typedef GIndexer GInd; @@ -128,7 +128,7 @@ class gaugeAccessor : public GaugeConstructor { } template - __host__ __device__ inline GSU3 getLinkPath(gSiteMu &siteMu, int dir, Args... args) const { + HOST_DEVICE inline GSU3 getLinkPath(gSiteMu &siteMu, int dir, Args... args) const { typedef GIndexer GInd; gSite site = siteMu; diff --git a/src/base/math/gaugeConstructor.h b/src/base/math/gaugeConstructor.h index e7ddd6c6..be1cfb1c 100644 --- a/src/base/math/gaugeConstructor.h +++ b/src/base/math/gaugeConstructor.h @@ -31,7 +31,7 @@ struct GaugeConstructor : public GeneralAccessor::count >(elements) { } /// Constructor for one memory chunk, where all entries are separated by object_count - __host__ __device__ explicit GaugeConstructor(GCOMPLEX(floatT_memory) *elementsBase, size_t object_count) + HOST_DEVICE explicit GaugeConstructor(GCOMPLEX(floatT_memory) *elementsBase, size_t object_count) : GeneralAccessor::count >(elementsBase, object_count){ } explicit GaugeConstructor() : GeneralAccessor::count >(){ } @@ -44,12 +44,12 @@ struct GaugeConstructor : public GeneralAccessor::count]) : GeneralAccessor::count>(elements) {} - __host__ __device__ explicit GaugeConstructor(GCOMPLEX(floatT_memory) *elementsBase, size_t object_count) + HOST_DEVICE explicit GaugeConstructor(GCOMPLEX(floatT_memory) *elementsBase, size_t object_count) : GeneralAccessor::count >(elementsBase, object_count) {} explicit GaugeConstructor() : GeneralAccessor::count>() {} - __host__ __device__ inline void setEntriesComm(GaugeConstructor &src_acc, + HOST_DEVICE inline void setEntriesComm(GaugeConstructor &src_acc, size_t setIndex, size_t getIndex) { this->template setElementEntry(setIndex, src_acc.template getElementEntry(getIndex)); this->template setElementEntry(setIndex, src_acc.template getElementEntry(getIndex)); @@ -62,7 +62,7 @@ struct GaugeConstructor : public 
GeneralAccessortemplate setElementEntry(setIndex, src_acc.template getElementEntry(getIndex)); } - __host__ __device__ inline void construct(const gSiteMu& idx, const GSU3 &mat) { + HOST_DEVICE inline void construct(const gSiteMu& idx, const GSU3 &mat) { this->template setElementEntry(idx.indexMuFull, mat.getLink00()); this->template setElementEntry(idx.indexMuFull, mat.getLink01()); this->template setElementEntry(idx.indexMuFull, mat.getLink02()); @@ -74,7 +74,7 @@ struct GaugeConstructor : public GeneralAccessortemplate setElementEntry(idx.indexMuFull, mat.getLink22()); } - __host__ __device__ inline GSU3 reconstruct(const gSiteMu& idx) const { + HOST_DEVICE inline GSU3 reconstruct(const gSiteMu& idx) const { GSU3 ret( this->template getElementEntry(idx.indexMuFull), this->template getElementEntry(idx.indexMuFull), @@ -88,7 +88,7 @@ struct GaugeConstructor : public GeneralAccessor reconstructDagger(const gSiteMu& idx) const { + HOST_DEVICE inline GSU3 reconstructDagger(const gSiteMu& idx) const { return GSU3(conj(this->template getElementEntry(idx.indexMuFull)), conj(this->template getElementEntry(idx.indexMuFull)), conj(this->template getElementEntry(idx.indexMuFull)), @@ -108,13 +108,13 @@ struct GaugeConstructor : public GeneralAccessor::count]) : GeneralAccessor::count >(elements) { } - __host__ __device__ explicit GaugeConstructor(GCOMPLEX(floatT_memory) *elementsBase, size_t object_count) + HOST_DEVICE explicit GaugeConstructor(GCOMPLEX(floatT_memory) *elementsBase, size_t object_count) : GeneralAccessor::count >(elementsBase, object_count){ } explicit GaugeConstructor() : GeneralAccessor::count >(){ } - __host__ __device__ inline void setEntriesComm(GaugeConstructor &src_acc, + HOST_DEVICE inline void setEntriesComm(GaugeConstructor &src_acc, size_t setIndex, size_t getIndex) { this->template setElementEntry(setIndex, src_acc.template getElementEntry(getIndex)); this->template setElementEntry(setIndex, src_acc.template getElementEntry(getIndex)); @@ -125,7 
+125,7 @@ struct GaugeConstructor : public GeneralAccessortemplate setElementEntry(setIndex, src_acc.template getElementEntry(getIndex)); } - __host__ __device__ inline void construct(const gSiteMu& idx, const GSU3 &mat) { + HOST_DEVICE inline void construct(const gSiteMu& idx, const GSU3 &mat) { this->template setElementEntry(idx.indexMuFull, mat.getLink00()); this->template setElementEntry(idx.indexMuFull, mat.getLink01()); this->template setElementEntry(idx.indexMuFull, mat.getLink02()); @@ -135,7 +135,7 @@ struct GaugeConstructor : public GeneralAccessortemplate setElementEntry(idx.indexMuFull, det(mat)); } - __host__ __device__ inline GSU3 reconstruct(const gSiteMu& idx) const { + HOST_DEVICE inline GSU3 reconstruct(const gSiteMu& idx) const { GSU3 ret( this->template getElementEntry(idx.indexMuFull), this->template getElementEntry(idx.indexMuFull), @@ -150,7 +150,7 @@ struct GaugeConstructor : public GeneralAccessor reconstructDagger(const gSiteMu& idx) const { + HOST_DEVICE inline GSU3 reconstructDagger(const gSiteMu& idx) const { GSU3 tmp = GSU3( conj(this->template getElementEntry(idx.indexMuFull)), conj(this->template getElementEntry(idx.indexMuFull)), @@ -175,12 +175,12 @@ struct GaugeConstructor : public GeneralAccessor::count]) : GeneralAccessor::count >(elements) { } - __host__ __device__ explicit GaugeConstructor(GCOMPLEX(floatT_memory) *elementsBase, size_t object_count) + HOST_DEVICE explicit GaugeConstructor(GCOMPLEX(floatT_memory) *elementsBase, size_t object_count) : GeneralAccessor::count >(elementsBase, object_count) { } explicit GaugeConstructor() : GeneralAccessor::count>() { } - __host__ __device__ inline void setEntriesComm(GaugeConstructor &src_acc, + HOST_DEVICE inline void setEntriesComm(GaugeConstructor &src_acc, size_t setIndex, size_t getIndex) { this->template setElementEntry(setIndex, src_acc.template getElementEntry(getIndex)); this->template setElementEntry(setIndex, src_acc.template getElementEntry(getIndex)); @@ -191,7 +191,7 
@@ struct GaugeConstructor : public GeneralAccessortemplate setElementEntry(setIndex, src_acc.template getElementEntry(getIndex)); } - __host__ __device__ inline void construct(const gSiteMu& idx, const GSU3 &mat) { + HOST_DEVICE inline void construct(const gSiteMu& idx, const GSU3 &mat) { this->template setElementEntry(idx.indexMuFull, mat.getLink00()); this->template setElementEntry(idx.indexMuFull, mat.getLink01()); this->template setElementEntry(idx.indexMuFull, mat.getLink02()); @@ -201,7 +201,7 @@ struct GaugeConstructor : public GeneralAccessortemplate setElementEntry(idx.indexMuFull, det(mat)); } - __host__ __device__ inline GSU3 reconstruct(const gSiteMu& idx) const { + HOST_DEVICE inline GSU3 reconstruct(const gSiteMu& idx) const { GSU3 ret( this->template getElementEntry(idx.indexMuFull), this->template getElementEntry(idx.indexMuFull), @@ -216,7 +216,7 @@ struct GaugeConstructor : public GeneralAccessor reconstructDagger(const gSiteMu& idx) const { + HOST_DEVICE inline GSU3 reconstructDagger(const gSiteMu& idx) const { GSU3 tmp = GSU3( conj(this->template getElementEntry(idx.indexMuFull)), conj(this->template getElementEntry(idx.indexMuFull)), @@ -240,12 +240,12 @@ struct GaugeConstructor : public GeneralAccessor::count]) : GeneralAccessor::count >(elements) { } - __host__ __device__ explicit GaugeConstructor(GCOMPLEX(floatT_memory) *elementsBase, size_t object_count) + HOST_DEVICE explicit GaugeConstructor(GCOMPLEX(floatT_memory) *elementsBase, size_t object_count) : GeneralAccessor::count>(elementsBase, object_count) { } explicit GaugeConstructor() : GeneralAccessor::count>() { } - __host__ __device__ inline void setEntriesComm(GaugeConstructor &src_acc, + HOST_DEVICE inline void setEntriesComm(GaugeConstructor &src_acc, size_t setIndex, size_t getIndex) { this->template setElementEntry(setIndex, src_acc.template getElementEntry(getIndex)); this->template setElementEntry(setIndex, src_acc.template getElementEntry(getIndex)); @@ -255,7 +255,7 @@ struct 
GaugeConstructor : public GeneralAccessortemplate setElementEntry(setIndex, src_acc.template getElementEntry(getIndex)); } - __host__ __device__ inline void construct(const gSiteMu& idx, const GSU3 &mat) { + HOST_DEVICE inline void construct(const gSiteMu& idx, const GSU3 &mat) { this->template setElementEntry(idx.indexMuFull, mat.getLink00()); this->template setElementEntry(idx.indexMuFull, mat.getLink01()); this->template setElementEntry(idx.indexMuFull, mat.getLink02()); @@ -264,7 +264,7 @@ struct GaugeConstructor : public GeneralAccessortemplate setElementEntry(idx.indexMuFull, mat.getLink12()); } - __host__ __device__ inline GSU3 reconstruct(const gSiteMu& idx) const { + HOST_DEVICE inline GSU3 reconstruct(const gSiteMu& idx) const { GSU3 ret( this->template getElementEntry(idx.indexMuFull), this->template getElementEntry(idx.indexMuFull), @@ -278,7 +278,7 @@ struct GaugeConstructor : public GeneralAccessor reconstructDagger(const gSiteMu& idx) const { + HOST_DEVICE inline GSU3 reconstructDagger(const gSiteMu& idx) const { GSU3 tmp = GSU3( conj(this->template getElementEntry(idx.indexMuFull)), conj(this->template getElementEntry(idx.indexMuFull)), @@ -309,12 +309,12 @@ struct GaugeConstructor : public GeneralAccessor::count>(elements) { throw std::runtime_error(stdLogger.fatal("STAGG_R12 should not be used at the moment")); } - __host__ __device__ explicit GaugeConstructor(GCOMPLEX(floatT_memory) *elementsBase, size_t object_count) + HOST_DEVICE explicit GaugeConstructor(GCOMPLEX(floatT_memory) *elementsBase, size_t object_count) : GeneralAccessor::count >(elementsBase, object_count) { } explicit GaugeConstructor() : GeneralAccessor::count>() { } - __host__ __device__ inline void setEntriesComm(GaugeConstructor &src_acc, + HOST_DEVICE inline void setEntriesComm(GaugeConstructor &src_acc, size_t setIndex, size_t getIndex) { this->template setElementEntry(setIndex, src_acc.template getElementEntry(getIndex)); this->template setElementEntry(setIndex, 
src_acc.template getElementEntry(getIndex)); @@ -324,7 +324,7 @@ struct GaugeConstructor : public GeneralAccessortemplate setElementEntry(setIndex, src_acc.template getElementEntry(getIndex)); } - __host__ __device__ inline void construct(const gSiteMu& idx, const GSU3 &mat) { + HOST_DEVICE inline void construct(const gSiteMu& idx, const GSU3 &mat) { this->template setElementEntry(idx.indexMuFull, mat.getLink00()); this->template setElementEntry(idx.indexMuFull, mat.getLink01()); this->template setElementEntry(idx.indexMuFull, mat.getLink02()); @@ -333,7 +333,7 @@ struct GaugeConstructor : public GeneralAccessortemplate setElementEntry(idx.indexMuFull, mat.getLink12()); } - __host__ __device__ inline GSU3 reconstruct(const gSiteMu& idx) const { + HOST_DEVICE inline GSU3 reconstruct(const gSiteMu& idx) const { GSU3 ret( this->template getElementEntry(idx.indexMuFull), this->template getElementEntry(idx.indexMuFull), @@ -348,7 +348,7 @@ struct GaugeConstructor : public GeneralAccessor reconstructDagger(const gSiteMu& idx) const { + HOST_DEVICE inline GSU3 reconstructDagger(const gSiteMu& idx) const { GSU3 tmp = GSU3( conj(this->template getElementEntry(idx.indexMuFull)), conj(this->template getElementEntry(idx.indexMuFull)), diff --git a/src/base/math/gcomplex.h b/src/base/math/gcomplex.h index ae21fb0f..71c40a8f 100644 --- a/src/base/math/gcomplex.h +++ b/src/base/math/gcomplex.h @@ -17,6 +17,7 @@ #ifndef SP_COMPLEX_HCU #define SP_COMPLEX_HCU + #include "../wrapper/gpu_wrapper.h" #include "floatComparison.h" #include @@ -34,10 +35,12 @@ template <> class Selector { public: using Type = double2; }; +#ifndef USE_CPU_ONLY template <> class Selector<__half> { public: using Type = __half2; }; +#endif /** * A utility class to provide complex numbers for operation @@ -46,224 +49,225 @@ template <> class Selector<__half> { template ::Type> class GPUcomplex { public: - floatT2 c; + floatT2 c; #define cREAL c.x #define cIMAG c.y + /** * Default constructor, leave values 
uninitialized. */ - __host__ __device__ GPUcomplex(){}; + HOST_DEVICE GPUcomplex(){}; constexpr GPUcomplex(const GPUcomplex &) = default; /** * Utility constructor, creates class from given real and imaginary value */ - __host__ __device__ GPUcomplex(const floatT &real, const floatT &imag) { - c.x = real; - c.y = imag; + HOST_DEVICE GPUcomplex(const floatT &real, const floatT &imag) { + cREAL = real; + cIMAG = imag; }; /** * Utility constructor, creates class from real value, assumes imaginary value * to be zero. */ - __host__ __device__ GPUcomplex(const floatT &real) { - c.x = real; - c.y = 0.0f; + HOST_DEVICE GPUcomplex(const floatT &real) { + cREAL = real; + cIMAG = 0.0f; }; - __host__ GPUcomplex(const std::complex &orig) { - c.x = std::real(orig); - c.y = std::imag(orig); + HOST GPUcomplex(const std::complex &orig) { + cREAL = std::real(orig); + cIMAG = std::imag(orig); } - __host__ GPUcomplex(const std::complex &orig) { - c.x = std::real(orig); - c.y = std::imag(orig); + HOST GPUcomplex(const std::complex &orig) { + cREAL = std::real(orig); + cIMAG = std::imag(orig); } - __host__ __device__ GPUcomplex &operator=(const GPUcomplex &orig) { + HOST_DEVICE GPUcomplex &operator=(const GPUcomplex &orig) { this->c = static_cast(orig.c); return *this; } - __host__ __device__ GPUcomplex &operator=(const GPUcomplex &orig) { + HOST_DEVICE GPUcomplex &operator=(const GPUcomplex &orig) { this->c = static_cast(orig.c); return *this; } - __host__ __device__ GPUcomplex &operator=(const floatT &orig) { - this->c.x = orig; - this->c.y = 0.0f; + HOST_DEVICE GPUcomplex &operator=(const floatT &orig) { + this->cREAL = orig; + this->cIMAG = 0.0f; return *this; } - __host__ __device__ GPUcomplex &operator+=(const GPUcomplex &op) { - this->c.x += op.c.x; - this->c.y += op.c.y; + HOST_DEVICE GPUcomplex &operator+=(const GPUcomplex &op) { + this->cREAL += op.cREAL; + this->cIMAG += op.cIMAG; return *this; } - __host__ __device__ GPUcomplex &operator+=(const floatT &op) { - this->c.x += 
op; + HOST_DEVICE GPUcomplex &operator+=(const floatT &op) { + this->cREAL += op; return *this; } - __host__ __device__ GPUcomplex &operator-=(const GPUcomplex &op) { - this->c.x -= op.c.x; - this->c.y -= op.c.y; + HOST_DEVICE GPUcomplex &operator-=(const GPUcomplex &op) { + this->cREAL -= op.cREAL; + this->cIMAG -= op.cIMAG; return *this; } - __host__ __device__ GPUcomplex &operator-=(const floatT &op) { - this->c.x -= op; + HOST_DEVICE GPUcomplex &operator-=(const floatT &op) { + this->cREAL -= op; return *this; } - __host__ __device__ GPUcomplex &operator*=(const GPUcomplex &op) { - floatT newReal = this->c.x * op.c.x - this->c.y * op.c.y; - this->c.y = this->c.x * op.c.y + this->c.y * op.c.x; - this->c.x = newReal; + HOST_DEVICE GPUcomplex &operator*=(const GPUcomplex &op) { + floatT newReal = this->cREAL * op.cREAL - this->cIMAG * op.cIMAG; + this->cIMAG = this->cREAL * op.cIMAG + this->cIMAG * op.cREAL; + this->cREAL = newReal; return *this; } - __host__ __device__ GPUcomplex &operator*=(const floatT &op) { - this->c.x *= op; - this->c.y *= op; + HOST_DEVICE GPUcomplex &operator*=(const floatT &op) { + this->cREAL *= op; + this->cIMAG *= op; return *this; } - __host__ __device__ GPUcomplex &operator/=(const floatT &op) { - this->c.x /= op; - this->c.y /= op; + HOST_DEVICE GPUcomplex &operator/=(const floatT &op) { + this->cREAL /= op; + this->cIMAG /= op; return *this; } /// Note: You should not use this operator to compare with zero, because /// cmp_rel breaks down in that case. - __host__ __device__ bool operator==(const GPUcomplex &op) { + HOST_DEVICE bool operator==(const GPUcomplex &op) { ////TODO:: THAT PRECISION HAS TO BE CHANGED!! 
- return (cmp_rel(this->c.x, op.c.x, 1.e-6, 1.e-6) && - cmp_rel(this->c.y, op.c.y, 1.e-6, 1.e-6)); - // return (isApproximatelyEqual(this->c.x, op.c.x, 1.e-14) && - //isApproximatelyEqual(this->c.y, op.c.y, 1.e-14)); + return (cmp_rel(this->cREAL, op.cREAL, 1.e-6, 1.e-6) && + cmp_rel(this->cIMAG, op.cIMAG, 1.e-6, 1.e-6)); + // return (isApproximatelyEqual(this->cREAL, op.cREAL, 1.e-14) && + //isApproximatelyEqual(this->cIMAG, op.cIMAG, 1.e-14)); } - __host__ __device__ friend GPUcomplex operator+(const GPUcomplex &left, + HOST_DEVICE friend GPUcomplex operator+(const GPUcomplex &left, const GPUcomplex &right) { - return GPUcomplex(left.c.x + right.c.x, left.c.y + right.c.y); + return GPUcomplex(left.cREAL + right.cREAL, left.cIMAG + right.cIMAG); } - __host__ __device__ friend GPUcomplex operator+(const GPUcomplex &left, + HOST_DEVICE friend GPUcomplex operator+(const GPUcomplex &left, const floatT &right) { - return GPUcomplex(left.c.x + right, left.c.y); + return GPUcomplex(left.cREAL + right, left.cIMAG); } - __host__ __device__ friend GPUcomplex operator+(const floatT &left, + HOST_DEVICE friend GPUcomplex operator+(const floatT &left, const GPUcomplex &right) { - return GPUcomplex(left + right.c.x, right.c.y); + return GPUcomplex(left + right.cREAL, right.cIMAG); } - __host__ __device__ friend GPUcomplex operator-(const GPUcomplex &op) { - return GPUcomplex(-op.c.x, -op.c.y); + HOST_DEVICE friend GPUcomplex operator-(const GPUcomplex &op) { + return GPUcomplex(-op.cREAL, -op.cIMAG); } - __host__ __device__ friend GPUcomplex operator-(const GPUcomplex &left, + HOST_DEVICE friend GPUcomplex operator-(const GPUcomplex &left, const GPUcomplex &right) { - return GPUcomplex(left.c.x - right.c.x, left.c.y - right.c.y); + return GPUcomplex(left.cREAL - right.cREAL, left.cIMAG - right.cIMAG); } - __host__ __device__ friend GPUcomplex operator-(const GPUcomplex &left, + HOST_DEVICE friend GPUcomplex operator-(const GPUcomplex &left, const floatT &right) { - return 
GPUcomplex(left.c.x - right, left.c.y); + return GPUcomplex(left.cREAL - right, left.cIMAG); } - __host__ __device__ friend GPUcomplex operator-(const floatT &left, + HOST_DEVICE friend GPUcomplex operator-(const floatT &left, const GPUcomplex &right) { - return GPUcomplex(left - right.c.x, -right.c.y); + return GPUcomplex(left - right.cREAL, -right.cIMAG); } - __host__ __device__ friend GPUcomplex operator*(const GPUcomplex &left, + HOST_DEVICE friend GPUcomplex operator*(const GPUcomplex &left, const GPUcomplex &right) { - floatT newReal = left.c.x * right.c.x - left.c.y * right.c.y; - floatT newImag = left.c.x * right.c.y + left.c.y * right.c.x; + floatT newReal = left.cREAL * right.cREAL - left.cIMAG * right.cIMAG; + floatT newImag = left.cREAL * right.cIMAG + left.cIMAG * right.cREAL; return GPUcomplex(newReal, newImag); } - __host__ __device__ friend GPUcomplex operator*(const GPUcomplex &left, + HOST_DEVICE friend GPUcomplex operator*(const GPUcomplex &left, const floatT &right) { - return GPUcomplex(left.c.x * right, left.c.y * right); + return GPUcomplex(left.cREAL * right, left.cIMAG * right); } - __host__ __device__ friend GPUcomplex operator*(const floatT &left, + HOST_DEVICE friend GPUcomplex operator*(const floatT &left, const GPUcomplex &right) { - return GPUcomplex(left * right.c.x, left * right.c.y); + return GPUcomplex(left * right.cREAL, left * right.cIMAG); } - __host__ __device__ friend GPUcomplex + HOST_DEVICE friend GPUcomplex fma(const GPUcomplex &x, const GPUcomplex &y, const GPUcomplex &d) { floatT real_res; floatT imag_res; - real_res = (x.c.x * y.c.x) + d.c.x; - imag_res = (x.c.x * y.c.y) + d.c.y; + real_res = (x.cREAL * y.cREAL) + d.cREAL; + imag_res = (x.cREAL * y.cIMAG) + d.cIMAG; - real_res = -(x.c.y * y.c.y) + real_res; - imag_res = (x.c.y * y.c.x) + imag_res; + real_res = -(x.cIMAG * y.cIMAG) + real_res; + imag_res = (x.cIMAG * y.cREAL) + imag_res; return GPUcomplex(real_res, imag_res); } - __host__ __device__ friend GPUcomplex 
fma(const floatT x, const GPUcomplex &y, + HOST_DEVICE friend GPUcomplex fma(const floatT x, const GPUcomplex &y, const GPUcomplex &d) { floatT real_res; floatT imag_res; - real_res = (x * y.c.x) + d.c.x; - imag_res = (x * y.c.y) + d.c.y; + real_res = (x * y.cREAL) + d.cREAL; + imag_res = (x * y.cIMAG) + d.cIMAG; return GPUcomplex(real_res, imag_res); } - __host__ __device__ void addProduct(const GPUcomplex &x, + HOST_DEVICE void addProduct(const GPUcomplex &x, const GPUcomplex &y) { - this->c.x = (x.c.x * y.c.x) + this->c.x; - this->c.y = (x.c.x * y.c.y) + this->c.y; + this->cREAL = (x.cREAL * y.cREAL) + this->cREAL; + this->cIMAG = (x.cREAL * y.cIMAG) + this->cIMAG; - this->c.x = -(x.c.y * y.c.y) + this->c.x; - this->c.y = (x.c.y * y.c.x) + this->c.y; + this->cREAL = -(x.cIMAG * y.cIMAG) + this->cREAL; + this->cIMAG = (x.cIMAG * y.cREAL) + this->cIMAG; return; } - __host__ __device__ void addProduct(const floatT &x, const GPUcomplex &y) { - this->c.x = (x * y.c.x) + this->c.x; - this->c.y = (x * y.c.y) + this->c.y; + HOST_DEVICE void addProduct(const floatT &x, const GPUcomplex &y) { + this->cREAL = (x * y.cREAL) + this->cREAL; + this->cIMAG = (x * y.cIMAG) + this->cIMAG; return; } template - __host__ __device__ friend GPUcomplex operator/(const GPUcomplex &left, + HOST_DEVICE friend GPUcomplex operator/(const GPUcomplex &left, const T &right) { - return GPUcomplex(left.c.x / right, left.c.y / right); + return GPUcomplex(left.cREAL / right, left.cIMAG / right); } template - __host__ __device__ friend GPUcomplex operator/(const T &left, + HOST_DEVICE friend GPUcomplex operator/(const T &left, const GPUcomplex &right) { return GPUcomplex( - left * right.c.x / (right.c.x * right.c.x + right.c.y * right.c.y), - -left * right.c.y / (right.c.x * right.c.x + right.c.y * right.c.y)); + left * right.cREAL / (right.cREAL * right.cREAL + right.cIMAG * right.cIMAG), + -left * right.cIMAG / (right.cREAL * right.cREAL + right.cIMAG * right.cIMAG)); } - __host__ __device__ 
inline static GPUcomplex invalid(); + HOST_DEVICE inline static GPUcomplex invalid(); // These are needed to make sure that dp_complex may be part in general // operators src/math/operators.h - __host__ __device__ GPUcomplex getAccessor() const { return *this; } + HOST_DEVICE GPUcomplex getAccessor() const { return *this; } template - __host__ __device__ GPUcomplex operator()(const Index) const { + HOST_DEVICE GPUcomplex operator()(const Index) const { return *this; } }; @@ -273,61 +277,61 @@ class GPUcomplex { template <> class GPUcomplex<__half> { public: __half2 c; - __host__ __device__ GPUcomplex(){}; + HOST_DEVICE GPUcomplex(){}; - __host__ __device__ GPUcomplex(const __half &real, const __half &imag) { - c.x = real; - c.y = imag; + HOST_DEVICE GPUcomplex(const __half &real, const __half &imag) { + cREAL = real; + cIMAG = imag; }; - __host__ __device__ GPUcomplex(const __half &real) { - c.x = real; - c.y = __float2half(0.0f); + HOST_DEVICE GPUcomplex(const __half &real) { + cREAL = real; + cIMAG = __float2half(0.0f); }; - __host__ __device__ GPUcomplex(const __half2 &vec_type) { c = vec_type; }; + HOST_DEVICE GPUcomplex(const __half2 &vec_type) { c = vec_type; }; - __host__ __device__ GPUcomplex &operator=(const GPUcomplex &orig) { + HOST_DEVICE GPUcomplex &operator=(const GPUcomplex &orig) { this->c = __float22half2_rn(orig.c); return *this; } - __host__ __device__ GPUcomplex &operator=(const GPUcomplex &orig) { - __half realpart = __double2half(orig.c.x); - __half imagpart = __double2half(orig.c.y); + HOST_DEVICE GPUcomplex &operator=(const GPUcomplex &orig) { + __half realpart = __double2half(orig.cREAL); + __half imagpart = __double2half(orig.cIMAG); this->c = __halves2half2(realpart, imagpart); return *this; } - __host__ __device__ GPUcomplex &operator=(const GPUcomplex<__half> orig) { + HOST_DEVICE GPUcomplex &operator=(const GPUcomplex<__half> orig) { this->c = static_cast<__half2>(orig.c); return *this; } - __host__ __device__ GPUcomplex 
&operator=(const __half &orig) { - this->c.x = orig; - this->c.y = 0.0f; + HOST_DEVICE GPUcomplex &operator=(const __half &orig) { + this->cREAL = orig; + this->cIMAG = 0.0f; return *this; } - __host__ __device__ GPUcomplex &operator+=(const __half &op) { - this->c.x += op; + HOST_DEVICE GPUcomplex &operator+=(const __half &op) { + this->cREAL += op; return *this; } - __host__ __device__ GPUcomplex &operator+=(const GPUcomplex &op) { + HOST_DEVICE GPUcomplex &operator+=(const GPUcomplex &op) { this->c += op.c; return *this; } - __host__ __device__ GPUcomplex &operator-=(const GPUcomplex &op) { + HOST_DEVICE GPUcomplex &operator-=(const GPUcomplex &op) { this->c -= op.c; return *this; } - __host__ __device__ GPUcomplex &operator*=(const GPUcomplex &op) { + HOST_DEVICE GPUcomplex &operator*=(const GPUcomplex &op) { - const __half2 a_re = __half2half2(this->c.x); + const __half2 a_re = __half2half2(this->cREAL); __half2 acc = __hfma2(a_re, op.c, __float2half2_rn(0.0)); - const __half2 a_im = __half2half2(this->c.y); - const __half2 ib = __halves2half2(__hneg(op.c.y), op.c.x); + const __half2 a_im = __half2half2(this->cIMAG); + const __half2 ib = __halves2half2(__hneg(op.cIMAG), op.cREAL); acc = __hfma2(a_im, ib, acc); // __half2 result = __hcmadd( this->c , op.c , __float2half2_rn ( // 0.0 ) ); @@ -335,59 +339,59 @@ template <> class GPUcomplex<__half> { return *this; } - __host__ __device__ GPUcomplex &operator*=(const __half &op) { + HOST_DEVICE GPUcomplex &operator*=(const __half &op) { __half2 temp = __half2half2(op); this->c *= temp; return *this; } - __host__ __device__ GPUcomplex &operator/=(const __half &op) { + HOST_DEVICE GPUcomplex &operator/=(const __half &op) { __half2 temp = __half2half2(op); this->c /= temp; return *this; } - __host__ __device__ friend GPUcomplex operator+(const GPUcomplex left, + HOST_DEVICE friend GPUcomplex operator+(const GPUcomplex left, const GPUcomplex right) { return GPUcomplex(left.c + right.c); } - __host__ __device__ friend 
GPUcomplex operator+(const GPUcomplex &left, + HOST_DEVICE friend GPUcomplex operator+(const GPUcomplex &left, const __half &right) { - return GPUcomplex(left.c.x + right, left.c.y); + return GPUcomplex(left.cREAL + right, left.cIMAG); } - __host__ __device__ friend GPUcomplex operator+(const __half &left, + HOST_DEVICE friend GPUcomplex operator+(const __half &left, const GPUcomplex &right) { - return GPUcomplex(left + right.c.x, right.c.y); + return GPUcomplex(left + right.cREAL, right.cIMAG); } - __host__ __device__ friend GPUcomplex operator-(const GPUcomplex &op) { + HOST_DEVICE friend GPUcomplex operator-(const GPUcomplex &op) { return GPUcomplex(-op.c); } - __host__ __device__ friend GPUcomplex operator-(const GPUcomplex &left, + HOST_DEVICE friend GPUcomplex operator-(const GPUcomplex &left, const GPUcomplex &right) { return GPUcomplex(left.c - right.c); } - __host__ __device__ friend GPUcomplex operator-(const GPUcomplex &left, + HOST_DEVICE friend GPUcomplex operator-(const GPUcomplex &left, const __half &right) { - return GPUcomplex(left.c.x - right, left.c.y); + return GPUcomplex(left.cREAL - right, left.cIMAG); } - __host__ __device__ friend GPUcomplex operator-(const __half &left, + HOST_DEVICE friend GPUcomplex operator-(const __half &left, const GPUcomplex &right) { - return GPUcomplex(left - right.c.x, -right.c.y); + return GPUcomplex(left - right.cREAL, -right.cIMAG); } - __host__ __device__ friend GPUcomplex operator*(const GPUcomplex &left, + HOST_DEVICE friend GPUcomplex operator*(const GPUcomplex &left, const GPUcomplex &right) { - const __half2 a_re = __half2half2(left.c.x); + const __half2 a_re = __half2half2(left.cREAL); __half2 acc = __hfma2(a_re, right.c, __float2half2_rn(0.0)); - const __half2 a_im = __half2half2(left.c.y); - const __half2 ib = __halves2half2(__hneg(right.c.y), right.c.x); + const __half2 a_im = __half2half2(left.cIMAG); + const __half2 ib = __halves2half2(__hneg(right.cIMAG), right.cREAL); acc = __hfma2(a_im, ib, acc); 
// __half2 result = __hcmadd( left.c , right.c , __float2half2_rn // ( 0.0 ) ); @@ -395,58 +399,58 @@ template <> class GPUcomplex<__half> { return GPUcomplex(acc); } - __host__ __device__ friend GPUcomplex operator*(const GPUcomplex &left, + HOST_DEVICE friend GPUcomplex operator*(const GPUcomplex &left, const __half &right) { - return GPUcomplex(left.c.x * right, left.c.y * right); + return GPUcomplex(left.cREAL * right, left.cIMAG * right); } - __host__ __device__ friend GPUcomplex operator*(const __half &left, + HOST_DEVICE friend GPUcomplex operator*(const __half &left, const GPUcomplex &right) { - return GPUcomplex(left * right.c.x, left * right.c.y); + return GPUcomplex(left * right.cREAL, left * right.cIMAG); } - __host__ __device__ friend GPUcomplex + HOST_DEVICE friend GPUcomplex fma(const GPUcomplex &a, const GPUcomplex &b, const GPUcomplex &d) { - const __half2 a_re = __half2half2(a.c.x); + const __half2 a_re = __half2half2(a.cREAL); __half2 acc = __hfma2(a_re, b.c, d.c); - const __half2 a_im = __half2half2(a.c.y); - const __half2 ib = __halves2half2(__hneg(b.c.y), b.c.x); + const __half2 a_im = __half2half2(a.cIMAG); + const __half2 ib = __halves2half2(__hneg(b.cIMAG), b.cREAL); acc = __hfma2(a_im, ib, acc); // return GPUcomplex( __hcmadd( x.c, y.c, d.c ) ); return GPUcomplex(acc); } - __host__ __device__ friend GPUcomplex fma(const __half x, const GPUcomplex &y, + HOST_DEVICE friend GPUcomplex fma(const __half x, const GPUcomplex &y, const GPUcomplex &d) { __half2 xh2 = __half2half2(x); return GPUcomplex(__hfma2(xh2, y.c, d.c)); } - __host__ __device__ void addProduct(const GPUcomplex &a, + HOST_DEVICE void addProduct(const GPUcomplex &a, const GPUcomplex &b) { - const __half2 a_re = __half2half2(a.c.x); + const __half2 a_re = __half2half2(a.cREAL); __half2 acc = __hfma2(a_re, b.c, this->c); - const __half2 a_im = __half2half2(a.c.y); - const __half2 ib = __halves2half2(__hneg(b.c.y), b.c.x); + const __half2 a_im = __half2half2(a.cIMAG); + const 
__half2 ib = __halves2half2(__hneg(b.cIMAG), b.cREAL); acc = __hfma2(a_im, ib, acc); this->c = acc; // this->c = __hcmadd( x.c, y.c, this->c ); return; } - __host__ __device__ void addProduct(const __half &x, const GPUcomplex &y) { + HOST_DEVICE void addProduct(const __half &x, const GPUcomplex &y) { __half2 xh2 = __half2half2(x); this->c = __hfma2(xh2, y.c, this->c); return; } template - __host__ __device__ friend GPUcomplex operator/(const GPUcomplex &left, + HOST_DEVICE friend GPUcomplex operator/(const GPUcomplex &left, const T &right) { - return GPUcomplex(left.c.x / right, left.c.y / right); + return GPUcomplex(left.cREAL / right, left.cIMAG / right); } - __host__ __device__ friend GPUcomplex operator/(const GPUcomplex &left, + HOST_DEVICE friend GPUcomplex operator/(const GPUcomplex &left, const __half &right) { __half2 right2 = __half2half2(right); @@ -454,19 +458,19 @@ template <> class GPUcomplex<__half> { } template - __host__ __device__ friend GPUcomplex operator/(const T &left, + HOST_DEVICE friend GPUcomplex operator/(const T &left, const GPUcomplex &right) { return GPUcomplex( - left * right.c.x / (right.c.x * right.c.x + right.c.y * right.c.y), - -left * right.c.y / (right.c.x * right.c.x + right.c.y * right.c.y)); + left * right.cREAL / (right.cREAL * right.cREAL + right.cIMAG * right.cIMAG), + -left * right.cIMAG / (right.cREAL * right.cREAL + right.cIMAG * right.cIMAG)); } - __host__ __device__ inline static GPUcomplex invalid(); + HOST_DEVICE inline static GPUcomplex invalid(); - __host__ __device__ GPUcomplex getAccessor() const { return *this; } + HOST_DEVICE GPUcomplex getAccessor() const { return *this; } template - __host__ __device__ GPUcomplex operator()(const Index) const { + HOST_DEVICE GPUcomplex operator()(const Index) const { return *this; } }; @@ -474,46 +478,46 @@ template <> class GPUcomplex<__half> { #endif template -__host__ __device__ inline floatT real(const GPUcomplex &op) { - return op.c.x; +HOST_DEVICE inline floatT 
real(const GPUcomplex &op) { + return op.cREAL; } template -__host__ __device__ inline floatT imag(const GPUcomplex &op) { - return op.c.y; +HOST_DEVICE inline floatT imag(const GPUcomplex &op) { + return op.cIMAG; } template -__host__ __device__ inline floatT abs(const GPUcomplex &op) { - floatT square = op.c.x * op.c.x + op.c.y * op.c.y; +HOST_DEVICE inline floatT abs(const GPUcomplex &op) { + floatT square = op.cREAL * op.cREAL + op.cIMAG * op.cIMAG; return sqrtf(square); } template -__host__ __device__ inline floatT abs2(const GPUcomplex &op) { - return op.c.x * op.c.x + op.c.y * op.c.y; +HOST_DEVICE inline floatT abs2(const GPUcomplex &op) { + return op.cREAL * op.cREAL + op.cIMAG * op.cIMAG; } template -__host__ __device__ inline GPUcomplex +HOST_DEVICE inline GPUcomplex conj(const GPUcomplex &op) { - return GPUcomplex(op.c.x, -op.c.y); + return GPUcomplex(op.cREAL, -op.cIMAG); } template -__host__ __device__ inline floatT arg(const GPUcomplex &op) { - return atan2(op.c.y, op.c.x); +HOST_DEVICE inline floatT arg(const GPUcomplex &op) { + return atan2(op.cIMAG, op.cREAL); } template -__host__ __device__ inline GPUcomplex +HOST_DEVICE inline GPUcomplex cupow(const GPUcomplex &base, const floatT &exp) { return GPUcomplex(pow(abs(base), exp) * cos(arg(base) * exp), pow(abs(base), exp) * sin(arg(base) * exp)); } template -__host__ __device__ inline GPUcomplex +HOST_DEVICE inline GPUcomplex cusqrt(const GPUcomplex &base) { return GPUcomplex(sqrt(abs(base)) * cos(arg(base) * 0.5), sqrt(abs(base)) * sin(arg(base) * 0.5)); @@ -523,19 +527,19 @@ template const GPUcomplex GPUcomplex_invalid(nanf(" "), nanf(" ")); template -__host__ inline std::ostream &operator<<(std::ostream &s, +HOST inline std::ostream &operator<<(std::ostream &s, GPUcomplex z) { return s << '(' << real(z) << ',' << imag(z) << ')'; } template -__host__ __device__ inline GPUcomplex +HOST_DEVICE inline GPUcomplex GPUcomplex::invalid() { return GPUcomplex_invalid; } template -__device__ __host__ inline 
bool +HOST_DEVICE inline bool compareGCOMPLEX(GPUcomplex a, GPUcomplex b, floatT tol) { floatT diffRe = abs(real(a) - real(b)); floatT diffIm = abs(imag(a) - imag(b)); diff --git a/src/base/math/generalAccessor.h b/src/base/math/generalAccessor.h index af21198b..7334f1b2 100644 --- a/src/base/math/generalAccessor.h +++ b/src/base/math/generalAccessor.h @@ -30,12 +30,12 @@ class GeneralAccessor { public: template - __host__ __device__ inline object_memory getElementEntry(const size_t idx) const { + HOST_DEVICE inline object_memory getElementEntry(const size_t idx) const { return (_elements[elem][idx]); } template - __host__ __device__ inline void setElementEntry(const size_t idx, object_memory entry) { + HOST_DEVICE inline void setElementEntry(const size_t idx, object_memory entry) { _elements[elem][idx] = static_cast(entry); } @@ -46,7 +46,7 @@ class GeneralAccessor { } /// Constructor for one memory chunk, where all entries are separated by object_count - __host__ __device__ explicit GeneralAccessor(object_memory *elementsBase, size_t object_count) { + HOST_DEVICE explicit GeneralAccessor(object_memory *elementsBase, size_t object_count) { for (size_t i = 0; i < Nentries; i++) { _elements[i] = elementsBase + i * object_count; } diff --git a/src/base/math/grnd.cpp b/src/base/math/grnd.cpp index 757c3c3d..b8a0f815 100644 --- a/src/base/math/grnd.cpp +++ b/src/base/math/grnd.cpp @@ -99,7 +99,7 @@ void grnd_state::make_rng_state(unsigned int seed){ while ( ( aux_z = lrand48() ) <= 128 ) {}; aux_w = lrand48(); - uint4 dummy=make_uint4(aux_x, aux_y, aux_z, aux_w); + uint4 dummy = {static_cast(aux_x), static_cast(aux_y), static_cast(aux_z), static_cast(aux_w)}; //This has to be here, because we need global coordinates! 
int x, y, z, t; @@ -151,7 +151,7 @@ void grnd_state::make_rng_state(unsigned int seed){ template -__host__ __device__ uint4* grnd_state::getElement(gSite site){ +HOST_DEVICE uint4* grnd_state::getElement(gSite site){ return &state[site.isite]; } diff --git a/src/base/math/grnd.h b/src/base/math/grnd.h index adf19c88..2711b58c 100644 --- a/src/base/math/grnd.h +++ b/src/base/math/grnd.h @@ -9,6 +9,7 @@ #include #endif + #include "../../define.h" #include "../gutils.h" #include "../IO/misc.h" @@ -32,20 +33,20 @@ -template __host__ __device__ inline floatT minVal(); -template<> __host__ __device__ inline float minVal(){ return FLT_MIN; } -template<> __host__ __device__ inline double minVal(){ return DBL_MIN; } +template HOST_DEVICE inline floatT minVal(); +template<> HOST_DEVICE inline float minVal(){ return FLT_MIN; } +template<> HOST_DEVICE inline double minVal(){ return DBL_MIN; } /** * internal functions, should only be called from get_rand! */ -__device__ __host__ inline unsigned taus_step( unsigned &z, int S1, int S2, int S3, unsigned M) +HOST_DEVICE inline unsigned taus_step( unsigned &z, int S1, int S2, int S3, unsigned M) { unsigned b=((((z<>S2); return z=((((z &M)< -__device__ __host__ inline floatT get_rand(uint4* state) +HOST_DEVICE inline floatT get_rand(uint4* state) { return 2.3283064365386963e-10*( taus_step( state->x, 13, 19, 12, 4294967294ul)^ taus_step( state->y, 2, 25, 4, 4294967288ul)^ @@ -63,7 +64,7 @@ __device__ __host__ inline floatT get_rand(uint4* state) /// A random variable in (0,1]. 
template -__device__ __host__ inline floatT get_rand_excl0(uint4* state) +HOST_DEVICE inline floatT get_rand_excl0(uint4* state) { floatT xR = get_rand(state); return xR + (1.0-xR)*minVal(); @@ -167,7 +168,7 @@ class grnd_state ~grnd_state(){} void make_rng_state(unsigned int seed); - __host__ __device__ uint4* getElement(gSite site); + HOST_DEVICE uint4* getElement(gSite site); gMemoryPtr& getMemPtr(){ return memory; diff --git a/src/base/math/gsu2.h b/src/base/math/gsu2.h index 9494fa59..9a11f787 100644 --- a/src/base/math/gsu2.h +++ b/src/base/math/gsu2.h @@ -16,40 +16,40 @@ template class GSU2 { public: - __device__ __host__ GSU2() { }; + HOST_DEVICE GSU2() { }; GCOMPLEX(floatT) _e11,_e12; - __device__ __host__ GSU2(GCOMPLEX(floatT) e11, GCOMPLEX(floatT) e12) : _e11(e11), _e12(e12) {} + HOST_DEVICE GSU2(GCOMPLEX(floatT) e11, GCOMPLEX(floatT) e12) : _e11(e11), _e12(e12) {} - __device__ __host__ friend GSU2 operator+(const GSU2 &x,const GSU2 &y) { + HOST_DEVICE friend GSU2 operator+(const GSU2 &x,const GSU2 &y) { return GSU2 (x._e11+y._e11,x._e12+y._e12); } - __device__ __host__ friend GSU2 operator-(const GSU2 &x,const GSU2 &y) { + HOST_DEVICE friend GSU2 operator-(const GSU2 &x,const GSU2 &y) { return GSU2 (x._e11-y._e11,x._e12-y._e12); } - __device__ __host__ friend GSU2 operator*(const GSU2 &x,const GCOMPLEX(floatT) &y) { + HOST_DEVICE friend GSU2 operator*(const GSU2 &x,const GCOMPLEX(floatT) &y) { return GSU2 (x._e11*y,x._e12*y); } - __device__ __host__ friend GSU2 operator*(const GCOMPLEX(floatT) &x,const GSU2 &y) { + HOST_DEVICE friend GSU2 operator*(const GCOMPLEX(floatT) &x,const GSU2 &y) { return GSU2 (x*y._e11,x*y._e12); } - __device__ __host__ friend GSU2 operator*(const GSU2 &x,const floatT &y) { + HOST_DEVICE friend GSU2 operator*(const GSU2 &x,const floatT &y) { return GSU2 (x._e11*y,x._e12*y); } - __device__ __host__ friend GSU2 operator*(const floatT &x,const GSU2 &y) { + HOST_DEVICE friend GSU2 operator*(const floatT &x,const GSU2 &y) { return 
GSU2 (x*y._e11,x*y._e12); } - __device__ __host__ friend GSU2 operator/(const GSU2 &x,const floatT &y) { + HOST_DEVICE friend GSU2 operator/(const GSU2 &x,const floatT &y) { return GSU2 (x._e11/y,x._e12/y); } - __device__ __host__ friend GSU2 operator*(const GSU2 &x,const GSU2 &y) { + HOST_DEVICE friend GSU2 operator*(const GSU2 &x,const GSU2 &y) { GCOMPLEX(floatT) tmp1,tmp2; tmp1=y._e12; tmp2=y._e11; @@ -58,48 +58,48 @@ class GSU2 { return GSU2 (tmp1,tmp2); } - __device__ __host__ GSU2 &operator =(const GSU2 &y) { + HOST_DEVICE GSU2 &operator =(const GSU2 &y) { _e11=y._e11; _e12=y._e12; return *this; } - __device__ __host__ GSU2 &operator+=(const GSU2 &y) { + HOST_DEVICE GSU2 &operator+=(const GSU2 &y) { _e11+=y._e11; _e12+=y._e12; return *this; } - __device__ __host__ GSU2 &operator-=(const GSU2 &y) { + HOST_DEVICE GSU2 &operator-=(const GSU2 &y) { _e11-=y._e11; _e12-=y._e12; return *this; } - __device__ __host__ GSU2 &operator*=(const GSU2 &y) { + HOST_DEVICE GSU2 &operator*=(const GSU2 &y) { *this=*this*y; return *this; } - __device__ __host__ GSU2 &operator*=(const GCOMPLEX(floatT) &y) { + HOST_DEVICE GSU2 &operator*=(const GCOMPLEX(floatT) &y) { _e11*=y; _e12*=y; return *this; } - __device__ __host__ GSU2 &operator*=(const floatT &y) { + HOST_DEVICE GSU2 &operator*=(const floatT &y) { *this=*this*y; return *this; } - __device__ __host__ GSU2 &operator/=(const floatT &y) { + HOST_DEVICE GSU2 &operator/=(const floatT &y) { *this=*this/y; return *this; } - __device__ __host__ floatT tr2() { + HOST_DEVICE floatT tr2() { return( real(_e11) ); } - __device__ __host__ GCOMPLEX(floatT) det() { + HOST_DEVICE GCOMPLEX(floatT) det() { return( real(_e11) ); } - __device__ __host__ void unitarize() { + HOST_DEVICE void unitarize() { floatT res; res = real(_e11)*real(_e11) + imag(_e11)*imag(_e11) + @@ -110,7 +110,7 @@ class GSU2 { _e12=_e12*res; } - __device__ __host__ GSU2 dagger() const { + HOST_DEVICE GSU2 dagger() const { GSU2 tmp; tmp._e11 = conj(_e11); @@ -119,7 
+119,7 @@ class GSU2 { return tmp; } - __device__ __host__ floatT norm2() const { + HOST_DEVICE floatT norm2() const { return (real(_e11)*real(_e11) + real(_e12)*real(_e12) + imag(_e11)*imag(_e11) + imag(_e12)*imag(_e12)); } @@ -133,7 +133,7 @@ class GSU2 { }; template -__device__ __host__ inline GSU2 dagger(const GSU2 &x) { +HOST_DEVICE inline GSU2 dagger(const GSU2 &x) { GSU2 tmp; tmp._e11 = conj(x._e11); tmp._e12 = - x._e12; @@ -141,13 +141,13 @@ __device__ __host__ inline GSU2 dagger(const GSU2 &x) { } template -__device__ __host__ inline floatT norm2(const GSU2 &x) { +HOST_DEVICE inline floatT norm2(const GSU2 &x) { return ( real(x._e11)*real(x._e11) + real(x._e12)*real(x._e12) + imag(x._e11)*imag(x._e11) + imag(x._e12)*imag(x._e12) ); } template -__device__ __host__ inline GSU2 sub12 (const GSU3 &u, const GSU3 &v) { +HOST_DEVICE inline GSU2 sub12 (const GSU3 &u, const GSU3 &v) { GCOMPLEX(floatT) r00,r01,r10,r11; r00 = u.getLink00()*v.getLink00() + u.getLink01()*v.getLink10() + u.getLink02()*v.getLink20(); @@ -159,7 +159,7 @@ __device__ __host__ inline GSU2 sub12 (const GSU3 &u, const GSU } template -__device__ __host__ inline GSU2 sub13(const GSU3 &u, const GSU3 &v) { +HOST_DEVICE inline GSU2 sub13(const GSU3 &u, const GSU3 &v) { GCOMPLEX(floatT) r00,r01,r10,r11; r00 = u.getLink00()*v.getLink00() + u.getLink01()*v.getLink10() + u.getLink02()*v.getLink20(); @@ -171,7 +171,7 @@ __device__ __host__ inline GSU2 sub13(const GSU3 &u, const GSU3< } template -__device__ __host__ inline GSU2 sub23(const GSU3 &u, const GSU3 &v) { +HOST_DEVICE inline GSU2 sub23(const GSU3 &u, const GSU3 &v) { GCOMPLEX(floatT) r00,r01,r10,r11; r00 = u.getLink10()*v.getLink01() + u.getLink11()*v.getLink11() + u.getLink12()*v.getLink21(); @@ -183,7 +183,7 @@ __device__ __host__ inline GSU2 sub23(const GSU3 &u, const GSU3< } template -__device__ __host__ inline GSU3 sub12(const GSU2 &u, +HOST_DEVICE inline GSU3 sub12(const GSU2 &u, const GSU3 &v) { return GSU3 (u._e11 *v.getLink00() + 
u._e12 *v.getLink10(), u._e11 *v.getLink01() + u._e12 *v.getLink11(), @@ -197,7 +197,7 @@ __device__ __host__ inline GSU3 sub12(const GSU2 &u, } template -__device__ __host__ inline GSU3 sub13(const GSU2 &u, const GSU3 &v) { +HOST_DEVICE inline GSU3 sub13(const GSU2 &u, const GSU3 &v) { return GSU3 (u._e11 *v.getLink00() + u._e12 *v.getLink20(), u._e11 *v.getLink01() + u._e12 *v.getLink21(), u._e11 *v.getLink02() + u._e12 *v.getLink22(), @@ -210,7 +210,7 @@ __device__ __host__ inline GSU3 sub13(const GSU2 &u, const GSU3< } template -__device__ __host__ inline GSU3 sub23(const GSU2 &u, const GSU3 &v) { +HOST_DEVICE inline GSU3 sub23(const GSU2 &u, const GSU3 &v) { return GSU3 (v.getLink00(), v.getLink01(), v.getLink02(), @@ -223,7 +223,7 @@ __device__ __host__ inline GSU3 sub23(const GSU2 &u, const GSU3< } template -__device__ __host__ inline floatT realtrace(const GSU3 &x) { +HOST_DEVICE inline floatT realtrace(const GSU3 &x) { return ( real(x.getLink00() + x.getLink11() + x.getLink22()) ); } diff --git a/src/base/math/gsu3.h b/src/base/math/gsu3.h index 4db954e1..2e78ff4c 100644 --- a/src/base/math/gsu3.h +++ b/src/base/math/gsu3.h @@ -8,6 +8,7 @@ #ifndef _gsu3_h_ #define _gsu3_h_ + #include "../../define.h" #include "gcomplex.h" #include "gvect3.h" @@ -23,73 +24,73 @@ template class GSU3; template -__host__ std::ostream &operator<<(std::ostream &, const GSU3 &); +HOST std::ostream &operator<<(std::ostream &, const GSU3 &); template -__host__ std::istream &operator>>(std::istream &, GSU3 &); +HOST std::istream &operator>>(std::istream &, GSU3 &); template -__device__ __host__ inline GSU3 operator+(const GSU3 &, const GSU3 &); +HOST_DEVICE inline GSU3 operator+(const GSU3 &, const GSU3 &); template -__device__ __host__ inline GSU3 operator-(const GSU3 &, const GSU3 &); +HOST_DEVICE inline GSU3 operator-(const GSU3 &, const GSU3 &); template -__device__ __host__ inline GSU3 operator*(const GCOMPLEX(floatT) &, const GSU3 &); +HOST_DEVICE inline GSU3 operator*(const 
GCOMPLEX(floatT) &, const GSU3 &); template -__device__ __host__ inline GSU3 operator*(const GSU3 &, const GCOMPLEX(floatT) &); +HOST_DEVICE inline GSU3 operator*(const GSU3 &, const GCOMPLEX(floatT) &); template -__device__ __host__ inline GSU3 operator*(const floatT &, const GSU3 &); +HOST_DEVICE inline GSU3 operator*(const floatT &, const GSU3 &); template -__device__ __host__ inline GSU3 operator*(const GSU3 &, const floatT &); +HOST_DEVICE inline GSU3 operator*(const GSU3 &, const floatT &); template -__device__ __host__ inline GSU3 operator*(const GSU3 &, const GSU3 &); +HOST_DEVICE inline GSU3 operator*(const GSU3 &, const GSU3 &); template -__device__ __host__ inline GSU3 operator/(const GSU3 &, const floatT &); +HOST_DEVICE inline GSU3 operator/(const GSU3 &, const floatT &); template -__device__ __host__ floatT tr_d(const GSU3 &); +HOST_DEVICE floatT tr_d(const GSU3 &); template -__device__ __host__ floatT tr_i(const GSU3 &); +HOST_DEVICE floatT tr_i(const GSU3 &); template -__device__ __host__ floatT tr_d(const GSU3 &, const GSU3 &); +HOST_DEVICE floatT tr_d(const GSU3 &, const GSU3 &); template -__device__ __host__ GCOMPLEX(floatT) tr_c(const GSU3 &); +HOST_DEVICE GCOMPLEX(floatT) tr_c(const GSU3 &); template -__device__ __host__ GCOMPLEX(floatT) tr_c(const GSU3 &, const GSU3 &); +HOST_DEVICE GCOMPLEX(floatT) tr_c(const GSU3 &, const GSU3 &); template -__device__ __host__ GSU3 dagger(const GSU3 &); +HOST_DEVICE GSU3 dagger(const GSU3 &); template -__device__ __host__ GCOMPLEX(floatT) det(const GSU3 &X); +HOST_DEVICE GCOMPLEX(floatT) det(const GSU3 &X); template -__device__ __host__ floatT realdet(const GSU3 &X); +HOST_DEVICE floatT realdet(const GSU3 &X); template -__device__ __host__ floatT infnorm(const GSU3 &X); +HOST_DEVICE floatT infnorm(const GSU3 &X); template -__device__ __host__ GSU3 su3_exp(GSU3); +HOST_DEVICE GSU3 su3_exp(GSU3); template -__device__ __host__ gVect3 operator*(const GSU3 &, const gVect3 &); +HOST_DEVICE gVect3 operator*(const 
GSU3 &, const gVect3 &); template -__device__ __host__ GSU3 tensor_prod(const gVect3 &, const gVect3 &); +HOST_DEVICE GSU3 tensor_prod(const gVect3 &, const gVect3 &); template -__device__ __host__ inline bool compareGSU3(GSU3 a, GSU3 b, floatT tol=1e-13); +HOST_DEVICE inline bool compareGSU3(GSU3 a, GSU3 b, floatT tol=1e-13); template class GSU3 { @@ -101,9 +102,9 @@ class GSU3 { public: constexpr GSU3(const GSU3&) = default; - __host__ __device__ GSU3() {}; + HOST_DEVICE GSU3() {}; - __host__ __device__ GSU3(const floatT x) { + HOST_DEVICE GSU3(const floatT x) { _e00 = x; _e01 = x; _e02 = x; @@ -115,7 +116,7 @@ class GSU3 { _e22 = x; }; - __host__ __device__ GSU3(GCOMPLEX(floatT) e00, GCOMPLEX(floatT) e01, GCOMPLEX(floatT) e02, + HOST_DEVICE GSU3(GCOMPLEX(floatT) e00, GCOMPLEX(floatT) e01, GCOMPLEX(floatT) e02, GCOMPLEX(floatT) e10, GCOMPLEX(floatT) e11, GCOMPLEX(floatT) e12, GCOMPLEX(floatT) e20, GCOMPLEX(floatT) e21, GCOMPLEX(floatT) e22) : _e00(e00), _e01(e01), _e02(e02), @@ -125,48 +126,48 @@ class GSU3 { #if (!defined __GPUCC__) - __host__ friend std::ostream& operator<< <> (std::ostream&, const GSU3 &); + HOST friend std::ostream& operator<< <> (std::ostream&, const GSU3 &); #endif - __host__ friend std::istream &operator>><>(std::istream &, GSU3 &); + HOST friend std::istream &operator>><>(std::istream &, GSU3 &); // matrix operations - __device__ __host__ friend GSU3 operator+<>(const GSU3 &, const GSU3 &); + HOST_DEVICE friend GSU3 operator+<>(const GSU3 &, const GSU3 &); - __device__ __host__ friend GSU3 operator-<>(const GSU3 &, const GSU3 &); + HOST_DEVICE friend GSU3 operator-<>(const GSU3 &, const GSU3 &); - __device__ __host__ friend GSU3 operator*<>(const GCOMPLEX(floatT) &x, const GSU3 &y); + HOST_DEVICE friend GSU3 operator*<>(const GCOMPLEX(floatT) &x, const GSU3 &y); - __device__ __host__ friend GSU3 operator*<>(const GSU3 &x, const GCOMPLEX(floatT) &y); + HOST_DEVICE friend GSU3 operator*<>(const GSU3 &x, const GCOMPLEX(floatT) &y); - 
__device__ __host__ friend GSU3 operator*<>(const floatT &x, const GSU3 &y); + HOST_DEVICE friend GSU3 operator*<>(const floatT &x, const GSU3 &y); - __device__ __host__ friend GSU3 operator*<>(const GSU3 &x, const floatT &y); + HOST_DEVICE friend GSU3 operator*<>(const GSU3 &x, const floatT &y); - __device__ __host__ friend GSU3 operator*<>(const GSU3 &, const GSU3 &); + HOST_DEVICE friend GSU3 operator*<>(const GSU3 &, const GSU3 &); - __device__ __host__ friend GSU3 operator/<>(const GSU3 &x, const floatT &y); + HOST_DEVICE friend GSU3 operator/<>(const GSU3 &x, const floatT &y); - __device__ __host__ bool operator==(const GSU3 &); + HOST_DEVICE bool operator==(const GSU3 &); - __device__ __host__ GSU3 &operator=(const GSU3 &); + HOST_DEVICE GSU3 &operator=(const GSU3 &); - __device__ __host__ GSU3 &operator+=(const GSU3 &); + HOST_DEVICE GSU3 &operator+=(const GSU3 &); - __device__ __host__ GSU3 &operator-=(const GSU3 &); + HOST_DEVICE GSU3 &operator-=(const GSU3 &); - __device__ __host__ GSU3 &operator*=(const floatT &); + HOST_DEVICE GSU3 &operator*=(const floatT &); - __device__ __host__ GSU3 &operator*=(const GCOMPLEX(floatT) &); + HOST_DEVICE GSU3 &operator*=(const GCOMPLEX(floatT) &); - __device__ __host__ GSU3 &operator*=(const GSU3 &); + HOST_DEVICE GSU3 &operator*=(const GSU3 &); - __device__ __host__ GSU3 &operator/=(const floatT &); + HOST_DEVICE GSU3 &operator/=(const floatT &); // cast operations single <-> double precision template - __device__ __host__ inline operator GSU3() const { + HOST_DEVICE inline operator GSU3() const { return GSU3(GCOMPLEX(T)(_e00.cREAL, _e00.cIMAG), GCOMPLEX(T)(_e01.cREAL, _e01.cIMAG), GCOMPLEX(T)(_e02.cREAL, _e02.cIMAG), GCOMPLEX(T)(_e10.cREAL, _e10.cIMAG), GCOMPLEX(T)(_e11.cREAL, _e11.cIMAG), @@ -176,18 +177,18 @@ class GSU3 { } - __device__ __host__ friend gVect3 + HOST_DEVICE friend gVect3 operator*<>(const GSU3 &, const gVect3 &); // GSU3 * cvect3 multiplication - __device__ __host__ friend GSU3 + HOST_DEVICE friend 
GSU3 tensor_prod<>(const gVect3 &, const gVect3 &); // tensor product of two cvect3 - __device__ __host__ friend bool + HOST_DEVICE friend bool compareGSU3<>(GSU3 a, GSU3 b, floatT tol); - __device__ __host__ void random(uint4 *state); // set links randomly - __device__ __host__ void gauss(uint4 *state); // set links gauss - __device__ __host__ void su3unitarize(); // project to su3 using first two rows of link - __device__ __host__ void su3reconstruct12() // project to su3 using first two rows of link + HOST_DEVICE void random(uint4 *state); // set links randomly + HOST_DEVICE void gauss(uint4 *state); // set links gauss + HOST_DEVICE void su3unitarize(); // project to su3 using first two rows of link + HOST_DEVICE void su3reconstruct12() // project to su3 using first two rows of link { _e20 = GCOMPLEX(floatT)((_e01.cREAL * _e12.cREAL - _e01.cIMAG * _e12.cIMAG - (_e02.cREAL * _e11.cREAL - _e02.cIMAG * _e11.cIMAG)), @@ -205,7 +206,7 @@ class GSU3 { + (_e01.cIMAG * _e10.cREAL + _e01.cREAL * _e10.cIMAG))); } - __device__ __host__ void su3reconstruct12Dagger() // project to su3 using first two rows of link + HOST_DEVICE void su3reconstruct12Dagger() // project to su3 using first two rows of link { _e02 = GCOMPLEX(floatT)((_e10.cREAL * _e21.cREAL - _e10.cIMAG * _e21.cIMAG - (_e20.cREAL * _e11.cREAL - _e20.cIMAG * _e11.cIMAG)), @@ -223,7 +224,7 @@ class GSU3 { + (_e10.cIMAG * _e01.cREAL + _e10.cREAL * _e01.cIMAG))); } - __device__ __host__ void u3reconstruct(const GCOMPLEX(floatT) phase) // project to u3 using first two rows of link + HOST_DEVICE void u3reconstruct(const GCOMPLEX(floatT) phase) // project to u3 using first two rows of link { _e20 = GCOMPLEX(floatT)((_e01.cREAL * _e12.cREAL - _e01.cIMAG * _e12.cIMAG @@ -253,7 +254,7 @@ class GSU3 { _e22 *= phase; } - __device__ __host__ void u3reconstructDagger(const GCOMPLEX(floatT) phase) // project to u3 using first two rows of link + HOST_DEVICE void u3reconstructDagger(const GCOMPLEX(floatT) phase) // project to u3 
using first two rows of link { _e02 = GCOMPLEX(floatT)((_e10.cREAL * _e21.cREAL - _e10.cIMAG * _e21.cIMAG @@ -281,7 +282,7 @@ class GSU3 { _e22 *= phase; } - __device__ __host__ void reconstruct14(const GCOMPLEX(floatT) det) + HOST_DEVICE void reconstruct14(const GCOMPLEX(floatT) det) { floatT amp = pow(abs(det), 1.0/3.0); GCOMPLEX(floatT) phase = det / abs(det); @@ -306,7 +307,7 @@ class GSU3 { _e22 *= phase/amp; } - __device__ __host__ void reconstruct14Dagger(const GCOMPLEX(floatT) det) + HOST_DEVICE void reconstruct14Dagger(const GCOMPLEX(floatT) det) { floatT amp = pow(abs(det), 1.0/3.0); @@ -330,43 +331,43 @@ class GSU3 { _e12 *= phase/amp; _e22 *= phase/amp; } - __device__ __host__ void TA(); // traceless anti-hermitian of link - __device__ __host__ friend floatT tr_d<>(const GSU3 &); // real part of trace of link - __device__ __host__ friend floatT tr_i<>(const GSU3 &); // imaginary part of trace of link - __device__ __host__ friend floatT + HOST_DEVICE void TA(); // traceless anti-hermitian of link + HOST_DEVICE friend floatT tr_d<>(const GSU3 &); // real part of trace of link + HOST_DEVICE friend floatT tr_i<>(const GSU3 &); // imaginary part of trace of link + HOST_DEVICE friend floatT tr_d<>(const GSU3 &, const GSU3 &); // real part of trace of link*link - __device__ __host__ friend GCOMPLEX(floatT) tr_c<>(const GSU3 &); // trace of link - __device__ __host__ friend GCOMPLEX(floatT) tr_c<>(const GSU3 &, + HOST_DEVICE friend GCOMPLEX(floatT) tr_c<>(const GSU3 &); // trace of link + HOST_DEVICE friend GCOMPLEX(floatT) tr_c<>(const GSU3 &, const GSU3 &); // trace of link*link - __device__ __host__ friend GSU3 + HOST_DEVICE friend GSU3 dagger<>(const GSU3 &); // hermitian conjugate - __device__ __host__ friend GSU3 su3_exp<>(GSU3); // exp( link ) - __device__ __host__ friend GCOMPLEX(floatT) det<>(const GSU3 &); - __device__ __host__ friend floatT infnorm<>(const GSU3 &); + HOST_DEVICE friend GSU3 su3_exp<>(GSU3); // exp( link ) + HOST_DEVICE friend 
GCOMPLEX(floatT) det<>(const GSU3 &); + HOST_DEVICE friend floatT infnorm<>(const GSU3 &); // accessors - __host__ __device__ inline GCOMPLEX(floatT) getLink00() const; - __host__ __device__ inline GCOMPLEX(floatT) getLink01() const; - __host__ __device__ inline GCOMPLEX(floatT) getLink02() const; - __host__ __device__ inline GCOMPLEX(floatT) getLink10() const; - __host__ __device__ inline GCOMPLEX(floatT) getLink11() const; - __host__ __device__ inline GCOMPLEX(floatT) getLink12() const; - __host__ __device__ inline GCOMPLEX(floatT) getLink20() const; - __host__ __device__ inline GCOMPLEX(floatT) getLink21() const; - __host__ __device__ inline GCOMPLEX(floatT) getLink22() const; + HOST_DEVICE inline GCOMPLEX(floatT) getLink00() const; + HOST_DEVICE inline GCOMPLEX(floatT) getLink01() const; + HOST_DEVICE inline GCOMPLEX(floatT) getLink02() const; + HOST_DEVICE inline GCOMPLEX(floatT) getLink10() const; + HOST_DEVICE inline GCOMPLEX(floatT) getLink11() const; + HOST_DEVICE inline GCOMPLEX(floatT) getLink12() const; + HOST_DEVICE inline GCOMPLEX(floatT) getLink20() const; + HOST_DEVICE inline GCOMPLEX(floatT) getLink21() const; + HOST_DEVICE inline GCOMPLEX(floatT) getLink22() const; // setters - __host__ __device__ inline void setLink00(GCOMPLEX(floatT) x); - __host__ __device__ inline void setLink01(GCOMPLEX(floatT) x); - __host__ __device__ inline void setLink02(GCOMPLEX(floatT) x); - __host__ __device__ inline void setLink10(GCOMPLEX(floatT) x); - __host__ __device__ inline void setLink11(GCOMPLEX(floatT) x); - __host__ __device__ inline void setLink12(GCOMPLEX(floatT) x); - __host__ __device__ inline void setLink20(GCOMPLEX(floatT) x); - __host__ __device__ inline void setLink21(GCOMPLEX(floatT) x); - __host__ __device__ inline void setLink22(GCOMPLEX(floatT) x); - - __host__ __device__ inline GCOMPLEX(floatT) &operator()(int i, int j) { + HOST_DEVICE inline void setLink00(GCOMPLEX(floatT) x); + HOST_DEVICE inline void setLink01(GCOMPLEX(floatT) x); + 
HOST_DEVICE inline void setLink02(GCOMPLEX(floatT) x); + HOST_DEVICE inline void setLink10(GCOMPLEX(floatT) x); + HOST_DEVICE inline void setLink11(GCOMPLEX(floatT) x); + HOST_DEVICE inline void setLink12(GCOMPLEX(floatT) x); + HOST_DEVICE inline void setLink20(GCOMPLEX(floatT) x); + HOST_DEVICE inline void setLink21(GCOMPLEX(floatT) x); + HOST_DEVICE inline void setLink22(GCOMPLEX(floatT) x); + + HOST_DEVICE inline GCOMPLEX(floatT) &operator()(int i, int j) { switch (i * 3 + j) { case 0: return _e00; @@ -391,7 +392,7 @@ class GSU3 { return _e00; } - __host__ inline const GCOMPLEX(floatT) &operator()(int i, int j) const { + HOST inline const GCOMPLEX(floatT) &operator()(int i, int j) const { switch (i * 3 + j) { case 0: return _e00; @@ -415,119 +416,120 @@ class GSU3 { throw std::runtime_error(stdLogger.fatal("GSU3 access to element (", i, ",", j, ") not possible!")); } - __host__ __device__ GSU3 getAccessor() const { + HOST_DEVICE GSU3 getAccessor() const { return *this; } template - __host__ __device__ GSU3 operator()(const Index) const { + HOST_DEVICE GSU3 operator()(const Index) const { return *this; } }; // accessors template -__host__ __device__ inline GCOMPLEX(floatT) GSU3::getLink00() const { +HOST_DEVICE inline GCOMPLEX(floatT) GSU3::getLink00() const { return _e00; } template -__host__ __device__ inline GCOMPLEX(floatT) GSU3::getLink01() const { +HOST_DEVICE inline GCOMPLEX(floatT) GSU3::getLink01() const { return _e01; } template -__host__ __device__ inline GCOMPLEX(floatT) GSU3::getLink02() const { +HOST_DEVICE inline GCOMPLEX(floatT) GSU3::getLink02() const { return _e02; } template -__host__ __device__ inline GCOMPLEX(floatT) GSU3::getLink10() const { +HOST_DEVICE inline GCOMPLEX(floatT) GSU3::getLink10() const { return _e10; } template -__host__ __device__ inline GCOMPLEX(floatT) GSU3::getLink11() const { +HOST_DEVICE inline GCOMPLEX(floatT) GSU3::getLink11() const { return _e11; } template -__host__ __device__ inline GCOMPLEX(floatT) 
GSU3::getLink12() const { +HOST_DEVICE inline GCOMPLEX(floatT) GSU3::getLink12() const { return _e12; } template -__host__ __device__ inline GCOMPLEX(floatT) GSU3::getLink20() const { +HOST_DEVICE inline GCOMPLEX(floatT) GSU3::getLink20() const { return _e20; } template -__host__ __device__ inline GCOMPLEX(floatT) GSU3::getLink21() const { +HOST_DEVICE inline GCOMPLEX(floatT) GSU3::getLink21() const { return _e21; } template -__host__ __device__ inline GCOMPLEX(floatT) GSU3::getLink22() const { +HOST_DEVICE inline GCOMPLEX(floatT) GSU3::getLink22() const { return _e22; } // setters template -__host__ __device__ inline void GSU3::setLink00(GCOMPLEX(floatT) x) { +HOST_DEVICE inline void GSU3::setLink00(GCOMPLEX(floatT) x) { _e00 = x; } template -__host__ __device__ inline void GSU3::setLink01(GCOMPLEX(floatT) x) { +HOST_DEVICE inline void GSU3::setLink01(GCOMPLEX(floatT) x) { _e01 = x; } template -__host__ __device__ inline void GSU3::setLink02(GCOMPLEX(floatT) x) { +HOST_DEVICE inline void GSU3::setLink02(GCOMPLEX(floatT) x) { _e02 = x; } template -__host__ __device__ inline void GSU3::setLink10(GCOMPLEX(floatT) x) { +HOST_DEVICE inline void GSU3::setLink10(GCOMPLEX(floatT) x) { _e10 = x; } template -__host__ __device__ inline void GSU3::setLink11(GCOMPLEX(floatT) x) { +HOST_DEVICE inline void GSU3::setLink11(GCOMPLEX(floatT) x) { _e11 = x; } template -__host__ __device__ inline void GSU3::setLink12(GCOMPLEX(floatT) x) { +HOST_DEVICE inline void GSU3::setLink12(GCOMPLEX(floatT) x) { _e12 = x; } template -__host__ __device__ inline void GSU3::setLink20(GCOMPLEX(floatT) x) { +HOST_DEVICE inline void GSU3::setLink20(GCOMPLEX(floatT) x) { _e20 = x; } template -__host__ __device__ inline void GSU3::setLink21(GCOMPLEX(floatT) x) { +HOST_DEVICE inline void GSU3::setLink21(GCOMPLEX(floatT) x) { _e21 = x; } template -__host__ __device__ inline void GSU3::setLink22(GCOMPLEX(floatT) x) { +HOST_DEVICE inline void GSU3::setLink22(GCOMPLEX(floatT) x) { _e22 = x; } // some 
constant su3 matrices template -__device__ __host__ inline GSU3 gsu3_one() { +HOST_DEVICE inline GSU3 gsu3_one() { return GSU3(1, 0, 0, 0, 1, 0, 0, 0, 1); } +#if ! defined(USE_HIP_AMD) && ! defined(USE_CPU_ONLY) template <> -__device__ __host__ inline GSU3<__half> gsu3_one() { +HOST_DEVICE inline GSU3<__half> gsu3_one() { GPUcomplex<__half> g_one(__float2half(1.0)); GPUcomplex<__half> g_zero(__float2half(0.0)); @@ -535,66 +537,66 @@ __device__ __host__ inline GSU3<__half> gsu3_one() { g_zero, g_one, g_zero, g_zero, g_zero, g_one); } - +#endif template -__device__ __host__ inline GSU3 gsu3_zero() { +HOST_DEVICE inline GSU3 gsu3_zero() { return GSU3(0, 0, 0, 0, 0, 0, 0, 0, 0); } template -__device__ __host__ inline GSU3 glambda_1() { +HOST_DEVICE inline GSU3 glambda_1() { return GSU3(0, 1, 0, 1, 0, 0, 0, 0, 0); } template -__device__ __host__ inline GSU3 glambda_2() { +HOST_DEVICE inline GSU3 glambda_2() { return GSU3(0 , -GCOMPLEX(floatT)(0, 1), 0, GCOMPLEX(floatT)(0, 1), 0 , 0, 0 , 0 , 0); } template -__device__ __host__ inline GSU3 glambda_3() { +HOST_DEVICE inline GSU3 glambda_3() { return GSU3(1, 0 , 0, 0, -1, 0, 0, 0 , 0); } template -__device__ __host__ inline GSU3 glambda_4() { +HOST_DEVICE inline GSU3 glambda_4() { return GSU3(0, 0, 1, 0, 0, 0, 1, 0, 0); } template -__device__ __host__ inline GSU3 glambda_5() { +HOST_DEVICE inline GSU3 glambda_5() { return GSU3(0 , 0, -GCOMPLEX(floatT)(0, 1), 0 , 0, 0, GCOMPLEX(floatT)(0, 1), 0, 0); } template -__device__ __host__ inline GSU3 glambda_6() { +HOST_DEVICE inline GSU3 glambda_6() { return GSU3(0, 0, 0, 0, 0, 1, 0, 1, 0); } template -__device__ __host__ inline GSU3 glambda_7() { +HOST_DEVICE inline GSU3 glambda_7() { return GSU3(0, 0 , 0, 0, 0 , -GCOMPLEX(floatT)(0, 1), 0, GCOMPLEX(floatT)(0, 1), 0); } template -__device__ __host__ inline GSU3 glambda_8() { +HOST_DEVICE inline GSU3 glambda_8() { return GSU3(1 / sqrt(3), 0 , 0, 0 , 1 / sqrt(3), 0, 0 , 0 , -2 / sqrt(3)); @@ -605,7 +607,7 @@ __device__ __host__ 
inline GSU3 glambda_8() { // matrix operations template -__device__ __host__ GSU3 operator+(const GSU3 &x, const GSU3 &y) { +HOST_DEVICE GSU3 operator+(const GSU3 &x, const GSU3 &y) { return GSU3( x._e00 + y._e00, x._e01 + y._e01, x._e02 + y._e02, x._e10 + y._e10, x._e11 + y._e11, x._e12 + y._e12, @@ -613,7 +615,7 @@ __device__ __host__ GSU3 operator+(const GSU3 &x, const GSU3 -__device__ __host__ GSU3 operator-(const GSU3 &x, const GSU3 &y) { +HOST_DEVICE GSU3 operator-(const GSU3 &x, const GSU3 &y) { return GSU3( x._e00 - y._e00, x._e01 - y._e01, x._e02 - y._e02, x._e10 - y._e10, x._e11 - y._e11, x._e12 - y._e12, @@ -622,7 +624,7 @@ __device__ __host__ GSU3 operator-(const GSU3 &x, const GSU3 -__device__ __host__ GSU3 operator*(const GCOMPLEX(floatT) &x, const GSU3 &y) { +HOST_DEVICE GSU3 operator*(const GCOMPLEX(floatT) &x, const GSU3 &y) { return GSU3( x * y._e00, x * y._e01, x * y._e02, x * y._e10, x * y._e11, x * y._e12, @@ -630,7 +632,7 @@ __device__ __host__ GSU3 operator*(const GCOMPLEX(floatT) &x, const GSU3 } template -__device__ __host__ GSU3 operator*(const GSU3 &x, const GCOMPLEX(floatT) &y) { +HOST_DEVICE GSU3 operator*(const GSU3 &x, const GCOMPLEX(floatT) &y) { return GSU3( x._e00 * y, x._e01 * y, x._e02 * y, x._e10 * y, x._e11 * y, x._e12 * y, @@ -638,7 +640,7 @@ __device__ __host__ GSU3 operator*(const GSU3 &x, const GCOMPLEX } template -__device__ __host__ GSU3 operator*(const floatT &x, const GSU3 &y) { +HOST_DEVICE GSU3 operator*(const floatT &x, const GSU3 &y) { return GSU3( x * y._e00, x * y._e01, x * y._e02, x * y._e10, x * y._e11, x * y._e12, @@ -646,7 +648,7 @@ __device__ __host__ GSU3 operator*(const floatT &x, const GSU3 & } template -__device__ __host__ GSU3 operator*(const GSU3 &x, const floatT &y) { +HOST_DEVICE GSU3 operator*(const GSU3 &x, const floatT &y) { return GSU3( x._e00 * y, x._e01 * y, x._e02 * y, x._e10 * y, x._e11 * y, x._e12 * y, @@ -654,7 +656,7 @@ __device__ __host__ GSU3 operator*(const GSU3 &x, const floatT & } 
template -__device__ __host__ GSU3 operator/(const GSU3 &x, const floatT &y) { +HOST_DEVICE GSU3 operator/(const GSU3 &x, const floatT &y) { return GSU3( x._e00 / y, x._e01 / y, x._e02 / y, x._e10 / y, x._e11 / y, x._e12 / y, @@ -663,7 +665,7 @@ __device__ __host__ GSU3 operator/(const GSU3 &x, const floatT & template -__device__ __host__ GSU3 operator*(const GSU3 &x, const GSU3 &y) { +HOST_DEVICE GSU3 operator*(const GSU3 &x, const GSU3 &y) { GCOMPLEX(floatT) tmp00, tmp01, tmp02, tmp10, tmp11, tmp12, tmp20, tmp21, tmp22; @@ -686,7 +688,7 @@ __device__ __host__ GSU3 operator*(const GSU3 &x, const GSU3 -__device__ __host__ gVect3 operator*(const GSU3 &x, const gVect3 &y) { +HOST_DEVICE gVect3 operator*(const GSU3 &x, const gVect3 &y) { GCOMPLEX(floatT) tmp0, tmp1, tmp2; tmp0 = x._e00 * y._v0 + x._e01 * y._v1 + x._e02 * y._v2; @@ -698,7 +700,7 @@ __device__ __host__ gVect3 operator*(const GSU3 &x, const gVect3 template -__device__ __host__ inline GSU3 &GSU3::operator=(const GSU3 &y) { +HOST_DEVICE inline GSU3 &GSU3::operator=(const GSU3 &y) { _e00 = y._e00; _e01 = y._e01; _e02 = y._e02; @@ -712,7 +714,7 @@ __device__ __host__ inline GSU3 &GSU3::operator=(const GSU3 -__device__ __host__ GSU3 &GSU3::operator+=(const GSU3 &y) { +HOST_DEVICE GSU3 &GSU3::operator+=(const GSU3 &y) { _e00 += y._e00; _e01 += y._e01; _e02 += y._e02; @@ -726,7 +728,7 @@ __device__ __host__ GSU3 &GSU3::operator+=(const GSU3 &y } template -__device__ __host__ GSU3 &GSU3::operator-=(const GSU3 &y) { +HOST_DEVICE GSU3 &GSU3::operator-=(const GSU3 &y) { _e00 -= y._e00; _e01 -= y._e01; _e02 -= y._e02; @@ -740,13 +742,13 @@ __device__ __host__ GSU3 &GSU3::operator-=(const GSU3 &y } template -__device__ __host__ GSU3 &GSU3::operator*=(const floatT &y) { +HOST_DEVICE GSU3 &GSU3::operator*=(const floatT &y) { *this = *this * y; return *this; } template -__device__ __host__ GSU3 &GSU3::operator*=(const GCOMPLEX(floatT) &y) { +HOST_DEVICE GSU3 &GSU3::operator*=(const GCOMPLEX(floatT) &y) { _e00 *= y; _e01 
*= y; _e02 *= y; @@ -760,13 +762,13 @@ __device__ __host__ GSU3 &GSU3::operator*=(const GCOMPLEX(floatT } template -__device__ __host__ GSU3 &GSU3::operator*=(const GSU3 &y) { +HOST_DEVICE GSU3 &GSU3::operator*=(const GSU3 &y) { *this = *this * y; return *this; } template -__device__ __host__ GSU3 &GSU3::operator/=(const floatT &y) { +HOST_DEVICE GSU3 &GSU3::operator/=(const floatT &y) { *this = *this / y; return *this; } @@ -775,7 +777,7 @@ __device__ __host__ GSU3 &GSU3::operator/=(const floatT &y) { /// tolerance for comparison. In that case please look to the compareGSU3 method. In case you are comparing with the /// zero matrix, you should use compareGSU3, as the present method seems not to work for that case. template -__device__ __host__ bool GSU3::operator==(const GSU3 &y) { +HOST_DEVICE bool GSU3::operator==(const GSU3 &y) { if (_e00 == y._e00 && _e01 == y._e01 && _e02 == y._e02 && @@ -790,7 +792,7 @@ __device__ __host__ bool GSU3::operator==(const GSU3 &y) { } template -__host__ inline std::ostream &operator<<(std::ostream &s, const GSU3 &x) { +HOST inline std::ostream &operator<<(std::ostream &s, const GSU3 &x) { return s << "\n" << x.getLink00() << x.getLink01() << x.getLink02() << "\n" << x.getLink10() << x.getLink11() << x.getLink12() << "\n" << x.getLink20() << x.getLink21() << x.getLink22() << "\n"; @@ -798,7 +800,7 @@ return s << "\n" << x.getLink00() << x.getLink01() << x.getLink02() << "\n" /// TODO: This presumably doesn't work template -__host__ inline std::istream &operator>>(std::istream &s, GSU3 &x) { +HOST inline std::istream &operator>>(std::istream &s, GSU3 &x) { return s >> x._e00.cREAL >> x._e00.cIMAG >> x._e01.cREAL >> x._e01.cIMAG >> x._e02.cREAL >> x._e02.cIMAG >> x._e10.cREAL >> x._e10.cIMAG >> x._e11.cREAL >> x._e11.cIMAG >> x._e12.cREAL >> x._e12.cIMAG >> x._e20.cREAL >> x._e20.cIMAG >> x._e21.cREAL >> x._e21.cIMAG >> x._e22.cREAL >> x._e22.cIMAG; @@ -806,7 +808,7 @@ __host__ inline std::istream &operator>>(std::istream &s, GSU3 
&x) { template -__device__ __host__ void GSU3::random(uint4 *state) { +HOST_DEVICE void GSU3::random(uint4 *state) { GCOMPLEX(floatT) rnd; @@ -835,8 +837,10 @@ __device__ __host__ void GSU3::random(uint4 *state) { template -__device__ __host__ void GSU3::gauss(uint4 *state) { +HOST_DEVICE void GSU3::gauss(uint4 *state) { +#ifndef USE_CPU_ONLY if constexpr (!std::is_same::value) { +#endif floatT rand1[4], rand2[4], phi[4], radius[4], temp1[4], temp2[4]; for (int i = 0; i < 4; ++i) { @@ -862,9 +866,12 @@ __device__ __host__ void GSU3::gauss(uint4 *state) { _e20 = GCOMPLEX(floatT)(temp1[3], temp2[0]); _e21 = GCOMPLEX(floatT)(temp2[1], temp2[2]); _e22 = GCOMPLEX(floatT)(-2. / sqrt(3.0) * temp2[3], 0.0); +#ifndef USE_CPU_ONLY } - else { +#endif #ifdef __GPU_ARCH__ + else { + float rand1[4], rand2[4], phi[4], radius[4], temp1[4], temp2[4]; for (int i = 0; i < 4; ++i) { @@ -890,14 +897,16 @@ __device__ __host__ void GSU3::gauss(uint4 *state) { _e20 = GCOMPLEX(__half)(__float2half(temp1[3]), __float2half( temp2[0])); _e21 = GCOMPLEX(__half)(__float2half(temp2[1]), __float2half( temp2[2])); _e22 = GCOMPLEX(__half)(__float2half(-2. 
/ sqrt(3.0) * temp2[3]), __float2half( 0.0)); -#endif } +#endif } // project to su3 using first two rows of link template -__device__ __host__ void GSU3::su3unitarize() { +HOST_DEVICE void GSU3::su3unitarize() { +#ifndef USE_CPU_ONLY if constexpr (!std::is_same::value) { +#endif double quadnorm, invnorm; double Cre, Cim; @@ -956,9 +965,12 @@ __device__ __host__ void GSU3::su3unitarize() { - (_e01.cREAL * _e10.cREAL - _e01.cIMAG * _e10.cIMAG)), (-(_e00.cIMAG * _e11.cREAL + _e00.cREAL * _e11.cIMAG) + (_e01.cIMAG * _e10.cREAL + _e01.cREAL * _e10.cIMAG))); +#ifndef USE_CPU_ONLY } +#endif +#ifdef __GPU_ARCH__ else { - #ifdef __GPU_ARCH__ + double quadnorm, invnorm; double Cre, Cim; @@ -1017,13 +1029,12 @@ __device__ __host__ void GSU3::su3unitarize() { - (_e01.cREAL * _e10.cREAL - _e01.cIMAG * _e10.cIMAG)), (-(_e00.cIMAG * _e11.cREAL + _e00.cREAL * _e11.cIMAG) + (_e01.cIMAG * _e10.cREAL + _e01.cREAL * _e10.cIMAG))); - #endif } - +#endif } template -__device__ __host__ GCOMPLEX(floatT) det(const GSU3 &x) { +HOST_DEVICE GCOMPLEX(floatT) det(const GSU3 &x) { GCOMPLEX(floatT) res; @@ -1035,12 +1046,12 @@ __device__ __host__ GCOMPLEX(floatT) det(const GSU3 &x) { } template -__device__ __host__ floatT realdet(const GSU3 &x) { +HOST_DEVICE floatT realdet(const GSU3 &x) { return det(x).cREAL; } template -__device__ __host__ floatT infnorm(const GSU3 &x) { +HOST_DEVICE floatT infnorm(const GSU3 &x) { floatT res = x._e00.cREAL * x._e00.cREAL; res = x._e00.cIMAG * x._e00.cIMAG + res; res = x._e01.cREAL * x._e01.cREAL + res; @@ -1073,7 +1084,7 @@ __device__ __host__ floatT infnorm(const GSU3 &x) { // traceless anti-hermitian of link template -__device__ __host__ void GSU3::TA() { +HOST_DEVICE void GSU3::TA() { GSU3 tmp; tmp._e00 = GCOMPLEX(floatT)(0, 0.6666666666666666 * _e00.cIMAG - 0.3333333333333333 * (_e11.cIMAG + _e22.cIMAG)); @@ -1091,19 +1102,19 @@ __device__ __host__ void GSU3::TA() { // real part of trace of link template -__device__ __host__ floatT tr_d(const GSU3 &x) { 
+HOST_DEVICE floatT tr_d(const GSU3 &x) { return floatT(x._e00.cREAL + x._e11.cREAL + x._e22.cREAL); } // imaginary part of trace of link template -__device__ __host__ floatT tr_i(const GSU3 &x) { +HOST_DEVICE floatT tr_i(const GSU3 &x) { return floatT(x._e00.cIMAG + x._e11.cIMAG + x._e22.cIMAG); } // real part of trace of link*link template -__device__ __host__ floatT tr_d(const GSU3 &x, const GSU3 &y) { +HOST_DEVICE floatT tr_d(const GSU3 &x, const GSU3 &y) { floatT res; res = (x._e00 * y._e00).cREAL + (x._e01 * y._e10).cREAL + (x._e02 * y._e20).cREAL + (x._e10 * y._e01).cREAL + (x._e11 * y._e11).cREAL + (x._e12 * y._e21).cREAL @@ -1114,13 +1125,13 @@ __device__ __host__ floatT tr_d(const GSU3 &x, const GSU3 &y) { // trace of link template -__device__ __host__ GCOMPLEX(floatT) tr_c(const GSU3 &x) { +HOST_DEVICE GCOMPLEX(floatT) tr_c(const GSU3 &x) { return GCOMPLEX(floatT)(x._e00 + x._e11 + x._e22); } // trace of link*link template -__device__ __host__ GCOMPLEX(floatT) tr_c(const GSU3 &x, const GSU3 &y) { +HOST_DEVICE GCOMPLEX(floatT) tr_c(const GSU3 &x, const GSU3 &y) { GCOMPLEX(floatT) res; @@ -1133,7 +1144,7 @@ __device__ __host__ GCOMPLEX(floatT) tr_c(const GSU3 &x, const GSU3 -__device__ __host__ GSU3 dagger(const GSU3 &x) { +HOST_DEVICE GSU3 dagger(const GSU3 &x) { GSU3 tmp; tmp._e00 = conj(x._e00); @@ -1151,7 +1162,7 @@ __device__ __host__ GSU3 dagger(const GSU3 &x) { // exp( link ) template -__device__ __host__ GSU3 su3_exp(GSU3 u) { +HOST_DEVICE GSU3 su3_exp(GSU3 u) { GSU3 res; res = gsu3_one() @@ -1166,7 +1177,7 @@ __device__ __host__ GSU3 su3_exp(GSU3 u) { // tensor product of two cvect3 template -__device__ __host__ GSU3 tensor_prod(const gVect3 &x, const gVect3 &y) { +HOST_DEVICE GSU3 tensor_prod(const gVect3 &x, const gVect3 &y) { GSU3 res; res._e00 = x._v0 * y._v0; @@ -1183,7 +1194,7 @@ __device__ __host__ GSU3 tensor_prod(const gVect3 &x, const gVec } template -__device__ __host__ inline bool compareGSU3(GSU3 a, GSU3 b, floatT tol) { +HOST_DEVICE 
inline bool compareGSU3(GSU3 a, GSU3 b, floatT tol) { for (int i = 0; i < 3; i++) for (int j = 0; j < 3; j++) { diff --git a/src/base/math/gvect3.h b/src/base/math/gvect3.h index fa987fcc..b5e2e215 100644 --- a/src/base/math/gvect3.h +++ b/src/base/math/gvect3.h @@ -19,29 +19,29 @@ template class GSU3; template class gVect3; template class cVect3; template class gVect3array; -template __host__ std::ostream & operator<<(std::ostream &, const gVect3 &); -template __host__ std::istream & operator>>(std::istream &, gVect3 &); -template __device__ __host__ GCOMPLEX(floatT) operator*(const gVect3 &,const gVect3 &); -template __device__ __host__ GCOMPLEX(floatT) complex_product(const gVect3 &,const gVect3 &); -template __device__ __host__ GCOMPLEX(floatT) complex_product_add(const gVect3 &,const gVect3 &, const GCOMPLEX(floatT) &); - - -template __device__ __host__ gVect3 operator+(const gVect3 &,const gVect3 &); -template __device__ __host__ gVect3 operator-(const gVect3 &,const gVect3 &); -template __device__ __host__ gVect3 operator*(const floatT &,const gVect3 &); -template __device__ __host__ gVect3 operator*(const GCOMPLEX(floatT) &,const gVect3 &); -template __device__ __host__ gVect3 operator*(const gVect3 &,const floatT &); -template __device__ __host__ gVect3 operator*(const gVect3 &,const GCOMPLEX(floatT) &); -template __device__ __host__ gVect3 conj(const gVect3 &); -template __device__ __host__ floatT norm2(const gVect3 &); -template __device__ __host__ GCOMPLEX(floatT) dot_prod(const gVect3 &,const gVect3 &); -template __device__ __host__ floatT re_dot_prod(const gVect3 &,const gVect3 &); -template __device__ __host__ gVect3 operator*(const GSU3 &,const gVect3 &); -template __device__ __host__ GSU3 tensor_prod(const gVect3 &,const gVect3 &); -template __device__ __host__ inline floatT minVal(); +template HOST std::ostream & operator<<(std::ostream &, const gVect3 &); +template HOST std::istream & operator>>(std::istream &, gVect3 &); +template HOST_DEVICE 
GCOMPLEX(floatT) operator*(const gVect3 &,const gVect3 &); +template HOST_DEVICE GCOMPLEX(floatT) complex_product(const gVect3 &,const gVect3 &); +template HOST_DEVICE GCOMPLEX(floatT) complex_product_add(const gVect3 &,const gVect3 &, const GCOMPLEX(floatT) &); + + +template HOST_DEVICE gVect3 operator+(const gVect3 &,const gVect3 &); +template HOST_DEVICE gVect3 operator-(const gVect3 &,const gVect3 &); +template HOST_DEVICE gVect3 operator*(const floatT &,const gVect3 &); +template HOST_DEVICE gVect3 operator*(const GCOMPLEX(floatT) &,const gVect3 &); +template HOST_DEVICE gVect3 operator*(const gVect3 &,const floatT &); +template HOST_DEVICE gVect3 operator*(const gVect3 &,const GCOMPLEX(floatT) &); +template HOST_DEVICE gVect3 conj(const gVect3 &); +template HOST_DEVICE floatT norm2(const gVect3 &); +template HOST_DEVICE GCOMPLEX(floatT) dot_prod(const gVect3 &,const gVect3 &); +template HOST_DEVICE floatT re_dot_prod(const gVect3 &,const gVect3 &); +template HOST_DEVICE gVect3 operator*(const GSU3 &,const gVect3 &); +template HOST_DEVICE GSU3 tensor_prod(const gVect3 &,const gVect3 &); +template HOST_DEVICE inline floatT minVal(); template -__device__ __host__ inline floatT get_rand(uint4* state); +HOST_DEVICE inline floatT get_rand(uint4* state); template class gVect3 @@ -52,45 +52,45 @@ class gVect3 public: - __host__ __device__ gVect3() {}; - __host__ __device__ gVect3(GCOMPLEX(floatT) v0) : _v0(v0), _v1(v0), _v2(v0) {}; - __host__ __device__ gVect3(floatT v0) : _v0(v0), _v1(v0), _v2(v0) {}; - __host__ __device__ gVect3(GCOMPLEX(floatT) v0, GCOMPLEX(floatT) v1, GCOMPLEX(floatT) v2) : _v0(v0), _v1(v1), _v2(v2) {}; + HOST_DEVICE gVect3() {}; + HOST_DEVICE gVect3(GCOMPLEX(floatT) v0) : _v0(v0), _v1(v0), _v2(v0) {}; + HOST_DEVICE gVect3(floatT v0) : _v0(v0), _v1(v0), _v2(v0) {}; + HOST_DEVICE gVect3(GCOMPLEX(floatT) v0, GCOMPLEX(floatT) v1, GCOMPLEX(floatT) v2) : _v0(v0), _v1(v1), _v2(v2) {}; #if (!defined __GPUCC__) - __host__ friend std::ostream &operator << 
<> (std::ostream &, const gVect3 &); + HOST friend std::ostream &operator << <> (std::ostream &, const gVect3 &); #endif - __host__ friend std::istream &operator >> <> (std::istream &, gVect3 &); + HOST friend std::istream &operator >> <> (std::istream &, gVect3 &); friend class gVect3array; friend class gVect3array; // vector operations - __device__ __host__ gVect3 &operator =(const gVect3 &); - __device__ __host__ gVect3 &operator-=(const gVect3 &); - __device__ __host__ gVect3 &operator+=(const gVect3 &); - __device__ __host__ gVect3 &operator*=(const floatT &); - __device__ __host__ gVect3 &operator*=(const GCOMPLEX(floatT) &); - __device__ __host__ friend GCOMPLEX(floatT) operator* <> (const gVect3 &,const gVect3 &); - __device__ __host__ friend GCOMPLEX(floatT) complex_product <> (const gVect3 &,const gVect3 &); - __device__ __host__ friend GCOMPLEX(floatT) complex_product_add <> (const gVect3 &,const gVect3 &, const GCOMPLEX(floatT) & ); - __device__ __host__ friend gVect3 operator+ <> (const gVect3 &,const gVect3 &); - __device__ __host__ friend gVect3 operator- <> (const gVect3 &,const gVect3 &); - __device__ __host__ friend gVect3 operator* <> (const floatT &,const gVect3 &); - __device__ __host__ friend gVect3 operator* <> (const GCOMPLEX(floatT) &,const gVect3 &); - __device__ __host__ friend gVect3 operator* <> (const gVect3 &,const floatT &); - __device__ __host__ friend gVect3 operator* <> (const gVect3 &,const GCOMPLEX(floatT) &); - - __device__ __host__ friend gVect3 conj <> (const gVect3 &); // complex conjugate - __device__ __host__ friend floatT norm2 <> (const gVect3 &); // norm2 - __device__ __host__ friend GCOMPLEX(floatT) dot_prod <> (const gVect3&, const gVect3&); // true complex dot product - __device__ __host__ friend floatT re_dot_prod <> (const gVect3 &,const gVect3 &); // real part of dot product + HOST_DEVICE gVect3 &operator =(const gVect3 &); + HOST_DEVICE gVect3 &operator-=(const gVect3 &); + HOST_DEVICE gVect3 &operator+=(const 
gVect3 &); + HOST_DEVICE gVect3 &operator*=(const floatT &); + HOST_DEVICE gVect3 &operator*=(const GCOMPLEX(floatT) &); + HOST_DEVICE friend GCOMPLEX(floatT) operator* <> (const gVect3 &,const gVect3 &); + HOST_DEVICE friend GCOMPLEX(floatT) complex_product <> (const gVect3 &,const gVect3 &); + HOST_DEVICE friend GCOMPLEX(floatT) complex_product_add <> (const gVect3 &,const gVect3 &, const GCOMPLEX(floatT) & ); + HOST_DEVICE friend gVect3 operator+ <> (const gVect3 &,const gVect3 &); + HOST_DEVICE friend gVect3 operator- <> (const gVect3 &,const gVect3 &); + HOST_DEVICE friend gVect3 operator* <> (const floatT &,const gVect3 &); + HOST_DEVICE friend gVect3 operator* <> (const GCOMPLEX(floatT) &,const gVect3 &); + HOST_DEVICE friend gVect3 operator* <> (const gVect3 &,const floatT &); + HOST_DEVICE friend gVect3 operator* <> (const gVect3 &,const GCOMPLEX(floatT) &); + + HOST_DEVICE friend gVect3 conj <> (const gVect3 &); // complex conjugate + HOST_DEVICE friend floatT norm2 <> (const gVect3 &); // norm2 + HOST_DEVICE friend GCOMPLEX(floatT) dot_prod <> (const gVect3&, const gVect3&); // true complex dot product + HOST_DEVICE friend floatT re_dot_prod <> (const gVect3 &,const gVect3 &); // real part of dot product template - __device__ __host__ void random( rndstateT * const); // set gvect3 randomly - __device__ __host__ void gauss( uint4 * state ) + HOST_DEVICE void random( rndstateT * const); // set gvect3 randomly + HOST_DEVICE void gauss( uint4 * state ) { -#ifndef USE_HIP_AMD +#if ! defined(USE_HIP_AMD) && ! defined(USE_CPU_ONLY) if constexpr (!std::is_same::value) { #endif floatT radius0,radius1,radius2,phi0,phi1,phi2; @@ -112,10 +112,11 @@ class gVect3 _v0 = GCOMPLEX(floatT)(radius0 * cos(phi0), radius0 * sin(phi0)); _v1 = GCOMPLEX(floatT)(radius1 * cos(phi1), radius1 * sin(phi1)); _v2 = GCOMPLEX(floatT)(radius2 * cos(phi2), radius2 * sin(phi2)); -#ifndef USE_HIP_AMD +#if ! defined(USE_HIP_AMD) && ! 
defined(USE_CPU_ONLY) } +#ifdef __GPU_ARCH__ else { - #ifdef __GPU_ARCH__ + float radius0,radius1,radius2,phi0,phi1,phi2; phi0 = 2.0*M_PI * get_rand(state); phi1 = 2.0*M_PI * get_rand(state); @@ -134,65 +135,65 @@ class gVect3 _v0 = GCOMPLEX(__half)(__float2half(radius0 * cos(phi0)), __float2half(radius0 * sin(phi0))); _v1 = GCOMPLEX(__half)(__float2half(radius1 * cos(phi1)), __float2half(radius1 * sin(phi1))); _v2 = GCOMPLEX(__half)(__float2half(radius2 * cos(phi2)), __float2half(radius2 * sin(phi2))); - #endif } +#endif #endif }; // cast operations single <-> double precision template - __device__ __host__ operator gVect3 () const { + HOST_DEVICE operator gVect3 () const { return gVect3( GCOMPLEX(T)(_v0.cREAL, _v0.cIMAG), GCOMPLEX(T)(_v1.cREAL, _v1.cIMAG), GCOMPLEX(T)(_v2.cREAL, _v2.cIMAG) ); } - __device__ __host__ friend gVect3 operator* <> (const GSU3 &,const gVect3 &); // gsu3 * gvect3 multiplication - __device__ __host__ friend GSU3 tensor_prod <> (const gVect3 &,const gVect3 &); // tensor product of two gvect3 + HOST_DEVICE friend gVect3 operator* <> (const GSU3 &,const gVect3 &); // gsu3 * gvect3 multiplication + HOST_DEVICE friend GSU3 tensor_prod <> (const gVect3 &,const gVect3 &); // tensor product of two gvect3 - __device__ __host__ inline GCOMPLEX(floatT) getElement0() const { + HOST_DEVICE inline GCOMPLEX(floatT) getElement0() const { return _v0; }; - __device__ __host__ inline GCOMPLEX(floatT) getElement1()const { + HOST_DEVICE inline GCOMPLEX(floatT) getElement1()const { return _v1; }; - __device__ __host__ inline GCOMPLEX(floatT) getElement2() const { + HOST_DEVICE inline GCOMPLEX(floatT) getElement2() const { return _v2; }; - __device__ __host__ inline void addtoElement0(const GCOMPLEX(floatT) a){ + HOST_DEVICE inline void addtoElement0(const GCOMPLEX(floatT) a){ _v0 += a; } - __device__ __host__ inline void addtoElement1(const GCOMPLEX(floatT) a){ + HOST_DEVICE inline void addtoElement1(const GCOMPLEX(floatT) a){ _v1 += a; } - __device__ 
__host__ inline void addtoElement2(const GCOMPLEX(floatT) a){ + HOST_DEVICE inline void addtoElement2(const GCOMPLEX(floatT) a){ _v2 += a; } - __device__ __host__ inline void setElement0(const GCOMPLEX(floatT)& a){ + HOST_DEVICE inline void setElement0(const GCOMPLEX(floatT)& a){ _v0 = a; } - __device__ __host__ inline void setElement1(const GCOMPLEX(floatT)& a){ + HOST_DEVICE inline void setElement1(const GCOMPLEX(floatT)& a){ _v1 = a; } - __device__ __host__ inline void setElement2(const GCOMPLEX(floatT)& a){ + HOST_DEVICE inline void setElement2(const GCOMPLEX(floatT)& a){ _v2 = a; } - __device__ __host__ inline void subfromElement0(const GCOMPLEX(floatT) a){ + HOST_DEVICE inline void subfromElement0(const GCOMPLEX(floatT) a){ _v0 -= a; } - __device__ __host__ inline void subfromElement1(const GCOMPLEX(floatT) a){ + HOST_DEVICE inline void subfromElement1(const GCOMPLEX(floatT) a){ _v1 -= a; } - __device__ __host__ inline void subfromElement2(const GCOMPLEX(floatT) a){ + HOST_DEVICE inline void subfromElement2(const GCOMPLEX(floatT) a){ _v2 -= a; } - __device__ __host__ inline GCOMPLEX(floatT)& operator() (int i) { + HOST_DEVICE inline GCOMPLEX(floatT)& operator() (int i) { switch (i) { case 0: return _v0; @@ -206,12 +207,12 @@ class gVect3 } - __host__ __device__ gVect3 getAccessor() const{ + HOST_DEVICE gVect3 getAccessor() const{ return *this; } template - __host__ __device__ gVect3 operator()(const Index) const { + HOST_DEVICE gVect3 operator()(const Index) const { return *this; } }; @@ -219,7 +220,7 @@ class gVect3 // gvect3 = (1,0,0) or (0,1,0) or (0,0,1) template -__device__ __host__ inline gVect3 gvect3_unity(const int& i) +HOST_DEVICE inline gVect3 gvect3_unity(const int& i) { switch ( i ) { @@ -231,7 +232,7 @@ __device__ __host__ inline gVect3 gvect3_unity(const int& i) // default value return gVect3 (1, 0, 0); } -#ifndef USE_HIP_AMD +#if ! defined(USE_HIP_AMD) && ! 
defined(USE_CPU_ONLY) template <> __device__ inline gVect3<__half> gvect3_unity(const int& i) { @@ -250,7 +251,7 @@ return gVect3<__half> (__float2half(1), __float2half(0), __float2half(0)); #endif // cvect3 = (1,1,1) template -__device__ __host__ inline gVect3 gvect3_one() +HOST_DEVICE inline gVect3 gvect3_one() { return gVect3 (1, 1, 1); } @@ -259,11 +260,11 @@ __device__ __host__ inline gVect3 gvect3_one() // cvect3 = (0,0,0) template -__device__ __host__ inline gVect3 gvect3_zero() +HOST_DEVICE inline gVect3 gvect3_zero() { return gVect3 (0, 0, 0); } -#ifndef USE_HIP_AMD +#if ! defined(USE_HIP_AMD) && ! defined(USE_CPU_ONLY) template<> __device__ inline gVect3<__half> gvect3_zero() { @@ -271,7 +272,7 @@ __device__ inline gVect3<__half> gvect3_zero() } #endif template -__device__ __host__ gVect3 &gVect3::operator=(const gVect3 &y) +HOST_DEVICE gVect3 &gVect3::operator=(const gVect3 &y) { _v0 = y._v0; _v1 = y._v1; @@ -280,7 +281,7 @@ __device__ __host__ gVect3 &gVect3::operator=(const gVect3 -__device__ __host__ gVect3 &gVect3::operator-=(const gVect3 &y) +HOST_DEVICE gVect3 &gVect3::operator-=(const gVect3 &y) { _v0-= y._v0; _v1-= y._v1; @@ -289,7 +290,7 @@ __device__ __host__ gVect3 &gVect3::operator-=(const gVect3 -__device__ __host__ gVect3 &gVect3::operator+=(const gVect3 &y) +HOST_DEVICE gVect3 &gVect3::operator+=(const gVect3 &y) { _v0+= y._v0; _v1+= y._v1; @@ -298,7 +299,7 @@ __device__ __host__ gVect3 &gVect3::operator+=(const gVect3 -__device__ __host__ gVect3 &gVect3::operator*=(const floatT &y) +HOST_DEVICE gVect3 &gVect3::operator*=(const floatT &y) { _v0*= y; _v1*= y; @@ -307,7 +308,7 @@ __device__ __host__ gVect3 &gVect3::operator*=(const floatT &y) } template -__device__ __host__ gVect3 &gVect3::operator*=(const GCOMPLEX(floatT) &y) +HOST_DEVICE gVect3 &gVect3::operator*=(const GCOMPLEX(floatT) &y) { _v0*= y; _v1*= y; @@ -316,7 +317,7 @@ __device__ __host__ gVect3 &gVect3::operator*=(const GCOMPLEX(fl } template -__device__ __host__ 
GCOMPLEX(floatT) operator*(const gVect3 &x,const gVect3 &y) +HOST_DEVICE GCOMPLEX(floatT) operator*(const gVect3 &x,const gVect3 &y) { GCOMPLEX(floatT) res = conj(x._v0) * y._v0; res += conj(x._v1) * y._v1; @@ -325,7 +326,7 @@ __device__ __host__ GCOMPLEX(floatT) operator*(const gVect3 &x,const gVe } template -__device__ __host__ GCOMPLEX(floatT) complex_product(const gVect3 &x,const gVect3 &y) +HOST_DEVICE GCOMPLEX(floatT) complex_product(const gVect3 &x,const gVect3 &y) { // GCOMPLEX(floatT) res = x._v0 *(y._v0); // res += x._v1 * (y._v1); @@ -339,7 +340,7 @@ __device__ __host__ GCOMPLEX(floatT) complex_product(const gVect3 &x,con } template -__device__ __host__ GCOMPLEX(floatT) complex_product_add(const gVect3 &x,const gVect3 &y, const GCOMPLEX(floatT) &d) +HOST_DEVICE GCOMPLEX(floatT) complex_product_add(const gVect3 &x,const gVect3 &y, const GCOMPLEX(floatT) &d) { //GCOMPLEX(floatT) res = x._v0 *(y._v0); //res += x._v1 * (y._v1); @@ -351,7 +352,7 @@ __device__ __host__ GCOMPLEX(floatT) complex_product_add(const gVect3 &x } template -__device__ __host__ gVect3 operator+(const gVect3 &x,const gVect3 &y) +HOST_DEVICE gVect3 operator+(const gVect3 &x,const gVect3 &y) { gVect3 z; z._v0 = x._v0 + y._v0; @@ -361,7 +362,7 @@ __device__ __host__ gVect3 operator+(const gVect3 &x,const gVect } template -__device__ __host__ gVect3 operator-(const gVect3 &x,const gVect3 &y) +HOST_DEVICE gVect3 operator-(const gVect3 &x,const gVect3 &y) { gVect3 z; z._v0 = x._v0 - y._v0; @@ -371,7 +372,7 @@ __device__ __host__ gVect3 operator-(const gVect3 &x,const gVect } template -__device__ __host__ gVect3 operator*(const GCOMPLEX(floatT)& x,const gVect3& y) +HOST_DEVICE gVect3 operator*(const GCOMPLEX(floatT)& x,const gVect3& y) { gVect3 z; z._v0 = x * y._v0; @@ -381,7 +382,7 @@ __device__ __host__ gVect3 operator*(const GCOMPLEX(floatT)& x,const gVe } template -__device__ __host__ gVect3 operator*(const floatT & x,const gVect3& y) +HOST_DEVICE gVect3 operator*(const floatT & x,const 
gVect3& y) { gVect3 z; z._v0 = x * y._v0; @@ -391,7 +392,7 @@ __device__ __host__ gVect3 operator*(const floatT & x,const gVect3 -__device__ __host__ gVect3 operator*(const gVect3& x,const GCOMPLEX(floatT)& y) +HOST_DEVICE gVect3 operator*(const gVect3& x,const GCOMPLEX(floatT)& y) { gVect3 z; z._v0 = x._v0 * y; @@ -401,7 +402,7 @@ __device__ __host__ gVect3 operator*(const gVect3& x,const GCOMP } template -__device__ __host__ gVect3 operator*(const gVect3& x,const floatT & y) +HOST_DEVICE gVect3 operator*(const gVect3& x,const floatT & y) { gVect3 z; z._v0 = x._v0 * y; @@ -412,7 +413,7 @@ __device__ __host__ gVect3 operator*(const gVect3& x,const float //! complex dot product x*y = sum_i(v_i conj(w_i)) template -__device__ __host__ GCOMPLEX(floatT) dot_prod(const gVect3 &x,const gVect3 &y) +HOST_DEVICE GCOMPLEX(floatT) dot_prod(const gVect3 &x,const gVect3 &y) { floatT real = x._v0.cREAL*y._v0.cREAL + x._v0.cIMAG*y._v0.cIMAG; real += x._v1.cREAL*y._v1.cREAL + x._v1.cIMAG*y._v1.cIMAG; @@ -425,7 +426,7 @@ __device__ __host__ GCOMPLEX(floatT) dot_prod(const gVect3 &x,const gVec //! 
real part of dot product (no conjugation for y) template -__device__ __host__ floatT re_dot_prod(const gVect3 &x,const gVect3 &y) +HOST_DEVICE floatT re_dot_prod(const gVect3 &x,const gVect3 &y) { floatT res = x._v0.cREAL*y._v0.cREAL + x._v0.cIMAG*y._v0.cIMAG; res += x._v1.cREAL*y._v1.cREAL + x._v1.cIMAG*y._v1.cIMAG; @@ -435,7 +436,7 @@ __device__ __host__ floatT re_dot_prod(const gVect3 &x,const gVect3 -__device__ __host__ floatT norm2(const gVect3 &x) +HOST_DEVICE floatT norm2(const gVect3 &x) { floatT res = x._v0.cREAL*x._v0.cREAL + x._v0.cIMAG*x._v0.cIMAG; res += x._v1.cREAL*x._v1.cREAL + x._v1.cIMAG*x._v1.cIMAG; @@ -445,7 +446,7 @@ __device__ __host__ floatT norm2(const gVect3 &x) // complex conjugate template -__device__ __host__ gVect3 conj(const gVect3 &x) +HOST_DEVICE gVect3 conj(const gVect3 &x) { gVect3 z; z._v0 = conj(x._v0); @@ -458,13 +459,13 @@ __device__ __host__ gVect3 conj(const gVect3 &x) #ifdef __GPUCC__ template -__host__ std::ostream &operator << (std::ostream &s, const gVect3 &x) +HOST std::ostream &operator << (std::ostream &s, const gVect3 &x) { return s << x.getElement0() << x.getElement1() << x.getElement2(); } template -__host__ std::istream &operator >> (std::istream &s, gVect3 &x) +HOST std::istream &operator >> (std::istream &s, gVect3 &x) { return s >> x._v0.cREAL >> x._v0.cIMAG >> x._v1.cREAL >> x._v1.cIMAG >> x._v2.cREAL >> x._v2.cIMAG; } diff --git a/src/base/math/gvect3array.h b/src/base/math/gvect3array.h index 8f655b39..0a619eff 100644 --- a/src/base/math/gvect3array.h +++ b/src/base/math/gvect3array.h @@ -17,13 +17,13 @@ struct gVect3arrayAcc : public GeneralAccessor { : GeneralAccessor(elements) { } - __host__ __device__ explicit gVect3arrayAcc(GCOMPLEX(floatT) *elementsBase, size_t object_count) + HOST_DEVICE explicit gVect3arrayAcc(GCOMPLEX(floatT) *elementsBase, size_t object_count) : GeneralAccessor(elementsBase, object_count) { } explicit gVect3arrayAcc() : GeneralAccessor() { } template - __host__ __device__ inline 
gVect3 getElement(const gSite &site) const { + HOST_DEVICE inline gVect3 getElement(const gSite &site) const { return static_cast>(gVect3( this->template getElementEntry<0>(site.isiteFull), this->template getElementEntry<1>(site.isiteFull), @@ -31,14 +31,14 @@ struct gVect3arrayAcc : public GeneralAccessor { } template - __host__ __device__ inline void setElement(const gSite &site, const gVect3 &vec) { + HOST_DEVICE inline void setElement(const gSite &site, const gVect3 &vec) { this->template setElementEntry<0>(site.isiteFull, vec.getElement0()); this->template setElementEntry<1>(site.isiteFull, vec.getElement1()); this->template setElementEntry<2>(site.isiteFull, vec.getElement2()); } template - __host__ __device__ inline gVect3 getElement(const gSiteStack &site) const { + HOST_DEVICE inline gVect3 getElement(const gSiteStack &site) const { gVect3 ret( this->template getElementEntry<0>(site.isiteStackFull), this->template getElementEntry<1>(site.isiteStackFull), @@ -47,13 +47,13 @@ struct gVect3arrayAcc : public GeneralAccessor { } template - __host__ __device__ inline void setElement(const gSiteStack &site, const gVect3 &vec) { + HOST_DEVICE inline void setElement(const gSiteStack &site, const gVect3 &vec) { this->template setElementEntry<0>(site.isiteStackFull, vec.getElement0()); this->template setElementEntry<1>(site.isiteStackFull, vec.getElement1()); this->template setElementEntry<2>(site.isiteStackFull, vec.getElement2()); } - __host__ __device__ inline void setEntriesComm(gVect3arrayAcc &src_acc, + HOST_DEVICE inline void setEntriesComm(gVect3arrayAcc &src_acc, size_t setIndex, size_t getIndex) { this->template setElementEntry<0>(setIndex, src_acc.template getElementEntry<0>(getIndex)); this->template setElementEntry<1>(setIndex, src_acc.template getElementEntry<1>(getIndex)); @@ -61,20 +61,20 @@ struct gVect3arrayAcc : public GeneralAccessor { } template - __host__ __device__ inline size_t getIndexComm(size_t isiteFull, size_t stack) const { + HOST_DEVICE 
inline size_t getIndexComm(size_t isiteFull, size_t stack) const { gSiteStack site = GIndexer::getSiteStackFull(isiteFull, stack); return site.isiteStackFull; } template - __host__ __device__ inline gVect3 getElementComm(size_t isiteFull, size_t stack) const { + HOST_DEVICE inline gVect3 getElementComm(size_t isiteFull, size_t stack) const { gSiteStack site = GIndexer::getSiteStackFull(isiteFull, stack); return getElement(site); } template - __host__ __device__ inline void setElementComm(size_t isiteFull, size_t stack, const gVect3 &vec) { + HOST_DEVICE inline void setElementComm(size_t isiteFull, size_t stack, const gVect3 &vec) { gSiteStack site; site.isiteFull = isiteFull; site.isiteStackFull = isiteFull; @@ -82,12 +82,12 @@ struct gVect3arrayAcc : public GeneralAccessor { } template - __host__ __device__ inline gVect3 operator()(const gSite &site) const { + HOST_DEVICE inline gVect3 operator()(const gSite &site) const { return this->getElement(site); }; template - __host__ __device__ inline gVect3 operator()(const gSiteStack &site) const { + HOST_DEVICE inline gVect3 operator()(const gSiteStack &site) const { return this->getElement(site); }; }; diff --git a/src/base/math/matrix4x4.h b/src/base/math/matrix4x4.h index 0bd54213..47fbf5e6 100644 --- a/src/base/math/matrix4x4.h +++ b/src/base/math/matrix4x4.h @@ -18,14 +18,14 @@ struct Matrix4x4Sym { constexpr Matrix4x4Sym(const Matrix4x4Sym&) = default; - __device__ __host__ Matrix4x4Sym(floatT a) : elems{a, a, a, a, a, a, a, a, a, a} {} - __device__ __host__ Matrix4x4Sym() : elems{0, 0, 0, 0, 0, 0, 0, 0, 0, 0} {} + HOST_DEVICE Matrix4x4Sym(floatT a) : elems{a, a, a, a, a, a, a, a, a, a} {} + HOST_DEVICE Matrix4x4Sym() : elems{0, 0, 0, 0, 0, 0, 0, 0, 0, 0} {} - __device__ __host__ Matrix4x4Sym(floatT e00, floatT e11, floatT e22, floatT e33, floatT e01, floatT e02, floatT e03, floatT e12, + HOST_DEVICE Matrix4x4Sym(floatT e00, floatT e11, floatT e22, floatT e33, floatT e01, floatT e02, floatT e03, floatT e12, 
floatT e13, floatT e23) : elems{e00, e11, e22, e33, e01, e02, e03, e12, e13, e23} {} - __device__ __host__ inline floatT operator()(int mu, int nu) { + HOST_DEVICE inline floatT operator()(int mu, int nu) { if (mu == 0 && nu == 0) return elems[entry::e00]; if (mu == 1 && nu == 1) return elems[entry::e11]; if (mu == 2 && nu == 2) return elems[entry::e22]; @@ -47,7 +47,7 @@ struct Matrix4x4Sym { return 0; } - __device__ __host__ inline void operator()(int mu, int nu, floatT value) { + HOST_DEVICE inline void operator()(int mu, int nu, floatT value) { if (mu == 0 && nu == 0) elems[entry::e00] = value; if (mu == 1 && nu == 1) elems[entry::e11] = value; if (mu == 2 && nu == 2) elems[entry::e22] = value; @@ -68,21 +68,21 @@ struct Matrix4x4Sym { if (nu == 2 && mu == 3) elems[entry::e23] = value; } - /* __device__ __host__ inline Matrix4x4Sym& operator=(const floatT &y) + /* HOST_DEVICE inline Matrix4x4Sym& operator=(const floatT &y) { for(int i = 0; i<10;i++){ elems[i]=y; } return *this; }*/ - __device__ __host__ inline Matrix4x4Sym& operator=(const Matrix4x4Sym &y) + HOST_DEVICE inline Matrix4x4Sym& operator=(const Matrix4x4Sym &y) { for(int i = 0; i<10;i++){ elems[i]=y.elems[i]; } return *this; } - __device__ __host__ inline Matrix4x4Sym& operator+=(const Matrix4x4Sym &y) + HOST_DEVICE inline Matrix4x4Sym& operator+=(const Matrix4x4Sym &y) { for(int i = 0; i<10;i++){ @@ -91,7 +91,7 @@ struct Matrix4x4Sym { return *this; } - __device__ __host__ inline Matrix4x4Sym& operator/=(floatT y) + HOST_DEVICE inline Matrix4x4Sym& operator/=(floatT y) { for(int i = 0; i<10;i++){ elems[i]/=y; @@ -99,7 +99,7 @@ struct Matrix4x4Sym { return *this; } - __device__ __host__ inline Matrix4x4Sym& operator*=(floatT y) + HOST_DEVICE inline Matrix4x4Sym& operator*=(floatT y) { for(int i = 0; i<10;i++){ elems[i]*=y; @@ -111,7 +111,7 @@ struct Matrix4x4Sym { template -__device__ __host__ inline Matrix4x4Sym operator+(const Matrix4x4Sym &x, const Matrix4x4Sym &y) { +HOST_DEVICE inline 
Matrix4x4Sym operator+(const Matrix4x4Sym &x, const Matrix4x4Sym &y) { return Matrix4x4Sym(x.elems[0]+ y.elems[0], x.elems[1]+y.elems[1], x.elems[2]+y.elems[2], x.elems[3]+y.elems[3], x.elems[4]+y.elems[4], x.elems[5]+y.elems[5], x.elems[6]+y.elems[6], x.elems[7]+y.elems[7], x.elems[8]+y.elems[8], x.elems[9]+y.elems[9]); diff --git a/src/base/math/operators.h b/src/base/math/operators.h index d7e32328..11d96f73 100644 --- a/src/base/math/operators.h +++ b/src/base/math/operators.h @@ -7,6 +7,7 @@ #include "gvect3.h" #include "../indexer/BulkIndexer.h" + /*! Using the syntax below stuff like this is possible: * Spinor a, b, c, d * Spinor a = b*c + d; @@ -22,13 +23,16 @@ enum Operation { template class custom_is_scalar{ public: static constexpr bool value = std::is_scalar::value; }; +#ifndef USE_CPU_ONLY template <> class custom_is_scalar<__half> {public: static constexpr bool value = true; }; - +#endif template class custom_is_class{ public: static constexpr bool value = std::is_class::value; }; +#ifndef USE_CPU_ONLY template <> class custom_is_class<__half>{ public: static constexpr bool value = false; }; +#endif template - inline __host__ __device__ auto operator()(const Index i) const + inline HOST_DEVICE auto operator()(const Index i) const { - //inline __host__ __device__ auto operator()(const Index i) const { + //inline HOST_DEVICE auto operator()(const Index i) const { auto rhs = _rhs(i); auto lhs = _lhs(i); return lhs + rhs; @@ -138,7 +142,7 @@ struct GeneralOperator - inline __host__ __device__ auto operator()(const Index i) const + inline HOST_DEVICE auto operator()(const Index i) const { auto lhs = _lhs(i); return lhs + _rhs; @@ -170,7 +174,7 @@ struct GeneralOperator - inline __host__ __device__ auto operator()(const Index i) const + inline HOST_DEVICE auto operator()(const Index i) const { auto rhs = _rhs(i); return _lhs + rhs; @@ -202,7 +206,7 @@ struct GeneralOperator - inline __host__ __device__ auto operator()(const Index i) const + inline 
HOST_DEVICE auto operator()(const Index i) const { auto rhs = _rhs(i); auto lhs = _lhs(i); @@ -235,7 +239,7 @@ struct GeneralOperator - inline __host__ __device__ auto operator()(const Index i) const + inline HOST_DEVICE auto operator()(const Index i) const { auto lhs = _lhs(i); return lhs - _rhs; @@ -267,7 +271,7 @@ struct GeneralOperator - inline __host__ __device__ auto operator()(const Index i) const + inline HOST_DEVICE auto operator()(const Index i) const { auto rhs = _rhs(i); return _lhs - rhs; @@ -299,7 +303,7 @@ struct GeneralOperator - inline __host__ __device__ auto operator()(const Index i) const + inline HOST_DEVICE auto operator()(const Index i) const { auto rhs = _rhs(i); auto lhs = _lhs(i); @@ -332,7 +336,7 @@ struct GeneralOperator - inline __host__ __device__ auto operator()(const Index i) const + inline HOST_DEVICE auto operator()(const Index i) const { auto lhs = _lhs(i); return lhs * _rhs; @@ -366,7 +370,7 @@ struct GeneralOperator - inline __host__ __device__ auto operator()(const Index i) const + inline HOST_DEVICE auto operator()(const Index i) const { auto rhs = _rhs(i); return _lhs * rhs; @@ -398,7 +402,7 @@ struct GeneralOperator - inline __host__ __device__ auto operator()(const Index i) const + inline HOST_DEVICE auto operator()(const Index i) const { auto rhs = _rhs(i); auto lhs = _lhs(i); @@ -430,7 +434,7 @@ struct GeneralOperator - inline __host__ __device__ auto operator()(const Index i) const + inline HOST_DEVICE auto operator()(const Index i) const { auto lhs = _lhs(i); return lhs / _rhs; @@ -462,7 +466,7 @@ struct GeneralOperator - inline __host__ __device__ auto operator()(const Index i) const + inline HOST_DEVICE auto operator()(const Index i) const { auto rhs = _rhs(i); return _lhs / rhs; @@ -516,13 +520,15 @@ auto operator/(const GeneralOperator &lhs, template using isAllowedType = typename std::enable_if::value + #ifndef USE_CPU_ONLY || std::is_same::value + || std::is_same >::value + || std::is_same >::value + #endif || 
std::is_same::value || std::is_same::value - || std::is_same >::value || std::is_same >::value || std::is_same >::value - || std::is_same >::value || std::is_same >::value || std::is_same >::value, inputType>::type; diff --git a/src/base/math/simpleArray.h b/src/base/math/simpleArray.h index 38939763..05555fef 100644 --- a/src/base/math/simpleArray.h +++ b/src/base/math/simpleArray.h @@ -12,18 +12,18 @@ class SimpleArray{ public: - __device__ __host__ T& operator[](size_t i){ + HOST_DEVICE T& operator[](size_t i){ return values[i]; } - __device__ __host__ inline auto operator()(gSiteStack site) const + HOST_DEVICE inline auto operator()(gSiteStack site) const { return values[site.stack]; } - __device__ __host__ inline auto operator()(gSiteMu site) const + HOST_DEVICE inline auto operator()(gSiteMu site) const { return values[site.mu]; } @@ -31,32 +31,32 @@ class SimpleArray{ SimpleArray() = default; - __device__ __host__ SimpleArray(const T& init){ + HOST_DEVICE SimpleArray(const T& init){ for(size_t i = 0; i < N; i++){ values[i] = init; } } template - __device__ __host__ SimpleArray(SimpleArray s_array) { + HOST_DEVICE SimpleArray(SimpleArray s_array) { for(size_t i = 0; i < N; i++) { values[i] = s_array[i]; } } - __device__ __host__ void operator=(SimpleArray vec){ + HOST_DEVICE void operator=(SimpleArray vec){ for(size_t i = 0; i < N; i++){ values[i] = vec[i]; } } - __host__ void operator=(std::vector vec){ + HOST void operator=(std::vector vec){ for(size_t i = 0; i < N; i++){ values[i] = vec.at(i); } } - __device__ __host__ SimpleArray getAccessor() const { + HOST_DEVICE SimpleArray getAccessor() const { return *this; } @@ -65,7 +65,7 @@ class SimpleArray{ template -__host__ __device__ SimpleArray operator/(SimpleArray a, SimpleArray b){ +HOST_DEVICE SimpleArray operator/(SimpleArray a, SimpleArray b){ SimpleArray ret; for(size_t i = 0; i < N; i++){ ret[i] = a[i] / b[i]; @@ -74,7 +74,7 @@ __host__ __device__ SimpleArray operator/(SimpleArray a, S } template 
-__host__ __device__ SimpleArray operator*(SimpleArray a, SimpleArray b){ +HOST_DEVICE SimpleArray operator*(SimpleArray a, SimpleArray b){ SimpleArray ret; for(size_t i = 0; i < N; i++){ ret[i] = a * b; @@ -83,7 +83,7 @@ __host__ __device__ SimpleArray operator*(SimpleArray a, S } template -__host__ __device__ SimpleArray operator-(SimpleArray a, SimpleArray b){ +HOST_DEVICE SimpleArray operator-(SimpleArray a, SimpleArray b){ SimpleArray ret; for(size_t i = 0; i < N; i++){ ret[i] = a[i] - b[i]; @@ -92,7 +92,7 @@ __host__ __device__ SimpleArray operator-(SimpleArray a, S } template -__host__ __device__ SimpleArray operator+(SimpleArray a, SimpleArray b){ +HOST_DEVICE SimpleArray operator+(SimpleArray a, SimpleArray b){ SimpleArray ret; for(size_t i = 0; i < N; i++){ ret[i] = a[i] + b[i]; @@ -101,7 +101,7 @@ __host__ __device__ SimpleArray operator+(SimpleArray a, S } template -__host__ __device__ SimpleArray operator*(floatT a, SimpleArray b){ +HOST_DEVICE SimpleArray operator*(floatT a, SimpleArray b){ SimpleArray ret; for(size_t i = 0; i < N; i++){ ret[i] = a * b[i]; @@ -110,7 +110,7 @@ __host__ __device__ SimpleArray operator*(floatT a, SimpleArray - __host__ __device__ SimpleArray operator/(SimpleArray a, floatT b){ + HOST_DEVICE SimpleArray operator/(SimpleArray a, floatT b){ SimpleArray ret; for (size_t i = 0; i < N; i++) { ret[i] = a[i]/b; @@ -119,7 +119,7 @@ template } template -__host__ __device__ floatT max(SimpleArray a){ +HOST_DEVICE floatT max(SimpleArray a){ floatT ret = a[0]; for(size_t i = 1; i < N; i++){ if (a[i] > ret){ @@ -130,7 +130,7 @@ __host__ __device__ floatT max(SimpleArray a){ } template -__host__ __device__ SimpleArray real(SimpleArray c){ +HOST_DEVICE SimpleArray real(SimpleArray c){ SimpleArray ret; for(size_t i = 0; i < N; i++){ ret[i] = c[i].cREAL; diff --git a/src/base/math/su3Exp.h b/src/base/math/su3Exp.h index 651ea77d..87a08990 100644 --- a/src/base/math/su3Exp.h +++ b/src/base/math/su3Exp.h @@ -34,7 +34,7 @@ N = 25 by default 
due to an estimated error of order 10^(-26) */ template -__device__ __host__ constexpr unsigned int countOfApproxInverseFak(){ +HOST_DEVICE constexpr unsigned int countOfApproxInverseFak(){ unsigned int N = 1; floatT nominator = 1.0; @@ -48,7 +48,7 @@ __device__ __host__ constexpr unsigned int countOfApproxInverseFak(){ // Algorithm from https://luscher.web.cern.ch/luscher/notes/su3fcts.pdf template -__device__ __host__ inline void SU3Exp(const GSU3 inGSU3, GSU3 &outGSU3){ +HOST_DEVICE inline void SU3Exp(const GSU3 inGSU3, GSU3 &outGSU3){ constexpr unsigned int N = countOfApproxInverseFak(); floatT c_i[N+1]; diff --git a/src/base/memoryManagement.cpp b/src/base/memoryManagement.cpp index e96264df..d76b0f8c 100644 --- a/src/base/memoryManagement.cpp +++ b/src/base/memoryManagement.cpp @@ -101,6 +101,7 @@ void MemoryManagement::gMemory::copyFrom(const gMemoryPtr const size_t offsetSelf , const size_t offsetSrc) { adjustSize(sizeInBytes); +#ifndef USE_CPU_ONLY gpuError_t gpuErr; if (onDevice) { /// Device to device @@ -132,6 +133,9 @@ void MemoryManagement::gMemory::copyFrom(const gMemoryPtr GpuError("memoryManagement.h: Failed to copy data (HostToHost)", gpuErr); } } +#else + memcpy(static_cast(_rawPointer) + offsetSelf, src->getPointer(offsetSrc), sizeInBytes); +#endif } diff --git a/src/base/memoryManagement.h b/src/base/memoryManagement.h index ff44d9e8..f5fa72f0 100644 --- a/src/base/memoryManagement.h +++ b/src/base/memoryManagement.h @@ -74,7 +74,9 @@ class MemoryManagement { /// a pointer as an argument; hence the strange syntax. void alloc(size_t size) { if (size > 0) { +#ifndef USE_CPU_ONLY if (onDevice) { + gpuError_t gpuErr = gpuMalloc((void **) &_rawPointer, size); if (gpuErr != gpuSuccess) { MemoryManagement::memorySummary(false, false, true, true,false); @@ -82,22 +84,24 @@ class MemoryManagement { err_msg << "_rawPointer: Failed to allocate (additional) " << size/1000000000. 
<< " GB of memory on device"; GpuError(err_msg.str().c_str(), gpuErr); } - } else { -#ifndef CPUONLY - gpuError_t gpuErr = gpuMallocHost((void **) &_rawPointer, size); - if (gpuErr != gpuSuccess) { + } else +#endif + { +#if defined(CPUONLY) || defined(USE_CPU_ONLY) + _rawPointer = std::malloc(size); + if (_rawPointer == nullptr){ MemoryManagement::memorySummary(true,true,false, false,false); std::stringstream err_msg; err_msg << "_rawPointer: Failed to allocate (additional) " << size/1000000000. << " GB of memory on host"; - GpuError(err_msg.str().c_str(), gpuErr); + throw std::runtime_error(stdLogger.fatal(err_msg.str())); } #else - _rawPointer = std::malloc(size); - if (_rawPointer == nullptr){ + gpuError_t gpuErr = gpuMallocHost((void **) &_rawPointer, size); + if (gpuErr != gpuSuccess) { MemoryManagement::memorySummary(true,true,false, false,false); std::stringstream err_msg; err_msg << "_rawPointer: Failed to allocate (additional) " << size/1000000000. << " GB of memory on host"; - throw std::runtime_error(stdLogger.fatal(err_msg.str())); + GpuError(err_msg.str().c_str(), gpuErr); } #endif } @@ -112,6 +116,7 @@ class MemoryManagement { /// pointer as the argument. No idea why, but I guess it doesn't matter. 
void free() { if (_current_size > 0) { +#ifndef USE_CPU_ONLY if (onDevice) { if (P2Ppaired) { _cIpc.destroy(); @@ -125,9 +130,12 @@ class MemoryManagement { "GB at " << static_cast(_rawPointer) << " on device"; GpuError(err_msg.str().c_str(), gpuErr); } - - } else { -#ifndef CPUONLY + } else +#endif + { +#if defined(CPUONLY) || defined(USE_CPU_ONLY) + std::free(_rawPointer); +#else gpuError_t gpuErr = gpuFreeHost(_rawPointer); if (gpuErr != gpuSuccess) { MemoryManagement::memorySummary(true,true,false, false, false); @@ -136,8 +144,6 @@ class MemoryManagement { "GB at " << static_cast(_rawPointer) << " on host"; GpuError(err_msg.str().c_str(), gpuErr); } -#else - std::free(_rawPointer); #endif } rootLogger.alloc("> Free mem at " , static_cast(_rawPointer) , " (" , (onDevice ? "Device" : "Host ") , "): " , @@ -182,8 +188,12 @@ class MemoryManagement { { if (_current_size > 0) { if (onDevice) { +#ifndef USE_CPU_ONLY gpuError_t gpuErr = gpuMemset(_rawPointer, value, _current_size); if (gpuErr != gpuSuccess) GpuError("_rawPointer: Failed to set memory on device", gpuErr); +#else + throw std::runtime_error(stdLogger.fatal("CPU backend does not support onDevice=True.")); +#endif } else { std::memset(_rawPointer, value, _current_size); } @@ -215,7 +225,9 @@ class MemoryManagement { } free(); alloc(sizeBytes); +#ifndef USE_CPU_ONLY if (P2Ppaired && onDevice) _cIpc.updateAllHandles(getPointer()); +#endif } return resize; } @@ -229,7 +241,9 @@ class MemoryManagement { } free(); alloc(sizeBytes); +#ifndef USE_CPU_ONLY if (P2Ppaired && onDevice) _cIpc.updateAllHandles(getPointer()); +#endif } return resize; @@ -269,9 +283,14 @@ class MemoryManagement { /// THIS IS ALL CUDA IPC/P2P related stuff private: +#ifndef USE_CPU_ONLY gpuIPC _cIpc; +#else + void* _cIpc = nullptr; +#endif bool P2Ppaired; +#ifndef USE_CPU_ONLY public: void initP2P(MPI_Comm comm, int myRank) { if (onDevice && _current_size != 0) { @@ -306,6 +325,7 @@ class MemoryManagement { } return nullptr; } +#endif }; 
@@ -553,7 +573,7 @@ class MemoryAccessor { ~MemoryAccessor() = default; template - __device__ __host__ inline void setValue(const size_t isite, const floatT value) { + HOST_DEVICE inline void setValue(const size_t isite, const floatT value) { /// reinterpret_cast is a compile time directive telling the compiler to treat _Array as a floatT*. This is /// needed because _Array is treated as void* right now. auto *arr = reinterpret_cast(Array); @@ -561,7 +581,7 @@ class MemoryAccessor { } template - __device__ __host__ inline void getValue(const size_t isite, floatT &value) { + HOST_DEVICE inline void getValue(const size_t isite, floatT &value) { auto *arr = reinterpret_cast(Array); value = arr[isite]; } diff --git a/src/base/runFunctors.h b/src/base/runFunctors.h index 44e63eb7..313a5118 100644 --- a/src/base/runFunctors.h +++ b/src/base/runFunctors.h @@ -18,6 +18,12 @@ #define DEFAULT_NBLOCKS_LOOP 128 #define DEFAULT_NBLOCKS_CONST 256 +#ifndef USE_CPU_ONLY +#define GPUSTREAM_T_ gpuStream_t +#else +#define GPUSTREAM_T_ void* +#endif + template class RunFunctors { public: @@ -25,30 +31,30 @@ class RunFunctors { template void iterateWithConstObject(Object ob, CalcReadInd calcReadInd, CalcWriteInd calcWriteInd, - const size_t elems_x, const size_t elems_y = 1, const size_t elems_z = 1,gpuStream_t stream = (gpuStream_t)nullptr); + const size_t elems_x, const size_t elems_y = 1, const size_t elems_z = 1,GPUSTREAM_T_ stream = (GPUSTREAM_T_)nullptr); template void iterateFunctor(Functor op, CalcReadInd calcReadInd, CalcWriteInd calcWriteInd, - const size_t elems_x, const size_t elems_y = 1, const size_t elems_z = 1,gpuStream_t stream = (gpuStream_t)nullptr); + const size_t elems_x, const size_t elems_y = 1, const size_t elems_z = 1,GPUSTREAM_T_ stream = (GPUSTREAM_T_)nullptr); template void iterateFunctorLoop(Functor op, CalcReadInd calcReadInd, CalcWriteInd calcWriteInd, - const size_t elems_x, const size_t elems_y = 1, const size_t elems_z = 1,gpuStream_t stream = 
(gpuStream_t)nullptr, size_t Nmax=Nloops); + const size_t elems_x, const size_t elems_y = 1, const size_t elems_z = 1,GPUSTREAM_T_ stream = (GPUSTREAM_T_)nullptr, size_t Nmax=Nloops); }; #ifdef __GPUCC__ #ifdef USE_HIP_AMD -__host__ __device__ static inline HIP_vector_type GetUint3(dim3 Idx){ +HOST_DEVICE static inline HIP_vector_type GetUint3(dim3 Idx){ return HIP_vector_type(Idx.x, Idx.y, Idx.z); }; #elif defined USE_HIP_NVIDIA -__host__ __device__ static dim3 GetUint3(dim3 Idx){ +HOST_DEVICE static dim3 GetUint3(dim3 Idx){ return Idx; @@ -121,7 +127,6 @@ __global__ void performCopyConstObject(Accessor res, Object ob, CalcReadInd calc } #endif - template template void RunFunctors::iterateFunctor(Functor op, CalcReadInd calcReadInd, @@ -129,7 +134,7 @@ void RunFunctors::iterateFunctor(Functor op, CalcReadInd cal const size_t elems_x, const size_t elems_y, const size_t elems_z, - __attribute__((unused)) gpuStream_t stream){ + __attribute__((unused)) GPUSTREAM_T_ stream){ dim3 blockDim; @@ -141,6 +146,7 @@ void RunFunctors::iterateFunctor(Functor op, CalcReadInd cal const dim3 gridDim = static_cast (ceilf(static_cast (elems_x) / static_cast (blockDim.x))); +#ifndef USE_CPU_ONLY if (onDevice) { #ifdef __GPUCC__ @@ -155,7 +161,9 @@ void RunFunctors::iterateFunctor(Functor op, CalcReadInd cal #else static_assert(!onDevice, "Functor construction not available for device code outside .cpp files"); #endif - } else { + } else +#endif + { auto resAcc = getAccessor(); uint3 blockIdx; blockIdx.y = 0; @@ -193,7 +201,7 @@ void RunFunctors::iterateFunctor(Functor op, CalcReadInd cal template template void RunFunctors::iterateFunctorLoop(Functor op, - CalcReadInd calcReadInd, CalcWriteInd calcWriteInd, const size_t elems_x, const size_t elems_y, const size_t elems_z,__attribute__((unused)) gpuStream_t stream, size_t Nmax) { + CalcReadInd calcReadInd, CalcWriteInd calcWriteInd, const size_t elems_x, const size_t elems_y, const size_t elems_z,__attribute__((unused)) GPUSTREAM_T_ 
stream, size_t Nmax) { dim3 blockDim; @@ -211,6 +219,7 @@ void RunFunctors::iterateFunctorLoop(Functor op, const dim3 gridDim = static_cast (ceilf(static_cast (elems_x) / static_cast (blockDim.x))); +#ifndef USE_CPU_ONLY if (onDevice) { #ifdef __GPUCC__ @@ -226,7 +235,9 @@ void RunFunctors::iterateFunctorLoop(Functor op, #else static_assert(!onDevice, "Functor construction not available for device code outside .cpp files"); #endif - } else { + } else +#endif + { auto resAcc = getAccessor(); uint3 blockIdx; blockIdx.y = 0; @@ -272,7 +283,7 @@ void RunFunctors::iterateWithConstObject(Object ob, CalcRead const size_t elems_x, const size_t elems_y, const size_t elems_z, - __attribute__((unused)) gpuStream_t stream ){ + __attribute__((unused)) GPUSTREAM_T_ stream ){ dim3 blockDim; @@ -283,7 +294,7 @@ void RunFunctors::iterateWithConstObject(Object ob, CalcRead //Grid only in x direction! const dim3 gridDim = static_cast (ceilf(static_cast (elems_x) / static_cast (blockDim.x))); - +#ifndef USE_CPU_ONLY if (onDevice) { #ifdef __GPUCC__ @@ -300,7 +311,9 @@ void RunFunctors::iterateWithConstObject(Object ob, CalcRead #else static_assert(!onDevice, "Functor construction not available for device code outside .cpp files"); #endif - } else { + } else +#endif + { auto resAcc = getAccessor(); uint3 blockIdx; blockIdx.y = 0; @@ -360,7 +373,7 @@ template (ceilf(static_cast (elems_x) / static_cast (blockDim.x))); +#ifndef USE_CPU_ONLY if (onDevice) { #ifdef __GPUCC__ @@ -388,7 +402,9 @@ void iterateFunctorNoReturn(Functor op, CalcReadInd calcReadInd, const size_t el #else static_assert(!onDevice, "Functor construction not available for device code outside .cpp files"); #endif - } else { + } else +#endif + { uint3 blockIdx; blockIdx.y = 0; blockIdx.z = 0; @@ -444,7 +460,7 @@ template struct CalcGSiteFull { template - inline __host__ __device__ gSite operator()(Args... args) { + inline HOST_DEVICE gSite operator()(Args... 
args) { gSite site = GIndexer::getSiteFull(args...); return site; } @@ -522,7 +538,7 @@ struct CalcGSiteFull { template struct CalcGSite { template - inline __host__ __device__ gSite operator()(Args... args) { + inline HOST_DEVICE gSite operator()(Args... args) { gSite site = GIndexer::getSite(args...); return site; } @@ -531,7 +547,7 @@ struct CalcGSite { template struct CalcGSiteSpatialFull { template - inline __host__ __device__ gSite operator()(Args... args) { + inline HOST_DEVICE gSite operator()(Args... args) { gSite site = GIndexer::getSiteSpatialFull(args...); return site; } @@ -540,7 +556,7 @@ struct CalcGSiteSpatialFull { template struct CalcGSiteSpatial { template - inline __host__ __device__ gSite operator()(Args... args) { + inline HOST_DEVICE gSite operator()(Args... args) { gSite site = GIndexer::getSiteSpatial(args...); return site; } @@ -549,7 +565,7 @@ struct CalcGSiteSpatial { template struct CalcGSiteStack { template - inline __host__ __device__ gSiteStack operator()(Args... args) { + inline HOST_DEVICE gSiteStack operator()(Args... args) { gSiteStack site = GIndexer::getSiteStack(args...); return site; } @@ -558,7 +574,7 @@ struct CalcGSiteStack { template struct CalcGSiteStackFull { template - inline __host__ __device__ gSiteStack operator()(Args... args) { + inline HOST_DEVICE gSiteStack operator()(Args... args) { gSiteStack site = GIndexer::getSiteStackFull(args...); return site; } @@ -567,7 +583,7 @@ struct CalcGSiteStackFull { template struct CalcGSiteAllMu { template - inline __host__ __device__ gSiteMu operator()(Args... args) { + inline HOST_DEVICE gSiteMu operator()(Args... args) { gSiteMu site = GIndexer::getSiteMu(args...); return site; } @@ -576,7 +592,7 @@ struct CalcGSiteAllMu { template struct CalcGSiteAtMu { template - inline __host__ __device__ gSiteMu operator()(Args... args) { + inline HOST_DEVICE gSiteMu operator()(Args... 
args) { gSiteMu site = GIndexer::getSiteMu(args..., mu); return site; } @@ -586,7 +602,7 @@ template struct CalcGSiteAllMuFull { template - inline __host__ __device__ gSiteMu operator()(Args... args) { + inline HOST_DEVICE gSiteMu operator()(Args... args) { gSiteMu site = GIndexer::getSiteMuFull(args...); return site; } @@ -595,7 +611,7 @@ struct CalcGSiteAllMuFull { template struct CalcGSiteAtMuFull { template - inline __host__ __device__ gSiteMu operator()(Args... args) { + inline HOST_DEVICE gSiteMu operator()(Args... args) { gSiteMu site = GIndexer::getSiteMuFull(args..., mu); return site; } @@ -604,7 +620,7 @@ struct CalcGSiteAtMuFull { template struct CalcGSiteAtStackFull { template - inline __host__ __device__ gSiteStack operator()(Args... args) { + inline HOST_DEVICE gSiteStack operator()(Args... args) { gSiteStack site = GIndexer::getSiteStackFull(args..., stack); return site; } @@ -613,7 +629,7 @@ struct CalcGSiteAtStackFull { template struct CalcGSiteAtStack { template - inline __host__ __device__ gSiteStack operator()(Args... args) { + inline HOST_DEVICE gSiteStack operator()(Args... args) { gSiteStack site = GIndexer::getSiteStack(args..., stack); return site; } @@ -623,7 +639,7 @@ struct CalcGSiteAtStack { template struct CalcOddGSiteAtStack { template - inline __host__ __device__ gSiteStack operator()(Args... args) { + inline HOST_DEVICE gSiteStack operator()(Args... args) { gSiteStack site = GIndexer::getSiteStackOdd(args..., stack); return site; } @@ -633,7 +649,7 @@ struct CalcOddGSiteAtStack { template struct CalcGSiteLoopMu { template - inline __host__ __device__ gSite operator()(Args... args) { + inline HOST_DEVICE gSite operator()(Args... args) { gSite site = GIndexer::getSite(args...); return site; } @@ -642,7 +658,7 @@ struct CalcGSiteLoopMu { template struct CalcGSiteLoopStack { template - inline __host__ __device__ gSite operator()(Args... args) { + inline HOST_DEVICE gSite operator()(Args... 
args) { gSite site = GIndexer::getSite(args...); return site; } @@ -651,7 +667,7 @@ struct CalcGSiteLoopStack { template struct CalcGSiteLoopMuFull { template - inline __host__ __device__ gSite operator()(Args... args) { + inline HOST_DEVICE gSite operator()(Args... args) { gSite site = GIndexer::getSiteFull(args...); return site; } @@ -660,7 +676,7 @@ struct CalcGSiteLoopMuFull { template struct CalcGSiteLoopStackFull { template - inline __host__ __device__ gSite operator()(Args... args) { + inline HOST_DEVICE gSite operator()(Args... args) { gSite site = GIndexer::getSiteFull(args...); return site; } @@ -669,39 +685,39 @@ struct CalcGSiteLoopStackFull { //! use this if you don't actually need to read in from any site, for example when initializing point sources template struct ReadDummy { - template inline __host__ __device__ gSite operator()(__attribute__((unused)) Args... args) { + template inline HOST_DEVICE gSite operator()(__attribute__((unused)) Args... args) { return GIndexer::getSite(99999,99999,99999,99999); } }; template struct WriteAtLoopMu { - inline __host__ __device__ gSiteMu operator()(const gSite &site, size_t mu) { + inline HOST_DEVICE gSiteMu operator()(const gSite &site, size_t mu) { return GIndexer::getSiteMu(site, mu); } }; template struct WriteAtLoopStack { - inline __host__ __device__ gSiteStack operator()(const gSite &site, size_t stack) { + inline HOST_DEVICE gSiteStack operator()(const gSite &site, size_t stack) { return GIndexer::getSiteStack(site, stack); } }; struct WriteAtRead { - inline __host__ __device__ gSite operator()(const gSite &site) { + inline HOST_DEVICE gSite operator()(const gSite &site) { return site; } }; struct WriteAtReadStack { - inline __host__ __device__ gSiteStack operator()(const gSiteStack &site) { + inline HOST_DEVICE gSiteStack operator()(const gSiteStack &site) { return site; } }; struct WriteAtReadMu { - inline __host__ __device__ gSiteMu operator()(const gSiteMu &siteMu) { + inline HOST_DEVICE gSiteMu 
operator()(const gSiteMu &siteMu) { return siteMu; } }; @@ -712,7 +728,7 @@ template struct WriteAtFixedSite { const gSite _fixed_site; explicit WriteAtFixedSite(const gSite mysite) : _fixed_site(mysite) {} - inline __host__ __device__ gSite operator()(__attribute__((unused)) const gSite dummy) { + inline HOST_DEVICE gSite operator()(__attribute__((unused)) const gSite dummy) { return _fixed_site; } }; diff --git a/src/define.h b/src/define.h index 109dcc5e..6f8fa743 100644 --- a/src/define.h +++ b/src/define.h @@ -24,12 +24,44 @@ #define RETa_IF_SCALAR { return a; } #endif - - #define STRINGIFY(x) #x #define TOSTRING(x) STRINGIFY(x) #define AT __FILE__ ":" TOSTRING(__LINE__) +#ifdef USE_CPU_ONLY +#define HOST_DEVICE +#define HOST +#define DEVICE +#define CONSTANT const +#define GPUERROR_T void* + +struct float2 { + float x; float y; +}; +struct double2 { + double x; double y; +}; +struct uint3 { + unsigned int x, y, z; +}; +struct uint4 { + unsigned int x, y, z, w; + +}; +struct dim3 { + unsigned int x, y, z; + constexpr dim3(unsigned int vx = 1, unsigned int vy = 1, unsigned int vz = 1) : x(vx), y(vy), z(vz) {} + constexpr dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {} + constexpr operator uint3(void) const { return uint3{x, y, z}; } +}; +#else +#define HOST_DEVICE __host__ __device__ +#define HOST __host__ +#define DEVICE __device__ +#define CONSTANT __constant__ +#define GPUERROR_T gpuError_t +#endif + enum Layout { All, Even, Odd diff --git a/src/gauge/GaugeAction.cpp b/src/gauge/GaugeAction.cpp index 20984b98..c9b03221 100644 --- a/src/gauge/GaugeAction.cpp +++ b/src/gauge/GaugeAction.cpp @@ -65,7 +65,7 @@ MemoryAccessor GaugeAction::getRectangleField( template -__host__ floatT GaugeAction::barePlaquette() { +HOST floatT GaugeAction::barePlaquette() { if (recompute) { _redBase.template iterateOverBulk( @@ -77,7 +77,7 @@ __host__ floatT GaugeAction::barePlaquette() { } template -__host__ floatT GaugeAction::barePlaquetteSS() { +HOST floatT 
GaugeAction::barePlaquetteSS() { // if (recompute) { _redBase.template iterateOverBulk( @@ -90,7 +90,7 @@ __host__ floatT GaugeAction::barePlaquetteSS() template -__host__ floatT GaugeAction::bareUtauMinusUsigma() { +HOST floatT GaugeAction::bareUtauMinusUsigma() { if (recompute) { _redBase.template iterateOverBulk( @@ -103,7 +103,7 @@ __host__ floatT GaugeAction::bareUtauMinusUsig template -__host__ floatT GaugeAction::bareClover() { +HOST floatT GaugeAction::bareClover() { if (recompute) { _redBase.template iterateOverBulk( @@ -115,7 +115,7 @@ __host__ floatT GaugeAction::bareClover() { } template -__host__ floatT GaugeAction::bareRectangle() { +HOST floatT GaugeAction::bareRectangle() { if (recompute) { _redBase.template iterateOverBulk( diff --git a/src/gauge/GaugeAction.h b/src/gauge/GaugeAction.h index 11f8d73a..ba5dab23 100644 --- a/src/gauge/GaugeAction.h +++ b/src/gauge/GaugeAction.h @@ -24,12 +24,12 @@ class GaugeAction { template MemoryAccessor getField(); - __host__ floatT barePlaquette(); - __host__ floatT bareUtauMinusUsigma(); - __host__ floatT bareClover(); - __host__ floatT bareRectangle(); + HOST floatT barePlaquette(); + HOST floatT bareUtauMinusUsigma(); + HOST floatT bareClover(); + HOST floatT bareRectangle(); - __host__ floatT barePlaquetteSS(); + HOST floatT barePlaquetteSS(); public: diff --git a/src/gauge/constructs/PlaqConstructs.h b/src/gauge/constructs/PlaqConstructs.h index 6d69248f..1627abee 100644 --- a/src/gauge/constructs/PlaqConstructs.h +++ b/src/gauge/constructs/PlaqConstructs.h @@ -15,7 +15,7 @@ template -__host__ __device__ GSU3 inline Plaq_P(gaugeAccessor gAcc, gSite site, int mu, int nu) { +HOST_DEVICE GSU3 inline Plaq_P(gaugeAccessor gAcc, gSite site, int mu, int nu) { typedef GIndexer GInd; GSU3 temp; @@ -38,7 +38,7 @@ __host__ __device__ GSU3 inline Plaq_P(gaugeAccessor gAcc, return temp; } template -__host__ __device__ GSU3 inline Plaq_Q(gaugeAccessor gAcc, gSite site, int mu, int nu) { +HOST_DEVICE GSU3 inline 
Plaq_Q(gaugeAccessor gAcc, gSite site, int mu, int nu) { typedef GIndexer GInd; GSU3 temp; @@ -62,7 +62,7 @@ __host__ __device__ GSU3 inline Plaq_Q(gaugeAccessor gAcc, } template -__host__ __device__ GSU3 inline Plaq_R(gaugeAccessor gAcc, gSite site, int mu, int nu) { +HOST_DEVICE GSU3 inline Plaq_R(gaugeAccessor gAcc, gSite site, int mu, int nu) { typedef GIndexer GInd; GSU3 temp; @@ -85,7 +85,7 @@ __host__ __device__ GSU3 inline Plaq_R(gaugeAccessor gAcc, return temp; } template -__host__ __device__ GSU3 inline Plaq_S(gaugeAccessor gAcc, gSite site, int mu, int nu) { +HOST_DEVICE GSU3 inline Plaq_S(gaugeAccessor gAcc, gSite site, int mu, int nu) { typedef GIndexer GInd; GSU3 temp; diff --git a/src/gauge/constructs/derivative3link.h b/src/gauge/constructs/derivative3link.h index 0290e4b2..13cccdf4 100644 --- a/src/gauge/constructs/derivative3link.h +++ b/src/gauge/constructs/derivative3link.h @@ -7,7 +7,7 @@ #include "../../base/math/gaugeAccessor.h" template -__host__ __device__ GSU3 linkDerivative3(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu) { +HOST_DEVICE GSU3 linkDerivative3(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu) { typedef GIndexer GInd; GSU3 temp; gSite origin = site; diff --git a/src/gauge/constructs/derivative5link.h b/src/gauge/constructs/derivative5link.h index da578b7e..38525557 100644 --- a/src/gauge/constructs/derivative5link.h +++ b/src/gauge/constructs/derivative5link.h @@ -7,7 +7,7 @@ #include "../../base/math/gaugeAccessor.h" template -__host__ __device__ GSU3 linkDerivative5(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { +HOST_DEVICE GSU3 linkDerivative5(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; GSU3 temp; @@ -161,7 +161,7 @@ __host__ __device__ GSU3 linkDerivative5(gaugeAccessor gA }; template - __host__ __device__ GSU3 linkDerivative5_1(gaugeAccessor gAcc, gaugeAccessor 
finAccessor, gSite site, int mu, int nu, int rho) { + HOST_DEVICE GSU3 linkDerivative5_1(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return gAcc.getLink(GInd::getSiteMu(GInd::site_up(site,mu), rho)) *gAcc.getLink(GInd::getSiteMu(GInd::site_up_up(site,mu,rho), nu)) @@ -171,7 +171,7 @@ template - __host__ __device__ GSU3 linkDerivative5_3(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + HOST_DEVICE GSU3 linkDerivative5_3(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return gAcc.getLinkDagger(GInd::getSiteMu( GInd::site_up_dn(site,mu,rho), rho)) *gAcc.getLinkDagger(GInd::getSiteMu( GInd::site_up_dn_dn(site,mu,nu,rho), nu)) @@ -181,7 +181,7 @@ template - __host__ __device__ GSU3 linkDerivative5_5(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + HOST_DEVICE GSU3 linkDerivative5_5(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return gAcc.getLink(GInd::getSiteMu(GInd::site_up(site,mu), rho)) *gAcc.getLinkDagger(GInd::getSiteMu(GInd::site_up_up_dn(site,mu,rho,nu), nu)) @@ -191,7 +191,7 @@ template - __host__ __device__ GSU3 linkDerivative5_7(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + HOST_DEVICE GSU3 linkDerivative5_7(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return gAcc.getLinkDagger(GInd::getSiteMu( GInd::site_up_dn(site,mu,rho), rho)) *gAcc.getLink(GInd::getSiteMu( GInd::site_up_dn(site,mu,rho), nu)) @@ -201,7 +201,7 @@ template - __host__ __device__ GSU3 linkDerivative5_9(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + HOST_DEVICE GSU3 linkDerivative5_9(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer 
GInd; return gAcc.getLink(GInd::getSiteMu(GInd::site_up(site,mu), nu)) *finAccessor.getLink(GInd::getSiteMu(GInd::site_up_up_dn(site,mu,nu,rho), rho)) @@ -211,7 +211,7 @@ template - __host__ __device__ GSU3 linkDerivative5_11(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + HOST_DEVICE GSU3 linkDerivative5_11(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return gAcc.getLinkDagger(GInd::getSiteMu(GInd::site_up_dn(site,mu,nu), nu)) *finAccessor.getLinkDagger(GInd::getSiteMu(GInd::site_up_dn(site,mu,nu), rho)) @@ -221,7 +221,7 @@ template - __host__ __device__ GSU3 linkDerivative5_13(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + HOST_DEVICE GSU3 linkDerivative5_13(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return gAcc.getLinkDagger(GInd::getSiteMu(GInd::site_up_dn(site,mu,nu), nu)) *finAccessor.getLink(GInd::getSiteMu( GInd::site_up_dn_dn(site,mu,nu,rho),rho)) @@ -231,7 +231,7 @@ template - __host__ __device__ GSU3 linkDerivative5_15(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + HOST_DEVICE GSU3 linkDerivative5_15(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return gAcc.getLink(GInd::getSiteMu(GInd::site_up(site,mu), nu)) *finAccessor.getLinkDagger(GInd::getSiteMu(GInd::site_up_up(site,mu,nu), rho)) @@ -241,7 +241,7 @@ template - __host__ __device__ GSU3 linkDerivative5_17(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + HOST_DEVICE GSU3 linkDerivative5_17(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return finAccessor.getLink(GInd::getSiteMu(GInd::site_up_dn(site,mu,nu), nu)) *gAcc.getLinkDagger(GInd::getSiteMu(GInd::site_dn(site,nu), mu)) @@ -251,7 +251,7 
@@ template - __host__ __device__ GSU3 linkDerivative5_19(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + HOST_DEVICE GSU3 linkDerivative5_19(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return finAccessor.getLinkDagger(GInd::getSiteMu(GInd::site_up(site,mu), nu)) *gAcc.getLinkDagger(GInd::getSiteMu(GInd::site_up(site,nu), mu)) @@ -262,7 +262,7 @@ template - __host__ __device__ GSU3 linkDerivative5_21(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + HOST_DEVICE GSU3 linkDerivative5_21(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return finAccessor.getLink(GInd::getSiteMu(GInd::site_up_dn(site,mu,nu), nu)) *gAcc.getLinkDagger(GInd::getSiteMu(GInd::site_dn(site,nu), mu)) @@ -272,7 +272,7 @@ template - __host__ __device__ GSU3 linkDerivative5_23(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + HOST_DEVICE GSU3 linkDerivative5_23(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return finAccessor.getLinkDagger(GInd::getSiteMu(GInd::site_up(site,mu), nu)) *gAcc.getLinkDagger(GInd::getSiteMu(GInd::site_up(site,nu), mu)) @@ -282,7 +282,7 @@ template - __host__ __device__ GSU3 linkDerivative5_25(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + HOST_DEVICE GSU3 linkDerivative5_25(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return gAcc.getLinkDagger(GInd::getSiteMu( GInd::site_up_dn(site,mu,rho), rho)) *gAcc.getLinkDagger(GInd::getSiteMu(GInd::site_dn(site,rho), mu)) @@ -292,7 +292,7 @@ template - __host__ __device__ GSU3 linkDerivative5_27(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + HOST_DEVICE GSU3 
linkDerivative5_27(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return gAcc.getLink(GInd::getSiteMu(GInd::site_up(site,mu), rho)) *gAcc.getLinkDagger(GInd::getSiteMu(GInd::site_up(site,rho), mu)) @@ -302,7 +302,7 @@ template - __host__ __device__ GSU3 linkDerivative5_29(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + HOST_DEVICE GSU3 linkDerivative5_29(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return gAcc.getLinkDagger(GInd::getSiteMu( GInd::site_up_dn(site,mu,rho), rho)) *gAcc.getLinkDagger(GInd::getSiteMu(GInd::site_dn(site,rho), mu)) @@ -312,7 +312,7 @@ template - __host__ __device__ GSU3 linkDerivative5_31(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + HOST_DEVICE GSU3 linkDerivative5_31(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return gAcc.getLink(GInd::getSiteMu(GInd::site_up(site,mu), rho)) *gAcc.getLinkDagger(GInd::getSiteMu(GInd::site_up(site,rho), mu)) @@ -322,7 +322,7 @@ template - __host__ __device__ GSU3 linkDerivative5_33(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + HOST_DEVICE GSU3 linkDerivative5_33(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return gAcc.getLink(GInd::getSiteMu(GInd::site_up(site,mu), rho)) *gAcc.getLinkDagger(GInd::getSiteMu(GInd::site_up_up_dn(site,mu,rho,nu), nu)) @@ -332,7 +332,7 @@ template - __host__ __device__ GSU3 linkDerivative5_35(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + HOST_DEVICE GSU3 linkDerivative5_35(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return gAcc.getLinkDagger(GInd::getSiteMu( GInd::site_up_dn(site,mu,rho), rho)) 
*gAcc.getLink(GInd::getSiteMu( GInd::site_up_dn(site,mu,rho), nu)) @@ -342,7 +342,7 @@ template - __host__ __device__ GSU3 linkDerivative5_37(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + HOST_DEVICE GSU3 linkDerivative5_37(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return gAcc.getLinkDagger(GInd::getSiteMu( GInd::site_up_dn(site,mu,rho), rho)) *gAcc.getLinkDagger(GInd::getSiteMu(GInd::site_up_dn_dn(site,mu,rho,nu), nu)) @@ -352,7 +352,7 @@ template - __host__ __device__ GSU3 linkDerivative5_39(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + HOST_DEVICE GSU3 linkDerivative5_39(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return gAcc.getLink(GInd::getSiteMu(GInd::site_up(site,mu), rho)) *gAcc.getLink(GInd::getSiteMu(GInd::site_up_up(site,mu,rho), nu)) diff --git a/src/gauge/constructs/derivative7link.h b/src/gauge/constructs/derivative7link.h index 06f9e318..c2e1d60c 100644 --- a/src/gauge/constructs/derivative7link.h +++ b/src/gauge/constructs/derivative7link.h @@ -5,7 +5,7 @@ #include "../../base/math/gaugeAccessor.h" template -__host__ __device__ GSU3 linkDerivative7(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, +HOST_DEVICE GSU3 linkDerivative7(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho, int sigma, int TermCheck = -1, int SubTermCheck = -1) { typedef GIndexer GInd; GSU3 temp = gsu3_zero(); diff --git a/src/gauge/constructs/derivativeLepagelink.h b/src/gauge/constructs/derivativeLepagelink.h index bbda330e..02198bee 100644 --- a/src/gauge/constructs/derivativeLepagelink.h +++ b/src/gauge/constructs/derivativeLepagelink.h @@ -7,7 +7,7 @@ #include "../../base/math/gaugeAccessor.h" template -__host__ __device__ GSU3 linkDerivativeLepage(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int 
nu) { +HOST_DEVICE GSU3 linkDerivativeLepage(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu) { typedef GIndexer GInd; GSU3 temp; gSite origin = site; diff --git a/src/gauge/constructs/derivativeProjectU3Constructs.h b/src/gauge/constructs/derivativeProjectU3Constructs.h index ce457863..dae1544d 100644 --- a/src/gauge/constructs/derivativeProjectU3Constructs.h +++ b/src/gauge/constructs/derivativeProjectU3Constructs.h @@ -11,7 +11,7 @@ #include "gsvd.h" template -__host__ __device__ GSU3 derivativeProjectU3(gaugeAccessor gAcc, gaugeAccessor fAcc, gSite site, int mu) { +HOST_DEVICE GSU3 derivativeProjectU3(gaugeAccessor gAcc, gaugeAccessor fAcc, gSite site, int mu) { typedef GIndexer GInd; diff --git a/src/gauge/constructs/fat7LinkConstructs.h b/src/gauge/constructs/fat7LinkConstructs.h index f63d6958..f1df761e 100644 --- a/src/gauge/constructs/fat7LinkConstructs.h +++ b/src/gauge/constructs/fat7LinkConstructs.h @@ -11,7 +11,7 @@ template - __host__ __device__ GSU3 inline naikLinkStaple(gaugeAccessor gAcc, gSiteMu siteMu) { + HOST_DEVICE GSU3 inline naikLinkStaple(gaugeAccessor gAcc, gSiteMu siteMu) { typedef GIndexer GInd; @@ -24,7 +24,7 @@ template } template - __host__ __device__ GSU3 inline threeLinkStaple(gaugeAccessor gAcc, gSiteMu siteMu) { + HOST_DEVICE GSU3 inline threeLinkStaple(gaugeAccessor gAcc, gSiteMu siteMu) { typedef GIndexer GInd; GSU3 temp = gsu3_zero(); gSite site = GInd::getSite(siteMu.isite); @@ -42,7 +42,7 @@ template } template - __host__ __device__ GSU3 inline lepageLinkStaple(gaugeAccessor gAcc, gSiteMu siteMu) { + HOST_DEVICE GSU3 inline lepageLinkStaple(gaugeAccessor gAcc, gSiteMu siteMu) { typedef GIndexer GInd; GSU3 temp = gsu3_zero(); gSite site = GInd::getSite(siteMu.isite); @@ -61,7 +61,7 @@ template template - __host__ __device__ GSU3 inline fiveLinkStaple(gaugeAccessor gAcc, gSiteMu siteMu) { + HOST_DEVICE GSU3 inline fiveLinkStaple(gaugeAccessor gAcc, gSiteMu siteMu) { typedef GIndexer GInd; GSU3 temp = 
gsu3_zero(); gSite site = GInd::getSite(siteMu.isite); @@ -122,7 +122,7 @@ template } template - __host__ __device__ GSU3 inline sevenLinkStaple(gaugeAccessor gAcc, gSiteMu siteMu) { + HOST_DEVICE GSU3 inline sevenLinkStaple(gaugeAccessor gAcc, gSiteMu siteMu) { typedef GIndexer GInd; GSU3 temp = gsu3_zero(); gSite site = GInd::getSite(siteMu.isite); diff --git a/src/gauge/constructs/gsvd.h b/src/gauge/constructs/gsvd.h index e5cb2ade..028783a4 100644 --- a/src/gauge/constructs/gsvd.h +++ b/src/gauge/constructs/gsvd.h @@ -89,7 +89,7 @@ * This routine eliminates off-diagonal element, handling special cases * ************************************************************************/ template -__device__ __host__ inline int svd2x2bidiag(svdfloatT *a00, svdfloatT *a01, svdfloatT *a11, svdfloatT U2[2][2], svdfloatT V2[2][2]) +HOST_DEVICE inline int svd2x2bidiag(svdfloatT *a00, svdfloatT *a01, svdfloatT *a11, svdfloatT U2[2][2], svdfloatT V2[2][2]) { register svdfloatT sinphi, cosphi, tanphi, cotphi; register svdfloatT a, b, min, max, abs00, abs01, abs11; @@ -289,7 +289,7 @@ __device__ __host__ inline int svd2x2bidiag(svdfloatT *a00, svdfloatT *a01, svdf template -__device__ __host__ GSU3 svd3x3core(const GSU3& AA, floatT* sv){ +HOST_DEVICE GSU3 svd3x3core(const GSU3& AA, floatT* sv){ /****************************************** * sigma[3] -- singular values, * diff --git a/src/gauge/constructs/hisqForceConstructs.h b/src/gauge/constructs/hisqForceConstructs.h index 6ab14198..772b5ebe 100644 --- a/src/gauge/constructs/hisqForceConstructs.h +++ b/src/gauge/constructs/hisqForceConstructs.h @@ -8,7 +8,7 @@ #include "derivativeLepagelink.h" template - __host__ __device__ GSU3 smearingForce(gaugeAccessor gAcc, gaugeAccessor finAccessor, + HOST_DEVICE GSU3 smearingForce(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, SmearingParameters _smearparam, int TermCheck = -1, int SubTermCheck = -1, bool doL1 = true, bool doL3 = true, bool doL5 = true, bool doL7 = 
true, bool doLLp = true) { @@ -66,7 +66,7 @@ template - __host__ __device__ GSU3 threeLinkContribution(gaugeAccessor gAcc, gaugeAccessor finAccessor, + HOST_DEVICE GSU3 threeLinkContribution(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, SmearingParameters _smearparam) { typedef GIndexer GInd; floatT c1 =_smearparam._c_1; @@ -81,7 +81,7 @@ template __host__ __device__ GSU3 lepagelinkContribution(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c_lp) { +template HOST_DEVICE GSU3 lepagelinkContribution(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c_lp) { GSU3 derivative_lp = gsu3_zero(); for (int nu_h = 1; nu_h < 4; nu_h++) { int nu = (mu+nu_h)%4; @@ -90,7 +90,7 @@ template __host__ __device__ GSU3 sevenLinkContribution(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c7, int Term, int SubTerm) { +template HOST_DEVICE GSU3 sevenLinkContribution(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c7, int Term, int SubTerm) { GSU3 sevenlinkCont = gsu3_zero(); for (int nu_h = 1; nu_h < 4; nu_h++) { int nu = (mu + nu_h)%4; @@ -103,7 +103,7 @@ template __host__ __device return -c7*sevenlinkCont; }; -template __host__ __device__ GSU3 fiveLinkContribution(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c5, int part) { +template HOST_DEVICE GSU3 fiveLinkContribution(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c5, int part) { typedef GIndexer GInd; GSU3 fivelinkCont = gsu3_zero(); for (int nu_h = 1; nu_h < 4; nu_h++) { @@ -124,7 +124,7 @@ template __host__ __device return finAccessor.getLink(GInd::getSiteMu(site,mu))+c5*fivelinkCont; }; -template __host__ __device__ GSU3 fiveLinkContribution_11(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c5) { +template HOST_DEVICE GSU3 fiveLinkContribution_11(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c5) { 
typedef GIndexer GInd; GSU3 fivelinkCont = gsu3_zero(); for (int nu_h = 1; nu_h < 4; nu_h++) { @@ -147,7 +147,7 @@ template __host__ __device return c5*fivelinkCont; }; -template __host__ __device__ GSU3 fiveLinkContribution_12(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c5) { +template HOST_DEVICE GSU3 fiveLinkContribution_12(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c5) { typedef GIndexer GInd; GSU3 fivelinkCont = gsu3_zero(); for (int nu_h = 1; nu_h < 4; nu_h++) { @@ -170,7 +170,7 @@ template __host__ __device return c5*fivelinkCont; }; -template __host__ __device__ GSU3 fiveLinkContribution_13(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c5) { +template HOST_DEVICE GSU3 fiveLinkContribution_13(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c5) { typedef GIndexer GInd; GSU3 fivelinkCont = gsu3_zero(); for (int nu_h = 1; nu_h < 4; nu_h++) { @@ -193,7 +193,7 @@ template __host__ __device return c5*fivelinkCont; }; -template __host__ __device__ GSU3 fiveLinkContribution_14(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c5) { +template HOST_DEVICE GSU3 fiveLinkContribution_14(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c5) { typedef GIndexer GInd; GSU3 fivelinkCont = gsu3_zero(); for (int nu_h = 1; nu_h < 4; nu_h++) { @@ -217,7 +217,7 @@ template __host__ __device }; -template __host__ __device__ GSU3 fiveLinkContribution_20(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c5) { +template HOST_DEVICE GSU3 fiveLinkContribution_20(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c5) { typedef GIndexer GInd; GSU3 fivelinkCont = gsu3_zero(); for (int nu_h = 1; nu_h < 4; nu_h++) { @@ -275,7 +275,7 @@ template __host__ __device return c5*fivelinkCont; }; -template __host__ __device__ GSU3 fiveLinkContribution_30(gaugeAccessor gAcc, gaugeAccessor 
finAccessor, gSite site, int mu, floatT c5) { +template HOST_DEVICE GSU3 fiveLinkContribution_30(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c5) { typedef GIndexer GInd; GSU3 fivelinkCont = gsu3_zero(); for (int nu_h = 1; nu_h < 4; nu_h++) { @@ -326,7 +326,7 @@ template __host__ __device return c5*fivelinkCont; }; -template __host__ __device__ GSU3 sevenLinkContribution_1(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c7) { +template HOST_DEVICE GSU3 sevenLinkContribution_1(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c7) { typedef GIndexer GInd; GSU3 sevenlinkCont = gsu3_zero(); for (int nu_h = 1; nu_h < 4; nu_h++) { @@ -403,7 +403,7 @@ template __host__ __device return -c7*sevenlinkCont; }; -template __host__ __device__ GSU3 sevenLinkContribution_2(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c7) { +template HOST_DEVICE GSU3 sevenLinkContribution_2(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c7) { typedef GIndexer GInd; GSU3 sevenlinkCont = gsu3_zero(); for (int nu_h = 1; nu_h < 4; nu_h++) { @@ -476,7 +476,7 @@ template __host__ __device return -c7*sevenlinkCont; }; -template __host__ __device__ GSU3 sevenLinkContribution_3(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c7) { +template HOST_DEVICE GSU3 sevenLinkContribution_3(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c7) { typedef GIndexer GInd; GSU3 sevenlinkCont = gsu3_zero(); for (int nu_h = 1; nu_h < 4; nu_h++) { @@ -551,7 +551,7 @@ template __host__ __device -template __host__ __device__ GSU3 sevenLinkContribution_4(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c7) { +template HOST_DEVICE GSU3 sevenLinkContribution_4(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c7) { typedef GIndexer GInd; GSU3 sevenlinkCont = gsu3_zero(); for (int nu_h = 1; nu_h < 4; 
nu_h++) { @@ -625,7 +625,7 @@ template __host__ __device return -c7*sevenlinkCont; }; -template __host__ __device__ GSU3 sevenLinkContribution_5(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c7) { +template HOST_DEVICE GSU3 sevenLinkContribution_5(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c7) { typedef GIndexer GInd; GSU3 sevenlinkCont = gsu3_zero(); for (int nu_h = 1; nu_h < 4; nu_h++) { @@ -695,7 +695,7 @@ template __host__ __device return -c7*sevenlinkCont; }; -template __host__ __device__ GSU3 sevenLinkContribution_6(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c7) { +template HOST_DEVICE GSU3 sevenLinkContribution_6(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c7) { typedef GIndexer GInd; GSU3 sevenlinkCont = gsu3_zero(); for (int nu_h = 1; nu_h < 4; nu_h++) { @@ -764,7 +764,7 @@ template __host__ __device return -c7*sevenlinkCont; }; -template __host__ __device__ GSU3 sevenLinkContribution_7(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c7) { +template HOST_DEVICE GSU3 sevenLinkContribution_7(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c7) { typedef GIndexer GInd; GSU3 sevenlinkCont = gsu3_zero(); for (int nu_h = 1; nu_h < 4; nu_h++) { diff --git a/src/gauge/constructs/linkLepageConstructs.h b/src/gauge/constructs/linkLepageConstructs.h index bfee8028..7bfd7863 100644 --- a/src/gauge/constructs/linkLepageConstructs.h +++ b/src/gauge/constructs/linkLepageConstructs.h @@ -14,7 +14,7 @@ template - __device__ GSU3 inline linkLpUp(gaugeAccessor gAcc, gSite site, int mu, int nu) { + DEVICE GSU3 inline linkLpUp(gaugeAccessor gAcc, gSite site, int mu, int nu) { typedef GIndexer GInd; gSite origin = site; @@ -54,7 +54,7 @@ template } template - __device__ GSU3 inline linkLpDn(gaugeAccessor gAcc, gSite site, int mu, int nu) { + DEVICE GSU3 inline linkLpDn(gaugeAccessor gAcc, gSite site, int mu, 
int nu) { typedef GIndexer GInd; gSite dn = GInd::site_dn(site, nu); gSite twoDn = GInd::site_dn(dn, nu); diff --git a/src/gauge/constructs/linkStaple3Constructs.h b/src/gauge/constructs/linkStaple3Constructs.h index 8f6b29b3..47b42694 100644 --- a/src/gauge/constructs/linkStaple3Constructs.h +++ b/src/gauge/constructs/linkStaple3Constructs.h @@ -16,7 +16,7 @@ template - __device__ GSU3 inline linkStaple3Up(gaugeAccessor gAcc, gSite site, int mu, int nu) { + DEVICE GSU3 inline linkStaple3Up(gaugeAccessor gAcc, gSite site, int mu, int nu) { typedef GIndexer GInd; GSU3 temp; @@ -48,7 +48,7 @@ template } template - __device__ GSU3 inline linkStaple3Dn(gaugeAccessor gAcc, gSite site, int mu, int nu) { + DEVICE GSU3 inline linkStaple3Dn(gaugeAccessor gAcc, gSite site, int mu, int nu) { typedef GIndexer GInd; GSU3 temp; diff --git a/src/gauge/constructs/linkStaple5Constructs.h b/src/gauge/constructs/linkStaple5Constructs.h index ebce6460..a0a6af34 100644 --- a/src/gauge/constructs/linkStaple5Constructs.h +++ b/src/gauge/constructs/linkStaple5Constructs.h @@ -14,7 +14,7 @@ #include "linkStaple3Constructs.h" template - __device__ GSU3 inline linkStaple5Up(gaugeAccessor gAcc, gSite site, int mu, int nu, int rho, + DEVICE GSU3 inline linkStaple5Up(gaugeAccessor gAcc, gSite site, int mu, int nu, int rho, int gamma) { typedef GIndexer GInd; diff --git a/src/gauge/constructs/linkStaple7Constructs.h b/src/gauge/constructs/linkStaple7Constructs.h index c5674d4a..d63be220 100644 --- a/src/gauge/constructs/linkStaple7Constructs.h +++ b/src/gauge/constructs/linkStaple7Constructs.h @@ -15,7 +15,7 @@ #include "linkStaple5Constructs.h" template - __device__ GSU3 inline linkStaple7Up(gaugeAccessor gAcc,gSite site, int mu, int nu, int rho, int gamma){ + DEVICE GSU3 inline linkStaple7Up(gaugeAccessor gAcc,gSite site, int mu, int nu, int rho, int gamma){ typedef GIndexer GInd; GSU3 staple5=gsu3_zero(); diff --git a/src/gauge/constructs/naikConstructs.h 
b/src/gauge/constructs/naikConstructs.h index c41915c6..0a8351e5 100644 --- a/src/gauge/constructs/naikConstructs.h +++ b/src/gauge/constructs/naikConstructs.h @@ -13,7 +13,7 @@ #include "../gaugefield.h" template -__device__ GSU3 inline naik3LinkUp(gaugeAccessor gAcc, gSite site, int mu) { +DEVICE GSU3 inline naik3LinkUp(gaugeAccessor gAcc, gSite site, int mu) { typedef GIndexer GInd; GSU3 temp; @@ -33,7 +33,7 @@ __device__ GSU3 inline naik3LinkUp(gaugeAccessor gAcc, gSite sit } /* template -__device__ GSU3 inline naik3LinkDn(gaugeAccessor gAcc, gSite site, int mu) { +DEVICE GSU3 inline naik3LinkDn(gaugeAccessor gAcc, gSite site, int mu) { typedef GIndexer GInd; GSU3 temp; diff --git a/src/gauge/constructs/naikDerivativeConstructs.h b/src/gauge/constructs/naikDerivativeConstructs.h index e7e10bd6..905c0269 100644 --- a/src/gauge/constructs/naikDerivativeConstructs.h +++ b/src/gauge/constructs/naikDerivativeConstructs.h @@ -14,7 +14,7 @@ __device__ inline floatT sgn_naik(gSiteMu siteMu) { }*/ template -__host__ __device__ GSU3 inline naikLinkDerivative(gaugeAccessor gAcc, +HOST_DEVICE GSU3 inline naikLinkDerivative(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu) { typedef GIndexer GInd; diff --git a/src/gauge/constructs/projectU3Constructs.h b/src/gauge/constructs/projectU3Constructs.h index 237a7422..ab3c65f1 100644 --- a/src/gauge/constructs/projectU3Constructs.h +++ b/src/gauge/constructs/projectU3Constructs.h @@ -12,7 +12,7 @@ #include "../gaugefield.h" #include "gsvd.h" template -__host__ __device__ GSU3 inline projectU3(gaugeAccessor gAcc, gSite site, int mu) { +HOST_DEVICE GSU3 inline projectU3(gaugeAccessor gAcc, gSite site, int mu) { typedef GIndexer GInd; GSU3 V; diff --git a/src/gauge/gaugeActionDeriv.h b/src/gauge/gaugeActionDeriv.h index c447eadc..29d474eb 100644 --- a/src/gauge/gaugeActionDeriv.h +++ b/src/gauge/gaugeActionDeriv.h @@ -16,7 +16,7 @@ template -__host__ __device__ GSU3 inline gaugeActionDerivPlaq(gaugeAccessor gAcc, 
gSite site, int mu) { +HOST_DEVICE GSU3 inline gaugeActionDerivPlaq(gaugeAccessor gAcc, gSite site, int mu) { GSU3 result = gsu3_zero(); GSU3 tmp = gsu3_zero(); @@ -32,7 +32,7 @@ __host__ __device__ GSU3 inline gaugeActionDerivPlaq(gaugeAccessor -__host__ __device__ GSU3 inline gaugeActionDerivRect(gaugeAccessor gAcc, gSite site, int mu) { +HOST_DEVICE GSU3 inline gaugeActionDerivRect(gaugeAccessor gAcc, gSite site, int mu) { typedef GIndexer GInd; GSU3 result = gsu3_zero(); GSU3 tmp = gsu3_zero(); @@ -79,7 +79,7 @@ __host__ __device__ GSU3 inline gaugeActionDerivRect(gaugeAccessor -__host__ __device__ GSU3 inline symanzikGaugeActionDeriv(gaugeAccessor latacc, gSite s, int mu) { +HOST_DEVICE GSU3 inline symanzikGaugeActionDeriv(gaugeAccessor latacc, gSite s, int mu) { typedef GIndexer GInd; // GSU3 tmp = (5. / 3.) * gaugeActionDerivPlaq(gAcc, site, mu) - // (1. / 12.) * gaugeActionDerivRect(gAcc, site, mu); @@ -167,7 +167,7 @@ __host__ __device__ GSU3 inline symanzikGaugeActionDeriv(gaugeAccessor -__host__ __device__ GSU3 inline gauge_force(gaugeAccessor latacc, gSiteMu site, floatT beta){ +HOST_DEVICE GSU3 inline gauge_force(gaugeAccessor latacc, gSiteMu site, floatT beta){ typedef GIndexer GInd; diff --git a/src/gauge/gauge_kernels.cpp b/src/gauge/gauge_kernels.cpp index 6ccf2d3c..037aa007 100644 --- a/src/gauge/gauge_kernels.cpp +++ b/src/gauge/gauge_kernels.cpp @@ -5,7 +5,7 @@ struct plaquetteKernel{ plaquetteKernel(Gaugefield &gauge) : gAcc(gauge.getAccessor()){ } - __device__ __host__ floatT operator()(gSite site) { + HOST_DEVICE floatT operator()(gSite site) { typedef GIndexer GInd; GSU3 temp; @@ -29,7 +29,7 @@ struct plaquetteKernelSS{ plaquetteKernelSS(Gaugefield &gauge) : gAcc(gauge.getAccessor()){ } - __device__ __host__ floatT operator()(gSite site) { + HOST_DEVICE floatT operator()(gSite site) { typedef GIndexer GInd; GSU3 temp; @@ -53,7 +53,7 @@ struct plaquetteKernel_double{ plaquetteKernel_double(Gaugefield &gauge) : gAcc(gauge.getAccessor()){ } - 
__device__ __host__ double operator()(gSite site) { + HOST_DEVICE double operator()(gSite site) { typedef GIndexer GInd; double result = 0; @@ -77,7 +77,7 @@ struct UtauMinusUsigmaKernel{ UtauMinusUsigmaKernel(Gaugefield &gauge) : gAcc(gauge.getAccessor()){ } - __device__ __host__ floatT operator()(gSite site) { + HOST_DEVICE floatT operator()(gSite site) { typedef GIndexer GInd; GSU3 temp; @@ -106,7 +106,7 @@ struct cloverKernel{ cloverKernel(Gaugefield &gauge) : gAcc(gauge.getAccessor()), FT(gAcc){ } - __device__ __host__ floatT operator()(gSite site) { + HOST_DEVICE floatT operator()(gSite site) { GSU3 Fmunu; @@ -130,7 +130,7 @@ struct rectangleKernel{ rectangleKernel(Gaugefield &gauge) : gAcc(gauge.getAccessor()){ } - __device__ __host__ floatT operator()(gSite site) { + HOST_DEVICE floatT operator()(gSite site) { typedef GIndexer GInd; GSU3 temp; @@ -164,7 +164,7 @@ struct rectangleKernel_double{ rectangleKernel_double(Gaugefield &gauge) : gAcc(gauge.getAccessor()){ } - __device__ __host__ double operator()(gSite site) { + HOST_DEVICE double operator()(gSite site) { typedef GIndexer GInd; GSU3 temp; @@ -204,7 +204,7 @@ struct gaugeActKernel_double{ gaugeActKernel_double(Gaugefield &gauge) : gAcc(gauge.getAccessor()){ } - __device__ __host__ double operator()(gSite site) { + HOST_DEVICE double operator()(gSite site) { typedef GIndexer GInd; GSU3 m_0; @@ -279,7 +279,7 @@ struct count_faulty_links { floatT tol; count_faulty_links(Gaugefield &GaugeL, Gaugefield &GaugeR, floatT tolerance=1e-6) : gL(GaugeL.getAccessor()), gR(GaugeR.getAccessor()), tol(tolerance) {} - __host__ __device__ int operator() (gSite site) { + HOST_DEVICE int operator() (gSite site) { int sum = 0; for (int mu = 0; mu < 4; mu++) { gSiteMu siteMu = GIndexer::getSiteMu(site,mu); diff --git a/src/gauge/gaugefield.h b/src/gauge/gaugefield.h index 375e0d75..d210f28a 100644 --- a/src/gauge/gaugefield.h +++ b/src/gauge/gaugefield.h @@ -102,8 +102,11 @@ class Gaugefield : public siteComm template 
void iterateOverFullAllMu(Functor op); +//TODO fix this for backend=cpu +#ifndef USE_CPU_ONLY template deviceStream iterateOverBulkAllMu(Functor op, bool useStream = false); +#endif template void iterateOverFullLoopMu(Functor op); @@ -136,7 +139,7 @@ struct convert_prec { convert_prec(Gaugefield &gaugeIn) : gAcc_source(gaugeIn.getAccessor()) {} - __device__ __host__ GSU3 operator()(gSiteMu site) { + HOST_DEVICE GSU3 operator()(gSiteMu site) { return gAcc_source.template getLink(site); } }; @@ -162,6 +165,8 @@ void Gaugefield::iterateOverFullAllMu(Functor this->template iterateFunctor(op, calcGSiteAllMuFull, writeAtReadMu, GInd::getLatData().vol4Full, 4); } +//TODO fix this for backend=cpu +#ifndef USE_CPU_ONLY template template deviceStream Gaugefield::iterateOverBulkAllMu(Functor op, bool useStream) { @@ -171,6 +176,7 @@ deviceStream Gaugefield::iterateOve this->template iterateFunctor(op, calcGSiteAllMu, writeAtReadMu, GInd::getLatData().vol4, 4, 1, stream._stream); return stream; } +#endif template template diff --git a/src/gauge/gaugefield_device.cpp b/src/gauge/gaugefield_device.cpp index 3818cfdf..de8e3864 100644 --- a/src/gauge/gaugefield_device.cpp +++ b/src/gauge/gaugefield_device.cpp @@ -16,10 +16,10 @@ struct fill_with_rand GSU3 my_mat; - __host__ __device__ void initialize(__attribute__((unused)) gSite site){ + HOST_DEVICE void initialize(__attribute__((unused)) gSite site){ } - __device__ __host__ GSU3 operator()(gSite site, __attribute__((unused)) size_t mu){ + HOST_DEVICE GSU3 operator()(gSite site, __attribute__((unused)) size_t mu){ my_mat.random(&_rand_state[site.isite]); return my_mat; } @@ -32,10 +32,10 @@ struct fill_with_gauss { GSU3 my_mat; - __host__ __device__ void initialize(__attribute__((unused)) gSite site) { + HOST_DEVICE void initialize(__attribute__((unused)) gSite site) { } - __device__ __host__ GSU3 operator()(gSite site, __attribute__((unused)) size_t mu) { + HOST_DEVICE GSU3 operator()(gSite site, __attribute__((unused)) size_t 
mu) { my_mat.gauss(&_rand_state[site.isite]); return my_mat; } @@ -45,12 +45,12 @@ struct fill_with_gauss { template struct UnitKernel{ - gaugeAccessor gaugeAccessor; - explicit UnitKernel(Gaugefield& gauge) : gaugeAccessor(gauge.getAccessor()){} - __device__ __host__ GSU3 operator()(gSiteMu siteMu){ + gaugeAccessor gaugeAcc; + explicit UnitKernel(Gaugefield& gauge) : gaugeAcc(gauge.getAccessor()){} + HOST_DEVICE GSU3 operator()(gSiteMu siteMu){ typedef GIndexer GInd; GSU3 temp; - temp=gaugeAccessor.template getLink(siteMu); + temp=gaugeAcc.template getLink(siteMu); temp.su3unitarize(); return temp; } diff --git a/src/modules/HISQ/staggeredPhases.h b/src/modules/HISQ/staggeredPhases.h index 26e78809..adef8f79 100644 --- a/src/modules/HISQ/staggeredPhases.h +++ b/src/modules/HISQ/staggeredPhases.h @@ -8,7 +8,7 @@ struct calcStaggeredPhase { - inline __host__ __device__ int operator()(const gSiteMu &siteMu) const { + inline HOST_DEVICE int operator()(const gSiteMu &siteMu) const { typedef GIndexer GInd; @@ -36,7 +36,7 @@ struct calcStaggeredPhase { * */ struct calcStaggeredBoundary { - inline __host__ __device__ int operator()(const gSiteMu &siteMu) const { + inline HOST_DEVICE int operator()(const gSiteMu &siteMu) const { typedef GIndexer GInd; @@ -51,7 +51,7 @@ struct calcStaggeredBoundary { template struct imagMuphase { - inline __host__ __device__ GPUcomplex operator()(const gSiteMu &siteMu, double chmp) const { + inline HOST_DEVICE GPUcomplex operator()(const gSiteMu &siteMu, double chmp) const { GPUcomplex img_chmp; diff --git a/src/modules/observables/FieldStrengthTensor.h b/src/modules/observables/FieldStrengthTensor.h index fef56346..18acbcb4 100644 --- a/src/modules/observables/FieldStrengthTensor.h +++ b/src/modules/observables/FieldStrengthTensor.h @@ -26,7 +26,7 @@ struct plaqClover { plaqClover(gaugeAccessor acc) : acc(acc) {} - __device__ __host__ inline GSU3 operator()(gSite site, int mu, int nu) { + HOST_DEVICE inline GSU3 operator()(gSite site, int 
mu, int nu) { return Plaq_P(acc, site, mu, nu) + Plaq_Q(acc, site, mu, nu) @@ -43,7 +43,7 @@ struct rectClover { rectClover(gaugeAccessor acc) : acc(acc) {} - __device__ __host__ inline GSU3 operator()(gSite site, int mu, int nu) { + HOST_DEVICE inline GSU3 operator()(gSite site, int mu, int nu) { gSite origin = site; gSite up = GInd::site_up(site, nu); gSite twoUp = GInd::site_up(up, nu); @@ -154,7 +154,7 @@ struct FieldStrengthTensor { FieldStrengthTensor(gaugeAccessor acc) : acc(acc), plClov(acc) {} - __device__ __host__ inline GSU3 operator()(gSite site, int mu, int nu) { + HOST_DEVICE inline GSU3 operator()(gSite site, int mu, int nu) { //define a unitary matrix for the addition in the end GSU3 unityGSU3 = gsu3_one(); @@ -186,7 +186,7 @@ struct FieldStrengthTensor_imp { FieldStrengthTensor_imp(gaugeAccessor acc) : acc(acc), plClov(acc), rcClov(acc) {} - __device__ __host__ inline GSU3 operator()(gSite site, int mu, int nu) { + HOST_DEVICE inline GSU3 operator()(gSite site, int mu, int nu) { //define a unitary matrix for the addition in the end GSU3 unityGSU3 = gsu3_one(); diff --git a/src/spinor/spinorfield.h b/src/spinor/spinorfield.h index 39c01673..65b71e61 100644 --- a/src/spinor/spinorfield.h +++ b/src/spinor/spinorfield.h @@ -17,18 +17,18 @@ #include template -__host__ __device__ constexpr inline Layout LayoutSwitcher(); +HOST_DEVICE constexpr inline Layout LayoutSwitcher(); template <> -__host__ __device__ constexpr inline Layout LayoutSwitcher() { +HOST_DEVICE constexpr inline Layout LayoutSwitcher() { return All; } template <> -__host__ __device__ constexpr inline Layout LayoutSwitcher() { +HOST_DEVICE constexpr inline Layout LayoutSwitcher() { return Even; } template <> -__host__ __device__ constexpr inline Layout LayoutSwitcher() { +HOST_DEVICE constexpr inline Layout LayoutSwitcher() { return Odd; } @@ -364,7 +364,7 @@ void Spinorfield::iterateOv WriteAtLoopStack writeAtRead; size_t elems = getNumberLatticePoints(); - this->template 
iterateFunctorLoop(op, calcGSite, writeAtRead, elems, 1 ,1 , (gpuStream_t)nullptr, Nmax); + this->template iterateFunctorLoop(op, calcGSite, writeAtRead, elems, 1 ,1 , (GPUSTREAM_T_)nullptr, Nmax); } template @@ -443,14 +443,14 @@ auto operator / (Spinorfield struct convert_spinor_precision { - __host__ __device__ void initialize(__attribute__((unused)) gSite& site){ + HOST_DEVICE void initialize(__attribute__((unused)) gSite& site){ //We do not initialize anything } gVect3arrayAcc spinor_source; convert_spinor_precision(Spinorfield &spinorIn) : spinor_source(spinorIn.getAccessor()) {} - __device__ __host__ auto operator()(gSiteStack site) { + HOST_DEVICE auto operator()(gSiteStack site) { return spinor_source.template getElement(site); } From 916df86f639f48f11ad6f47e7ba1328da510da5b Mon Sep 17 00:00:00 2001 From: Luis Altenkort Date: Tue, 23 Aug 2022 13:58:28 +0200 Subject: [PATCH 08/14] add missing GIT_HASH definition to CodeBase target --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8f8a122e..8a80cdd8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -313,6 +313,7 @@ if (USE_CUDA) add_library(CodeBase OBJECT ${SOURCE_FILES_BASE}) target_compile_definitions(CodeBase PRIVATE ARCHITECTURE=${ARCHITECTURE} + GIT_HASH="${GIT_HASH}" SINGLEPREC=1 DOUBLEPREC=1 COMP_R18=1 COMP_U3R14=1 COMP_R14=1 COMP_R12=1 COMP_STAGGR12=1 ${ALL_HALODEPTHS} From 5c95722d7636eb173efce54da2888e901d98b696 Mon Sep 17 00:00:00 2001 From: Luis Altenkort Date: Tue, 23 Aug 2022 14:37:52 +0200 Subject: [PATCH 09/14] change "HOST" define to "SQCD_HOST" to avoid syntax errors with MVAPICH --- src/applications/main_configConverter.cpp | 10 ++--- src/base/LatticeDimension.h | 48 +++++++++++------------ src/base/gutils.h | 2 +- src/base/indexer/BulkIndexer.h | 16 ++++---- src/base/math/gcomplex.h | 6 +-- src/base/math/gsu3.h | 14 +++---- src/base/math/gvect3.h | 12 +++--- src/base/math/simpleArray.h | 2 +- src/define.h | 4 +- 
src/gauge/GaugeAction.cpp | 10 ++--- src/gauge/GaugeAction.h | 10 ++--- 11 files changed, 67 insertions(+), 67 deletions(-) diff --git a/src/applications/main_configConverter.cpp b/src/applications/main_configConverter.cpp index 1628f090..e5790e48 100644 --- a/src/applications/main_configConverter.cpp +++ b/src/applications/main_configConverter.cpp @@ -30,8 +30,8 @@ int main(int argc, char *argv[]) { CommunicationBase commBase(&argc, &argv); param.readfile(commBase, "../parameter/applications/configConverter.param", argc, argv); - if( param.compress_out()==true && param.format_out()=="ildg" ) { - throw(rootLogger.fatal("ILDG format does not support compression.")); + if( param.compress_out() && param.format_out()=="ildg" ) { + throw std::runtime_error(rootLogger.fatal("ILDG format does not support compression.")); } commBase.init(param.nodeDim()); @@ -47,7 +47,7 @@ int main(int argc, char *argv[]) { } else if(param.format()=="milc"){ gauge.readconf_milc(param.GaugefileName()); } else { - throw(rootLogger.fatal("Invalid specification for format ",param.format())); + throw std::runtime_error(rootLogger.fatal("Invalid specification for format ",param.format())); } /// Print out: @@ -58,9 +58,9 @@ int main(int argc, char *argv[]) { gauge.writeconf_nersc(param.GaugefileName_out(), 3, param.prec_out()); } } else if(param.format_out()=="ildg") { - gauge.writeconf_ildg(param.GaugefileName_out(), param.prec_out()); + gauge.writeconf_ildg(param.GaugefileName_out(), param); } else { - throw(rootLogger.fatal("Invalid specification for format_out ",param.format_out())); + throw std::runtime_error(rootLogger.fatal("Invalid specification for format_out ",param.format_out())); } return 0; diff --git a/src/base/LatticeDimension.h b/src/base/LatticeDimension.h index 59a9e16a..b5148149 100644 --- a/src/base/LatticeDimension.h +++ b/src/base/LatticeDimension.h @@ -20,14 +20,14 @@ private : public : //! 
Copy constructor - HOST LatticeDimensions(const LatticeDimensions &lhs) { + SQCD_HOST LatticeDimensions(const LatticeDimensions &lhs) { for (int i = 0; i < 4; i++) c[i] = lhs.c[i]; } LatticeDimensions& operator=(const LatticeDimensions& a) = default; //! Default constructor, initializes to (0,0,0,0) - HOST LatticeDimensions() { + SQCD_HOST LatticeDimensions() { c[0] = 0; c[1] = 0; c[2] = 0; @@ -35,7 +35,7 @@ public : } //! Construct from (x,y,z,t) - HOST LatticeDimensions(const int x, const int y, const int z, const int t) { + SQCD_HOST LatticeDimensions(const int x, const int y, const int z, const int t) { c[0] = x; c[1] = y; c[2] = z; @@ -43,39 +43,39 @@ public : } //! Construct from int* (also works with Parameter) - HOST LatticeDimensions(const int *dim) { + SQCD_HOST LatticeDimensions(const int *dim) { for (int i = 0; i < 4; i++) c[i] = dim[i]; } //! Cast to int* (for usage in c-style MPI functions) - HOST operator int *() { return c; } + SQCD_HOST operator int *() { return c; } //! same with const - HOST operator const int *() const { return c; }; + SQCD_HOST operator const int *() const { return c; }; //! [] operator for member access - HOST int &operator[](int mu) { return c[mu]; }; + SQCD_HOST int &operator[](int mu) { return c[mu]; }; //! const [] operator for r/o member access - HOST const int &operator[](int mu) const { return c[mu]; }; + SQCD_HOST const int &operator[](int mu) const { return c[mu]; }; //! Component-wise multiplication, (x1*x2, y1*y2, z1*z2, t1*t2) - HOST LatticeDimensions operator*(const LatticeDimensions lhs) const { + SQCD_HOST LatticeDimensions operator*(const LatticeDimensions lhs) const { LatticeDimensions ret; for (int i = 0; i < 4; i++) ret.c[i] = c[i] * lhs.c[i]; return ret; } //! 
Component-wise division, (x1/x2, y1/y2, z1/z2, t1/t2) - HOST LatticeDimensions operator/(const LatticeDimensions lhs) const { + SQCD_HOST LatticeDimensions operator/(const LatticeDimensions lhs) const { LatticeDimensions ret; for (int i = 0; i < 4; i++) ret.c[i] = c[i] / lhs.c[i]; return ret; } //! modulo operation that returns coordinates within 0<=x= 4)) throw std::runtime_error(stdLogger.fatal("Wrong mu in LatticeDimensions")); c[mu] += ((plus) ? (1) : (-1)); } //! Formatted (debug) output - HOST friend std::ostream &operator<<(std::ostream &str, + SQCD_HOST friend std::ostream &operator<<(std::ostream &str, const LatticeDimensions &in) { str << "( "; for (int i = 0; i < 4; i++) str << in.c[i] << " "; @@ -125,13 +125,13 @@ public : //! Return all four entries multiplied - HOST long mult() const { + SQCD_HOST long mult() const { long res = 1; for (int i = 0; i < 4; i++) res *= (long) c[i]; return res; } - HOST long summed() const { + SQCD_HOST long summed() const { long res = 0; for (int i = 0; i < 4; i++) res += (long) c[i]; return res; @@ -139,11 +139,11 @@ public : //! Return if x,y,z,t are 0<=x= c[i])) return false; return true; @@ -151,13 +151,13 @@ public : //! Return an offset matching given coordinates. With input x,y,z,t //! this returns x + y*LX + z*LX*LY + t*LX*LY*LZ - HOST size_t offset(const LatticeDimensions &in) const { + SQCD_HOST size_t offset(const LatticeDimensions &in) const { size_t ret = in[0] + c[0] * in[1] + c[0] * c[1] * in[2] + c[0] * c[1] * c[2] * in[3]; return ret; } //! Return the lowest entry - HOST int lowest_value() const { + SQCD_HOST int lowest_value() const { int res = c[1]; for (int i = 0; i < 4; i++) if (c[i] < res)res = c[i]; @@ -165,7 +165,7 @@ public : } //! 
Return the lowest entry - HOST int lowest_spatial_value() const { + SQCD_HOST int lowest_spatial_value() const { int res = c[1]; for (int i = 0; i < 3; i++) if (c[i] < res)res = c[i]; diff --git a/src/base/gutils.h b/src/base/gutils.h index 13cd939b..7102b9d0 100644 --- a/src/base/gutils.h +++ b/src/base/gutils.h @@ -27,7 +27,7 @@ HOST_DEVICE void inline divmod(size_t nominator, size_t denominator, remainder = nominator - (quotient * denominator); } -HOST void inline compute_dim3(dim3 &blockDim, dim3 &gridDim, +SQCD_HOST void inline compute_dim3(dim3 &blockDim, dim3 &gridDim, const size_t elems, const size_t blockSize) { blockDim = blockSize; gridDim = static_cast(ceilf(static_cast(elems) / static_cast(blockDim.x))); diff --git a/src/base/indexer/BulkIndexer.h b/src/base/indexer/BulkIndexer.h index c9f32ccf..ec4105d3 100644 --- a/src/base/indexer/BulkIndexer.h +++ b/src/base/indexer/BulkIndexer.h @@ -64,14 +64,14 @@ struct gSite { isite(isite), isiteFull(isiteFull), coord(coord), coordFull(coordFull) {}; - HOST friend inline std::ostream &operator << (std::ostream &s, const gSite &site) { + SQCD_HOST friend inline std::ostream &operator << (std::ostream &s, const gSite &site) { s << "gSite: coord: " << site.coord.x << " " << site.coord.y << " " << site.coord.z << " " << site.coord.t << " " << "coordFull: " << site.coordFull.x << " " << site.coordFull.y << " " << site.coordFull.z << " " << site.coordFull.t << " " << "isite: " << site.isite << " isiteFull: " << site.isiteFull; return s; } - HOST inline std::string getStr() { + SQCD_HOST inline std::string getStr() { std::ostringstream s; s << "gSite: coord: " << coord.x << " " << coord.y << " " << coord.z << " " << coord.t << " " << "coordFull: " << coordFull.x << " " << coordFull.y << " " << coordFull.z << " " << coordFull.t << " " @@ -96,7 +96,7 @@ struct gSiteStack : public gSite { gSiteStack(const gSite) = delete; - HOST friend inline std::ostream &operator << (std::ostream &s, const gSiteStack &site) { + 
SQCD_HOST friend inline std::ostream &operator << (std::ostream &s, const gSiteStack &site) { s << "gSiteStack: coord: " << site.coord.x << " " << site.coord.y << " " << site.coord.z << " " << site.coord.t << " " << " coordFull: " << site.coordFull.x << " " << site.coordFull.y << " " << site.coordFull.z << " " << site.coordFull.t << " " << " isite: " << site.isite << " isiteFull: " << site.isiteFull << " stack: " << site.stack @@ -122,7 +122,7 @@ struct gSiteMu : public gSite { gSiteMu(const gSite) = delete; gSiteMu(const gSiteStack) = delete; - HOST friend inline std::ostream &operator << (std::ostream &s, const gSiteMu &site) { + SQCD_HOST friend inline std::ostream &operator << (std::ostream &s, const gSiteMu &site) { s << "gSite: coord: " << site.coord.x << " " << site.coord.y << " " << site.coord.z << " " << site.coord.t << " " << "coordFull: " << site.coordFull.x << " " << site.coordFull.y << " " << site.coordFull.z << " " << site.coordFull.t << " " << "isite: " << site.isite @@ -265,7 +265,7 @@ struct LatticeData { return false; } - HOST LatticeDimensions globalPos(LatticeDimensions n) { + SQCD_HOST LatticeDimensions globalPos(LatticeDimensions n) { LatticeDimensions coord = LatticeDimensions(gPosX,gPosY,gPosZ,gPosT) + n; @@ -277,11 +277,11 @@ struct LatticeData { return coord; } - HOST LatticeDimensions globalLattice() { + SQCD_HOST LatticeDimensions globalLattice() { return LatticeDimensions(globLX,globLY,globLZ,globLT); } - HOST LatticeDimensions localLattice() { + SQCD_HOST LatticeDimensions localLattice() { return LatticeDimensions(lx,ly,lz,lt); } @@ -704,7 +704,7 @@ class GIndexer { return ((coordFull.x + coordFull.y*getLatData().vol1Full + coordFull.z*getLatData().vol2Full) >> 0x1); } - HOST inline static size_t localCoordToGlobalIndex(LatticeDimensions coord) { + SQCD_HOST inline static size_t localCoordToGlobalIndex(LatticeDimensions coord) { LatticeData lat = GIndexer::getLatData(); LatticeDimensions globCoord = lat.globalPos(coord); return 
(globCoord[0] + globCoord[1] * lat.globLX + globCoord[2] * lat.globLX * lat.globLY + diff --git a/src/base/math/gcomplex.h b/src/base/math/gcomplex.h index 71c40a8f..aad25f6c 100644 --- a/src/base/math/gcomplex.h +++ b/src/base/math/gcomplex.h @@ -76,11 +76,11 @@ class GPUcomplex { cIMAG = 0.0f; }; - HOST GPUcomplex(const std::complex &orig) { + SQCD_HOST GPUcomplex(const std::complex &orig) { cREAL = std::real(orig); cIMAG = std::imag(orig); } - HOST GPUcomplex(const std::complex &orig) { + SQCD_HOST GPUcomplex(const std::complex &orig) { cREAL = std::real(orig); cIMAG = std::imag(orig); } @@ -527,7 +527,7 @@ template const GPUcomplex GPUcomplex_invalid(nanf(" "), nanf(" ")); template -HOST inline std::ostream &operator<<(std::ostream &s, +SQCD_HOST inline std::ostream &operator<<(std::ostream &s, GPUcomplex z) { return s << '(' << real(z) << ',' << imag(z) << ')'; } diff --git a/src/base/math/gsu3.h b/src/base/math/gsu3.h index 2e78ff4c..4c23728f 100644 --- a/src/base/math/gsu3.h +++ b/src/base/math/gsu3.h @@ -24,10 +24,10 @@ template class GSU3; template -HOST std::ostream &operator<<(std::ostream &, const GSU3 &); +SQCD_HOST std::ostream &operator<<(std::ostream &, const GSU3 &); template -HOST std::istream &operator>>(std::istream &, GSU3 &); +SQCD_HOST std::istream &operator>>(std::istream &, GSU3 &); template HOST_DEVICE inline GSU3 operator+(const GSU3 &, const GSU3 &); @@ -126,10 +126,10 @@ class GSU3 { #if (!defined __GPUCC__) - HOST friend std::ostream& operator<< <> (std::ostream&, const GSU3 &); + SQCD_HOST friend std::ostream& operator<< <> (std::ostream&, const GSU3 &); #endif - HOST friend std::istream &operator>><>(std::istream &, GSU3 &); + SQCD_HOST friend std::istream &operator>><>(std::istream &, GSU3 &); // matrix operations @@ -392,7 +392,7 @@ class GSU3 { return _e00; } - HOST inline const GCOMPLEX(floatT) &operator()(int i, int j) const { + SQCD_HOST inline const GCOMPLEX(floatT) &operator()(int i, int j) const { switch (i * 3 + j) { case 
0: return _e00; @@ -792,7 +792,7 @@ HOST_DEVICE bool GSU3::operator==(const GSU3 &y) { } template -HOST inline std::ostream &operator<<(std::ostream &s, const GSU3 &x) { +SQCD_HOST inline std::ostream &operator<<(std::ostream &s, const GSU3 &x) { return s << "\n" << x.getLink00() << x.getLink01() << x.getLink02() << "\n" << x.getLink10() << x.getLink11() << x.getLink12() << "\n" << x.getLink20() << x.getLink21() << x.getLink22() << "\n"; @@ -800,7 +800,7 @@ return s << "\n" << x.getLink00() << x.getLink01() << x.getLink02() << "\n" /// TODO: This presumably doesn't work template -HOST inline std::istream &operator>>(std::istream &s, GSU3 &x) { +SQCD_HOST inline std::istream &operator>>(std::istream &s, GSU3 &x) { return s >> x._e00.cREAL >> x._e00.cIMAG >> x._e01.cREAL >> x._e01.cIMAG >> x._e02.cREAL >> x._e02.cIMAG >> x._e10.cREAL >> x._e10.cIMAG >> x._e11.cREAL >> x._e11.cIMAG >> x._e12.cREAL >> x._e12.cIMAG >> x._e20.cREAL >> x._e20.cIMAG >> x._e21.cREAL >> x._e21.cIMAG >> x._e22.cREAL >> x._e22.cIMAG; diff --git a/src/base/math/gvect3.h b/src/base/math/gvect3.h index b5e2e215..e4bca291 100644 --- a/src/base/math/gvect3.h +++ b/src/base/math/gvect3.h @@ -19,8 +19,8 @@ template class GSU3; template class gVect3; template class cVect3; template class gVect3array; -template HOST std::ostream & operator<<(std::ostream &, const gVect3 &); -template HOST std::istream & operator>>(std::istream &, gVect3 &); +template SQCD_HOST std::ostream & operator<<(std::ostream &, const gVect3 &); +template SQCD_HOST std::istream & operator>>(std::istream &, gVect3 &); template HOST_DEVICE GCOMPLEX(floatT) operator*(const gVect3 &,const gVect3 &); template HOST_DEVICE GCOMPLEX(floatT) complex_product(const gVect3 &,const gVect3 &); template HOST_DEVICE GCOMPLEX(floatT) complex_product_add(const gVect3 &,const gVect3 &, const GCOMPLEX(floatT) &); @@ -58,9 +58,9 @@ class gVect3 HOST_DEVICE gVect3(GCOMPLEX(floatT) v0, GCOMPLEX(floatT) v1, GCOMPLEX(floatT) v2) : _v0(v0), _v1(v1), 
_v2(v2) {}; #if (!defined __GPUCC__) - HOST friend std::ostream &operator << <> (std::ostream &, const gVect3 &); + SQCD_HOST friend std::ostream &operator << <> (std::ostream &, const gVect3 &); #endif - HOST friend std::istream &operator >> <> (std::istream &, gVect3 &); + SQCD_HOST friend std::istream &operator >> <> (std::istream &, gVect3 &); friend class gVect3array; friend class gVect3array; @@ -459,13 +459,13 @@ HOST_DEVICE gVect3 conj(const gVect3 &x) #ifdef __GPUCC__ template -HOST std::ostream &operator << (std::ostream &s, const gVect3 &x) +SQCD_HOST std::ostream &operator << (std::ostream &s, const gVect3 &x) { return s << x.getElement0() << x.getElement1() << x.getElement2(); } template -HOST std::istream &operator >> (std::istream &s, gVect3 &x) +SQCD_HOST std::istream &operator >> (std::istream &s, gVect3 &x) { return s >> x._v0.cREAL >> x._v0.cIMAG >> x._v1.cREAL >> x._v1.cIMAG >> x._v2.cREAL >> x._v2.cIMAG; } diff --git a/src/base/math/simpleArray.h b/src/base/math/simpleArray.h index 05555fef..425fc562 100644 --- a/src/base/math/simpleArray.h +++ b/src/base/math/simpleArray.h @@ -50,7 +50,7 @@ class SimpleArray{ } - HOST void operator=(std::vector vec){ + SQCD_HOST void operator=(std::vector vec){ for(size_t i = 0; i < N; i++){ values[i] = vec.at(i); } diff --git a/src/define.h b/src/define.h index 6f8fa743..e4a3d7e1 100644 --- a/src/define.h +++ b/src/define.h @@ -30,7 +30,7 @@ #ifdef USE_CPU_ONLY #define HOST_DEVICE -#define HOST +#define SQCD_HOST #define DEVICE #define CONSTANT const #define GPUERROR_T void* @@ -56,7 +56,7 @@ struct dim3 { }; #else #define HOST_DEVICE __host__ __device__ -#define HOST __host__ +#define SQCD_HOST __host__ #define DEVICE __device__ #define CONSTANT __constant__ #define GPUERROR_T gpuError_t diff --git a/src/gauge/GaugeAction.cpp b/src/gauge/GaugeAction.cpp index c9b03221..fd92f9d2 100644 --- a/src/gauge/GaugeAction.cpp +++ b/src/gauge/GaugeAction.cpp @@ -65,7 +65,7 @@ MemoryAccessor 
GaugeAction::getRectangleField( template -HOST floatT GaugeAction::barePlaquette() { +SQCD_HOST floatT GaugeAction::barePlaquette() { if (recompute) { _redBase.template iterateOverBulk( @@ -77,7 +77,7 @@ HOST floatT GaugeAction::barePlaquette() { } template -HOST floatT GaugeAction::barePlaquetteSS() { +SQCD_HOST floatT GaugeAction::barePlaquetteSS() { // if (recompute) { _redBase.template iterateOverBulk( @@ -90,7 +90,7 @@ HOST floatT GaugeAction::barePlaquetteSS() { template -HOST floatT GaugeAction::bareUtauMinusUsigma() { +SQCD_HOST floatT GaugeAction::bareUtauMinusUsigma() { if (recompute) { _redBase.template iterateOverBulk( @@ -103,7 +103,7 @@ HOST floatT GaugeAction::bareUtauMinusUsigma() template -HOST floatT GaugeAction::bareClover() { +SQCD_HOST floatT GaugeAction::bareClover() { if (recompute) { _redBase.template iterateOverBulk( @@ -115,7 +115,7 @@ HOST floatT GaugeAction::bareClover() { } template -HOST floatT GaugeAction::bareRectangle() { +SQCD_HOST floatT GaugeAction::bareRectangle() { if (recompute) { _redBase.template iterateOverBulk( diff --git a/src/gauge/GaugeAction.h b/src/gauge/GaugeAction.h index ba5dab23..d544de87 100644 --- a/src/gauge/GaugeAction.h +++ b/src/gauge/GaugeAction.h @@ -24,12 +24,12 @@ class GaugeAction { template MemoryAccessor getField(); - HOST floatT barePlaquette(); - HOST floatT bareUtauMinusUsigma(); - HOST floatT bareClover(); - HOST floatT bareRectangle(); + SQCD_HOST floatT barePlaquette(); + SQCD_HOST floatT bareUtauMinusUsigma(); + SQCD_HOST floatT bareClover(); + SQCD_HOST floatT bareRectangle(); - HOST floatT barePlaquetteSS(); + SQCD_HOST floatT barePlaquetteSS(); public: From caa7602bf642521c9b4db4e7184567474d57104e Mon Sep 17 00:00:00 2001 From: Luis Altenkort Date: Wed, 24 Aug 2022 10:32:07 +0200 Subject: [PATCH 10/14] add missing include in floatComparison.h --- src/base/math/floatComparison.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/base/math/floatComparison.h b/src/base/math/floatComparison.h 
index 69a3703a..4233bfdf 100644 --- a/src/base/math/floatComparison.h +++ b/src/base/math/floatComparison.h @@ -8,6 +8,7 @@ #include #include #include "../wrapper/gpu_wrapper.h" +#include "../../define.h" From 28619b200c0a8351021f3faf780b004d0c9c052f Mon Sep 17 00:00:00 2001 From: Luis Altenkort Date: Tue, 30 Aug 2022 18:13:23 +0200 Subject: [PATCH 11/14] use more clever way to get rid of "__host__" and "__device__" --- src/applications/main_CheckConf.cpp | 2 +- src/base/LatticeContainer.h | 14 +- src/base/LatticeDimension.h | 48 +-- src/base/communication/HaloLoop.h | 4 +- .../communication/calcGSiteHalo_dynamic.h | 20 +- src/base/communication/siteComm.h | 4 +- src/base/gutils.h | 8 +- src/base/indexer/BulkIndexer.h | 258 +++++++-------- src/base/indexer/HaloIndexer.h | 152 ++++----- src/base/math/correlators.h | 58 ++-- src/base/math/floatComparison.h | 4 +- src/base/math/gaugeAccessor.h | 28 +- src/base/math/gaugeConstructor.h | 52 +-- src/base/math/gcomplex.h | 160 ++++----- src/base/math/generalAccessor.h | 6 +- src/base/math/grnd.cpp | 2 +- src/base/math/grnd.h | 16 +- src/base/math/gsu2.h | 62 ++-- src/base/math/gsu3.h | 308 +++++++++--------- src/base/math/gvect3.h | 178 +++++----- src/base/math/gvect3array.h | 22 +- src/base/math/matrix4x4.h | 22 +- src/base/math/operators.h | 26 +- src/base/math/simpleArray.h | 32 +- src/base/math/su3Exp.h | 4 +- src/base/memoryManagement.h | 4 +- src/base/runFunctors.h | 52 +-- src/define.h | 11 +- src/explicit_instantiation_macros.h | 2 +- src/gauge/GaugeAction.cpp | 10 +- src/gauge/GaugeAction.h | 10 +- src/gauge/constructs/PlaqConstructs.h | 8 +- src/gauge/constructs/derivative3link.h | 2 +- src/gauge/constructs/derivative5link.h | 42 +-- src/gauge/constructs/derivative7link.h | 2 +- src/gauge/constructs/derivativeLepagelink.h | 2 +- .../derivativeProjectU3Constructs.h | 2 +- src/gauge/constructs/fat7LinkConstructs.h | 10 +- src/gauge/constructs/gsvd.h | 4 +- src/gauge/constructs/hisqForceConstructs.h | 36 +- 
src/gauge/constructs/linkLepageConstructs.h | 4 +- src/gauge/constructs/linkStaple3Constructs.h | 4 +- src/gauge/constructs/linkStaple5Constructs.h | 2 +- src/gauge/constructs/linkStaple7Constructs.h | 2 +- src/gauge/constructs/naikConstructs.h | 2 +- .../constructs/naikDerivativeConstructs.h | 2 +- src/gauge/constructs/projectU3Constructs.h | 2 +- src/gauge/gaugeActionDeriv.h | 8 +- src/gauge/gauge_kernels.cpp | 18 +- src/gauge/gaugefield.h | 2 +- src/gauge/gaugefield_device.cpp | 10 +- src/modules/HISQ/staggeredPhases.h | 6 +- src/modules/observables/FieldStrengthTensor.h | 8 +- src/spinor/spinorfield.h | 12 +- 54 files changed, 882 insertions(+), 887 deletions(-) diff --git a/src/applications/main_CheckConf.cpp b/src/applications/main_CheckConf.cpp index 8c2655ba..246cada4 100644 --- a/src/applications/main_CheckConf.cpp +++ b/src/applications/main_CheckConf.cpp @@ -13,7 +13,7 @@ struct do_check_unitarity { explicit do_check_unitarity(Gaugefield &gauge) : gAcc(gauge.getAccessor()) {}; gaugeAccessor gAcc; - HOST_DEVICE floatT operator()(gSite site){ + __host__ __device__ floatT operator()(gSite site){ typedef GIndexer GInd; floatT ret=0.0; for (size_t mu = 0; mu < 4; ++mu) diff --git a/src/base/LatticeContainer.h b/src/base/LatticeContainer.h index 013b9904..0c953eed 100644 --- a/src/base/LatticeContainer.h +++ b/src/base/LatticeContainer.h @@ -47,30 +47,30 @@ class LatticeContainerAccessor : public MemoryAccessor { /// Set values. 
template - HOST_DEVICE inline void setElement(const size_t isite, const floatT value) { + __host__ __device__ inline void setElement(const size_t isite, const floatT value) { auto *arr = reinterpret_cast(Array); arr[isite] = value; } template - HOST_DEVICE inline void setElement(const gSite& site, const floatT value) { + __host__ __device__ inline void setElement(const gSite& site, const floatT value) { setValue(site.isite, value); } template - HOST_DEVICE inline void setElement(const gSiteStack& site, const floatT value) { + __host__ __device__ inline void setElement(const gSiteStack& site, const floatT value) { setValue(site.isiteStack, value); } /// Get values. template - HOST_DEVICE floatT getElement(const gSite& site) { + __host__ __device__ floatT getElement(const gSite& site) { return getElement(site.isite); } template - HOST_DEVICE floatT getElement(const gSiteStack& site) { + __host__ __device__ floatT getElement(const gSiteStack& site) { return getElement(site.isiteStack); } template - HOST_DEVICE inline floatT getElement(const size_t isite) { + __host__ __device__ inline floatT getElement(const size_t isite) { auto *arr = reinterpret_cast(Array); return arr[isite]; } @@ -394,7 +394,7 @@ void LatticeContainer::iterateOverBulkStacked(Functor op) { template struct WriteAtTimeSlices { - inline HOST_DEVICE size_t operator()(const gSite &site) { + inline __host__ __device__ size_t operator()(const gSite &site) { return GIndexer::siteTimeOrdered(site); } }; diff --git a/src/base/LatticeDimension.h b/src/base/LatticeDimension.h index b5148149..92d7b6f3 100644 --- a/src/base/LatticeDimension.h +++ b/src/base/LatticeDimension.h @@ -20,14 +20,14 @@ private : public : //! Copy constructor - SQCD_HOST LatticeDimensions(const LatticeDimensions &lhs) { + __host__ LatticeDimensions(const LatticeDimensions &lhs) { for (int i = 0; i < 4; i++) c[i] = lhs.c[i]; } LatticeDimensions& operator=(const LatticeDimensions& a) = default; //! 
Default constructor, initializes to (0,0,0,0) - SQCD_HOST LatticeDimensions() { + __host__ LatticeDimensions() { c[0] = 0; c[1] = 0; c[2] = 0; @@ -35,7 +35,7 @@ public : } //! Construct from (x,y,z,t) - SQCD_HOST LatticeDimensions(const int x, const int y, const int z, const int t) { + __host__ LatticeDimensions(const int x, const int y, const int z, const int t) { c[0] = x; c[1] = y; c[2] = z; @@ -43,39 +43,39 @@ public : } //! Construct from int* (also works with Parameter) - SQCD_HOST LatticeDimensions(const int *dim) { + __host__ LatticeDimensions(const int *dim) { for (int i = 0; i < 4; i++) c[i] = dim[i]; } //! Cast to int* (for usage in c-style MPI functions) - SQCD_HOST operator int *() { return c; } + __host__ operator int *() { return c; } //! same with const - SQCD_HOST operator const int *() const { return c; }; + __host__ operator const int *() const { return c; }; //! [] operator for member access - SQCD_HOST int &operator[](int mu) { return c[mu]; }; + __host__ int &operator[](int mu) { return c[mu]; }; //! const [] operator for r/o member access - SQCD_HOST const int &operator[](int mu) const { return c[mu]; }; + __host__ const int &operator[](int mu) const { return c[mu]; }; //! Component-wise multiplication, (x1*x2, y1*y2, z1*z2, t1*t2) - SQCD_HOST LatticeDimensions operator*(const LatticeDimensions lhs) const { + __host__ LatticeDimensions operator*(const LatticeDimensions lhs) const { LatticeDimensions ret; for (int i = 0; i < 4; i++) ret.c[i] = c[i] * lhs.c[i]; return ret; } //! Component-wise division, (x1/x2, y1/y2, z1/z2, t1/t2) - SQCD_HOST LatticeDimensions operator/(const LatticeDimensions lhs) const { + __host__ LatticeDimensions operator/(const LatticeDimensions lhs) const { LatticeDimensions ret; for (int i = 0; i < 4; i++) ret.c[i] = c[i] / lhs.c[i]; return ret; } //! modulo operation that returns coordinates within 0<=x= 4)) throw std::runtime_error(stdLogger.fatal("Wrong mu in LatticeDimensions")); c[mu] += ((plus) ? 
(1) : (-1)); } //! Formatted (debug) output - SQCD_HOST friend std::ostream &operator<<(std::ostream &str, + __host__ friend std::ostream &operator<<(std::ostream &str, const LatticeDimensions &in) { str << "( "; for (int i = 0; i < 4; i++) str << in.c[i] << " "; @@ -125,13 +125,13 @@ public : //! Return all four entries multiplied - SQCD_HOST long mult() const { + __host__ long mult() const { long res = 1; for (int i = 0; i < 4; i++) res *= (long) c[i]; return res; } - SQCD_HOST long summed() const { + __host__ long summed() const { long res = 0; for (int i = 0; i < 4; i++) res += (long) c[i]; return res; @@ -139,11 +139,11 @@ public : //! Return if x,y,z,t are 0<=x= c[i])) return false; return true; @@ -151,13 +151,13 @@ public : //! Return an offset matching given coordinates. With input x,y,z,t //! this returns x + y*LX + z*LX*LY + t*LX*LY*LZ - SQCD_HOST size_t offset(const LatticeDimensions &in) const { + __host__ size_t offset(const LatticeDimensions &in) const { size_t ret = in[0] + c[0] * in[1] + c[0] * c[1] * in[2] + c[0] * c[1] * c[2] * in[3]; return ret; } //! Return the lowest entry - SQCD_HOST int lowest_value() const { + __host__ int lowest_value() const { int res = c[1]; for (int i = 0; i < 4; i++) if (c[i] < res)res = c[i]; @@ -165,7 +165,7 @@ public : } //! 
Return the lowest entry - SQCD_HOST int lowest_spatial_value() const { + __host__ int lowest_spatial_value() const { int res = c[1]; for (int i = 0; i < 3; i++) if (c[i] < res)res = c[i]; diff --git a/src/base/communication/HaloLoop.h b/src/base/communication/HaloLoop.h index 3bd86e9a..49ef18bc 100644 --- a/src/base/communication/HaloLoop.h +++ b/src/base/communication/HaloLoop.h @@ -32,7 +32,7 @@ struct ExtractInnerHaloSeg { ExtractInnerHaloSeg(Accessor acc, Accessor hal_acc) : _acc(acc), _hal_acc(hal_acc) {} - inline HOST_DEVICE void operator()(HaloSite site) { + inline __host__ __device__ void operator()(HaloSite site) { for (size_t mu = 0; mu < ElemCount; mu++) { size_t index = _acc.template getIndexComm(site.LatticeIndex, mu); @@ -148,7 +148,7 @@ struct InjectOuterHaloSeg { _acc(acc), _hal_acc(hal_acc) { } - inline HOST_DEVICE void operator()(HaloSite site) { + inline __host__ __device__ void operator()(HaloSite site) { for (size_t mu = 0; mu < ElemCount; mu++) { size_t index = _acc.template getIndexComm(site.LatticeIndex, mu); diff --git a/src/base/communication/calcGSiteHalo_dynamic.h b/src/base/communication/calcGSiteHalo_dynamic.h index 9b62e371..2d39053c 100644 --- a/src/base/communication/calcGSiteHalo_dynamic.h +++ b/src/base/communication/calcGSiteHalo_dynamic.h @@ -27,7 +27,7 @@ struct CalcOuterHaloIndexComm { typedef HaloIndexer HInd; typedef GIndexer GInd; - inline HOST_DEVICE HaloSite + inline __host__ __device__ HaloSite operator()(const dim3 &blockDim, const uint3 &blockIdx, const uint3 &threadIdx) { HaloSite site; @@ -46,7 +46,7 @@ struct CalcInnerHaloIndexComm { typedef HaloIndexer HInd; typedef GIndexer GInd; - inline HOST_DEVICE HaloSite + inline __host__ __device__ HaloSite operator()(const dim3 &blockDim, const uint3 &blockIdx, const uint3 &threadIdx) { HaloSite site; @@ -69,7 +69,7 @@ struct CalcOuterHaloSegCoord{ CalcOuterHaloSegCoord(HaloSegment hseg, short leftRight) : hseg(hseg), leftRight(leftRight){} - inline HOST_DEVICE sitexyzt + 
inline __host__ __device__ sitexyzt operator()(size_t LocHalIndex) { sitexyzt coord(0, 0, 0, 0); @@ -102,7 +102,7 @@ struct CalcOuterHaloSegIndexComm{ CalcOuterHaloSegIndexComm(HaloSegment hseg, short leftRight) : calcSegCoord(hseg,leftRight){} - inline HOST_DEVICE HaloSite + inline __host__ __device__ HaloSite operator()(const dim3 &blockDim, const uint3 &blockIdx, const uint3 &threadIdx) { HaloSite site; @@ -125,7 +125,7 @@ struct CalcInnerSegCoord{ CalcInnerSegCoord(HaloSegment hseg, short leftRight) : hseg(hseg), leftRight(leftRight){} - inline HOST_DEVICE sitexyzt + inline __host__ __device__ sitexyzt operator()(size_t LocHalIndex) { sitexyzt coord(0, 0, 0, 0); @@ -160,7 +160,7 @@ struct CalcInnerHaloSegCoord{ CalcInnerHaloSegCoord(HaloSegment hseg, short leftRight) : hseg(hseg), leftRight(leftRight){} - inline HOST_DEVICE sitexyzt + inline __host__ __device__ sitexyzt operator()(size_t LocHalIndex) { sitexyzt coord(0, 0, 0, 0); @@ -192,7 +192,7 @@ struct CalcInnerHaloSegIndexComm{ CalcInnerHaloSegIndexComm(HaloSegment hseg, short leftRight) : calcSegCoord(hseg,leftRight){} - inline HOST_DEVICE HaloSite + inline __host__ __device__ HaloSite operator()(const dim3 &blockDim, const uint3 &blockIdx, const uint3 &threadIdx) { HaloSite site; @@ -214,7 +214,7 @@ struct CalcGSiteHaloSeg { CalcGSiteHaloSeg(CalcIndexOp calcIndexOp, HaloSegment hseg, short leftRight) : calcIndexOp(calcIndexOp), calcSegCoord(hseg,leftRight) { } - inline HOST_DEVICE auto + inline __host__ __device__ auto operator()(size_t HaloIndex, size_t mu) { sitexyzt coord = calcSegCoord(HaloIndex); @@ -234,7 +234,7 @@ struct CalcGSiteInnerHalo { CalcGSiteInnerHalo(CalcIndexOp calcIndexOp) : calcIndexOp(calcIndexOp) { } - inline HOST_DEVICE auto + inline __host__ __device__ auto operator()(size_t HaloIndex, size_t mu) { sitexyzt coord = HInd::getInnerCoord(HaloIndex); auto site = calcIndexOp(GInd::getSite(coord.x, coord.y, coord.z, coord.t), mu); @@ -252,7 +252,7 @@ struct CalcGSiteCenter { 
CalcGSiteCenter(CalcIndexOp calcIndexOp) : calcIndexOp(calcIndexOp) { } - inline HOST_DEVICE auto + inline __host__ __device__ auto operator()(size_t HaloIndex, size_t mu) { sitexyzt coord = HInd::getCenterCoord(HaloIndex); auto site = calcIndexOp(GInd::getSite(coord.x, coord.y, coord.z, coord.t), mu); diff --git a/src/base/communication/siteComm.h b/src/base/communication/siteComm.h index 7b4a3b48..7b028749 100644 --- a/src/base/communication/siteComm.h +++ b/src/base/communication/siteComm.h @@ -310,7 +310,7 @@ struct ExtractInnerHalo { } } - inline HOST_DEVICE void operator()(HaloSite site) { + inline __host__ __device__ void operator()(HaloSite site) { Accessor _hal_acc(pointer[site.HalNumber], size[site.HalNumber]); @@ -354,7 +354,7 @@ struct InjectOuterHalo { } } - inline HOST_DEVICE void operator()(HaloSite site) { + inline __host__ __device__ void operator()(HaloSite site) { Accessor _hal_acc(pointer[site.HalNumber], size[site.HalNumber]); for (size_t mu = 0; mu < ElemCount; mu++) { diff --git a/src/base/gutils.h b/src/base/gutils.h index 7102b9d0..28385acf 100644 --- a/src/base/gutils.h +++ b/src/base/gutils.h @@ -16,18 +16,18 @@ * Utility function to calculate quotient and remainder of * nominator / denominator. 
*/ -HOST_DEVICE void inline divmod(int nominator, int denominator, +__host__ __device__ void inline divmod(int nominator, int denominator, int &quotient, int &remainder) { quotient = nominator / denominator; remainder = nominator - (quotient * denominator); } -HOST_DEVICE void inline divmod(size_t nominator, size_t denominator, +__host__ __device__ void inline divmod(size_t nominator, size_t denominator, size_t &quotient, size_t &remainder) { quotient = nominator / denominator; remainder = nominator - (quotient * denominator); } -SQCD_HOST void inline compute_dim3(dim3 &blockDim, dim3 &gridDim, +__host__ void inline compute_dim3(dim3 &blockDim, dim3 &gridDim, const size_t elems, const size_t blockSize) { blockDim = blockSize; gridDim = static_cast(ceilf(static_cast(elems) / static_cast(blockDim.x))); @@ -54,7 +54,7 @@ class GpuError { /** * Utility method for speedy testing of whether a number is odd */ -HOST_DEVICE inline bool isOdd(int cand) { return (cand & 0x1); } +__host__ __device__ inline bool isOdd(int cand) { return (cand & 0x1); } #endif /* UTIL_H */ diff --git a/src/base/indexer/BulkIndexer.h b/src/base/indexer/BulkIndexer.h index ec4105d3..85c7c52a 100644 --- a/src/base/indexer/BulkIndexer.h +++ b/src/base/indexer/BulkIndexer.h @@ -30,8 +30,8 @@ struct sitexyzt { int y; int z; int t; - HOST_DEVICE sitexyzt(int x, int y, int z, int t) : x(x), y(y), z(z), t(t) {}; - HOST_DEVICE inline int& operator[](const int i) { + __host__ __device__ sitexyzt(int x, int y, int z, int t) : x(x), y(y), z(z), t(t) {}; + __host__ __device__ inline int& operator[](const int i) { if(i == 0) return x; if(i == 1) return y; if(i == 2) return z; @@ -58,20 +58,20 @@ struct gSite { sitexyzt coord, coordFull; // These constructors should only be called from GIndexer.
- HOST_DEVICE inline gSite() : isite(0), isiteFull(0), coord(0, 0, 0, 0), coordFull(0, 0, 0, 0) {} + __host__ __device__ inline gSite() : isite(0), isiteFull(0), coord(0, 0, 0, 0), coordFull(0, 0, 0, 0) {} - HOST_DEVICE inline gSite(size_t isite, size_t isiteFull, sitexyzt coord, sitexyzt coordFull) : + __host__ __device__ inline gSite(size_t isite, size_t isiteFull, sitexyzt coord, sitexyzt coordFull) : isite(isite), isiteFull(isiteFull), coord(coord), coordFull(coordFull) {}; - SQCD_HOST friend inline std::ostream &operator << (std::ostream &s, const gSite &site) { + __host__ friend inline std::ostream &operator << (std::ostream &s, const gSite &site) { s << "gSite: coord: " << site.coord.x << " " << site.coord.y << " " << site.coord.z << " " << site.coord.t << " " << "coordFull: " << site.coordFull.x << " " << site.coordFull.y << " " << site.coordFull.z << " " << site.coordFull.t << " " << "isite: " << site.isite << " isiteFull: " << site.isiteFull; return s; } - SQCD_HOST inline std::string getStr() { + __host__ inline std::string getStr() { std::ostringstream s; s << "gSite: coord: " << coord.x << " " << coord.y << " " << coord.z << " " << coord.t << " " << "coordFull: " << coordFull.x << " " << coordFull.y << " " << coordFull.z << " " << coordFull.t << " " @@ -85,18 +85,18 @@ struct gSiteStack : public gSite { size_t isiteStackFull; size_t stack; - HOST_DEVICE gSiteStack() : gSite(), isiteStack(0), isiteStackFull(0), stack(0){} + __host__ __device__ gSiteStack() : gSite(), isiteStack(0), isiteStackFull(0), stack(0){} - HOST_DEVICE gSiteStack(size_t isite, size_t isiteFull, sitexyzt coord, + __host__ __device__ gSiteStack(size_t isite, size_t isiteFull, sitexyzt coord, sitexyzt coordFull, size_t isiteStack, size_t isiteStackFull, size_t stack) : gSite(isite, isiteFull, coord, coordFull), isiteStack(isiteStack), isiteStackFull(isiteStackFull), stack(stack){} - HOST_DEVICE gSiteStack(gSite site, size_t isiteStack, size_t isiteStackFull, size_t stack) : + 
__host__ __device__ gSiteStack(gSite site, size_t isiteStack, size_t isiteStackFull, size_t stack) : gSite(site), isiteStack(isiteStack), isiteStackFull(isiteStackFull), stack(stack){} gSiteStack(const gSite) = delete; - SQCD_HOST friend inline std::ostream &operator << (std::ostream &s, const gSiteStack &site) { + __host__ friend inline std::ostream &operator << (std::ostream &s, const gSiteStack &site) { s << "gSiteStack: coord: " << site.coord.x << " " << site.coord.y << " " << site.coord.z << " " << site.coord.t << " " << " coordFull: " << site.coordFull.x << " " << site.coordFull.y << " " << site.coordFull.z << " " << site.coordFull.t << " " << " isite: " << site.isite << " isiteFull: " << site.isiteFull << " stack: " << site.stack @@ -110,19 +110,19 @@ struct gSiteMu : public gSite { // Link direction. uint8_t mu; - HOST_DEVICE gSiteMu() : gSite(), indexMuFull(0), mu(0){} + __host__ __device__ gSiteMu() : gSite(), indexMuFull(0), mu(0){} - HOST_DEVICE gSiteMu(size_t isite, size_t isiteFull, sitexyzt coord, + __host__ __device__ gSiteMu(size_t isite, size_t isiteFull, sitexyzt coord, sitexyzt coordFull, size_t indexMuFull, uint8_t mu) : gSite(isite, isiteFull, coord, coordFull), indexMuFull(indexMuFull), mu(mu){} - HOST_DEVICE gSiteMu(gSite site, size_t indexMuFull, uint8_t mu) + __host__ __device__ gSiteMu(gSite site, size_t indexMuFull, uint8_t mu) : gSite(site), indexMuFull(indexMuFull), mu(mu) {} gSiteMu(const gSite) = delete; gSiteMu(const gSiteStack) = delete; - SQCD_HOST friend inline std::ostream &operator << (std::ostream &s, const gSiteMu &site) { + __host__ friend inline std::ostream &operator << (std::ostream &s, const gSiteMu &site) { s << "gSite: coord: " << site.coord.x << " " << site.coord.y << " " << site.coord.z << " " << site.coord.t << " " << "coordFull: " << site.coordFull.x << " " << site.coordFull.y << " " << site.coordFull.z << " " << site.coordFull.t << " " << "isite: " << site.isite @@ -134,14 +134,14 @@ struct gSiteMu : public gSite 
{ }; //! you can use these print functions for debugging, but in production code they are unused: -__attribute__((unused)) void HOST_DEVICE inline printGSite(const gSite& site) { +__attribute__((unused)) void __host__ __device__ inline printGSite(const gSite& site) { printf("Coord: %d %d %d %d, coordFull: %d %d %d %d, isite: %lu, isiteFull %lu\n", site.coord.x, site.coord.y, site.coord.z, site.coord.t, site.coordFull.x, site.coordFull.y, site.coordFull.z, site.coordFull.t, site.isite, site.isiteFull); } -__attribute__((unused)) void HOST_DEVICE inline printGSiteStack(const gSiteStack& site) { +__attribute__((unused)) void __host__ __device__ inline printGSiteStack(const gSiteStack& site) { printf("Coord: %d %d %d %d, coordFull: %d %d %d %d, isite: %lu, isiteFull %lu, stack: %lu, isiteStack: %lu, isiteStackFull %lu\n", site.coord.x, site.coord.y, site.coord.z, site.coord.t, site.coordFull.x, site.coordFull.y, site.coordFull.z, site.coordFull.t, @@ -151,7 +151,7 @@ __attribute__((unused)) void HOST_DEVICE inline printGSiteStack(const gSiteStac site.isiteStack, site.isiteStackFull); } -__attribute__((unused)) void HOST_DEVICE inline printGSiteStack(const gSiteMu& site){ +__attribute__((unused)) void __host__ __device__ inline printGSiteStack(const gSiteMu& site){ printf("Coord: %d %d %d %d, coordFull: %d %d %d %d, isite: %lu, isiteFull %lu, mu: %d, indexMu_Full: %lu\n", site.coord.x, site.coord.y, site.coord.z, site.coord.t, site.coordFull.x, site.coordFull.y, site.coordFull.z, site.coordFull.t, @@ -182,7 +182,7 @@ struct LatticeData { LatticeData() {} - HOST_DEVICE LatticeData(size_t _lx, size_t _ly, size_t _lz, size_t _lt, size_t _HaloDepth, unsigned int _Nodes[4], + __host__ __device__ LatticeData(size_t _lx, size_t _ly, size_t _lz, size_t _lt, size_t _HaloDepth, unsigned int _Nodes[4], size_t _globX, size_t _globY, size_t _globZ, size_t _globT, size_t _gPosX, size_t _gPosY, size_t _gPosZ, size_t _gPosT) : @@ -225,7 +225,7 @@ struct LatticeData { gPosZ(_gPosZ), 
gPosT(_gPosT) {} - HOST_DEVICE sitexyzt globalPos(sitexyzt n) { + __host__ __device__ sitexyzt globalPos(sitexyzt n) { sitexyzt coord = sitexyzt(gPosX + n.x,gPosY + n.y,gPosZ + n.z,gPosT + n.t); @@ -237,7 +237,7 @@ struct LatticeData { return coord; } - HOST_DEVICE bool isLocal(sitexyzt globalsite){ + __host__ __device__ bool isLocal(sitexyzt globalsite){ //! make sure globalsite is valid, i.e. not negative or greater than lattice extents! // consider lattice 20 20 20 20 with split 2 2 1 1 @@ -265,7 +265,7 @@ struct LatticeData { return false; } - SQCD_HOST LatticeDimensions globalPos(LatticeDimensions n) { + __host__ LatticeDimensions globalPos(LatticeDimensions n) { LatticeDimensions coord = LatticeDimensions(gPosX,gPosY,gPosZ,gPosT) + n; @@ -277,22 +277,22 @@ struct LatticeData { return coord; } - SQCD_HOST LatticeDimensions globalLattice() { + __host__ LatticeDimensions globalLattice() { return LatticeDimensions(globLX,globLY,globLZ,globLT); } - SQCD_HOST LatticeDimensions localLattice() { + __host__ LatticeDimensions localLattice() { return LatticeDimensions(lx,ly,lz,lt); } - HOST_DEVICE sitexyzt globalLatticeXYZT() { + __host__ __device__ sitexyzt globalLatticeXYZT() { return sitexyzt(globLX,globLY,globLZ,globLT); } }; -extern DEVICE CONSTANT struct LatticeData globLatDataGPU[MAXHALO + 1]; +extern __device__ __constant__ struct LatticeData globLatDataGPU[MAXHALO + 1]; extern struct LatticeData globLatDataCPU[MAXHALO + 1]; /// --------------------------------------------------------------------------------------------- INDEXER INITIALIZATION @@ -313,8 +313,8 @@ void initIndexer(const size_t HaloDepth, const LatticeParameters &param, Communi template class GIndexer { public: - HOST_DEVICE GIndexer() = default; - HOST_DEVICE inline static LatticeData getLatData() { + __host__ __device__ GIndexer() = default; + __host__ __device__ inline static LatticeData getLatData() { #ifdef __GPU_ARCH__ return globLatDataGPU[HaloDepth]; @@ -325,7 +325,7 @@ class GIndexer { ///
---------------------------------------------------------------------------------------------------- getSite* /// BULK (NO HALOS) - HOST_DEVICE inline static gSite getSite(size_t isite) { + __host__ __device__ inline static gSite getSite(size_t isite) { sitexyzt coord(0, 0, 0, 0); sitexyzt coordFull(0, 0, 0, 0); size_t isiteFull = 0; @@ -344,10 +344,10 @@ class GIndexer { } return gSite(isite, isiteFull, coord, coordFull); } - HOST_DEVICE inline static gSite getSite(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx) { + __host__ __device__ inline static gSite getSite(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx) { return getSite(_blockDim.x * _blockIdx.x + _threadIdx.x); } - HOST_DEVICE inline static gSite getSite(int x, int y, int z, int t) { + __host__ __device__ inline static gSite getSite(int x, int y, int z, int t) { sitexyzt coord = sitexyzt(x, y, z, t); sitexyzt coordFull = coordToFullCoord(coord); size_t isite = 0; @@ -362,7 +362,7 @@ class GIndexer { } return gSite(isite, isiteFull, coord, coordFull); } - HOST_DEVICE inline static gSite getSite(sitexyzt coord) { + __host__ __device__ inline static gSite getSite(sitexyzt coord) { return getSite(coord.x,coord.y,coord.z,coord.t); } @@ -370,7 +370,7 @@ class GIndexer { happen whenever you call a kernel running over spacelike indices only. All coordinates will be of the form (x, y, z, 0). The indices isite and isiteFull will by bounded by their respective 3-volumes. 
The indexing needs to change, because there are fewer sites than with the full bulk.*/ - HOST_DEVICE inline static gSite getSiteSpatial(size_t isite) { + __host__ __device__ inline static gSite getSiteSpatial(size_t isite) { sitexyzt coord(0, 0, 0, 0); sitexyzt coordFull(0, 0, 0, 0); size_t isiteFull = 0; @@ -390,10 +390,10 @@ class GIndexer { return gSite(isite, isiteFull, coord, coordFull); } - HOST_DEVICE inline static gSite getSiteSpatial(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx) { + __host__ __device__ inline static gSite getSiteSpatial(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx) { return getSiteSpatial(_blockDim.x * _blockIdx.x + _threadIdx.x); } - HOST_DEVICE inline static gSite getSiteSpatial(int x, int y, int z, int t) { + __host__ __device__ inline static gSite getSiteSpatial(int x, int y, int z, int t) { // There is probably a way to allow t>0. My worry right now is that there is that if you allow // t>0, there is no longer a one-to-one correspondence between isite and coord. 
sitexyzt coord = sitexyzt(x, y, z, t); @@ -411,7 +411,7 @@ class GIndexer { } /// FULL (WITH HALOS) - HOST_DEVICE inline static gSite getSiteFull(size_t isiteFull) { + __host__ __device__ inline static gSite getSiteFull(size_t isiteFull) { sitexyzt coord(0, 0, 0, 0); sitexyzt coordFull(0, 0, 0, 0); size_t isite = 0; @@ -431,11 +431,11 @@ class GIndexer { return gSite(isite, isiteFull, coord, coordFull); } - HOST_DEVICE inline static gSite getSiteFull(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx) { + __host__ __device__ inline static gSite getSiteFull(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx) { return getSiteFull(_blockDim.x * _blockIdx.x + _threadIdx.x); } - HOST_DEVICE inline static gSite getSiteFull(int x, int y, int z, int t) { + __host__ __device__ inline static gSite getSiteFull(int x, int y, int z, int t) { sitexyzt coordFull = sitexyzt(x, y, z, t); sitexyzt coord = fullCoordToCoord(coordFull); size_t isite = 0; @@ -449,11 +449,11 @@ class GIndexer { } return gSite(isite, isiteFull, coord, coordFull); } - HOST_DEVICE inline static gSite getSiteFull(sitexyzt coordfull) { + __host__ __device__ inline static gSite getSiteFull(sitexyzt coordfull) { return getSiteFull(coordfull.x,coordfull.y,coordfull.z,coordfull.t); } - HOST_DEVICE inline static gSite getSiteSpatialFull(size_t isiteFull) { + __host__ __device__ inline static gSite getSiteSpatialFull(size_t isiteFull) { sitexyzt coord(0, 0, 0, 0); sitexyzt coordFull(0, 0, 0, 0); size_t isite = 0; @@ -473,10 +473,10 @@ class GIndexer { return gSite(isite, isiteFull, coord, coordFull); } - HOST_DEVICE inline static gSite getSiteSpatialFull(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx) { + __host__ __device__ inline static gSite getSiteSpatialFull(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx) { return getSiteSpatialFull(_blockDim.x * _blockIdx.x + _threadIdx.x); } - HOST_DEVICE inline static gSite 
getSiteSpatialFull(int x, int y, int z, int t) { + __host__ __device__ inline static gSite getSiteSpatialFull(int x, int y, int z, int t) { sitexyzt coordFull = sitexyzt(x, y, z, t); sitexyzt coord = fullCoordToCoord(coordFull); size_t isite = 0; @@ -495,62 +495,62 @@ class GIndexer { /// BULK (NO HALOS) //! two helper functions for getSiteMu* - HOST_DEVICE inline static size_t coordMuToIndexMu_Full(const int x, const int y, const int z, const int t, const int mu) { + __host__ __device__ inline static size_t coordMuToIndexMu_Full(const int x, const int y, const int z, const int t, const int mu) { return (((x + y*getLatData().vol1Full + z*getLatData().vol2Full + t*getLatData().vol3Full) >> 0x1) // integer division by two +getLatData().sizehFull*((x + y + z + t) & 0x1) // 0 if x+y+z+t is even, 1 if it is odd + mu*getLatData().vol4Full); } - HOST_DEVICE inline static size_t indexMu_Full(const gSite site, const int mu) { + __host__ __device__ inline static size_t indexMu_Full(const gSite site, const int mu) { return coordMuToIndexMu_Full(site.coordFull.x, site.coordFull.y, site.coordFull.z, site.coordFull.t, mu); } - HOST_DEVICE inline static gSiteMu getSiteMu(size_t isite, size_t mu) { + __host__ __device__ inline static gSiteMu getSiteMu(size_t isite, size_t mu) { gSite site(getSite(isite)); size_t indexmufull = indexMu_Full(site, mu); return gSiteMu(site, indexmufull, mu); } - HOST_DEVICE inline static gSiteMu getSiteMu(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx, size_t mu){ + __host__ __device__ inline static gSiteMu getSiteMu(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx, size_t mu){ return getSiteMu(_blockDim.x * _blockIdx.x + _threadIdx.x, mu); } - HOST_DEVICE inline static gSiteMu getSiteMu(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx){ + __host__ __device__ inline static gSiteMu getSiteMu(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx){ //! 
It gets the mu index from the y direction of the block. return getSiteMu(_blockDim.x * _blockIdx.x + _threadIdx.x, _threadIdx.y); } - HOST_DEVICE inline static gSiteMu getSiteMu(gSite site, size_t mu) { + __host__ __device__ inline static gSiteMu getSiteMu(gSite site, size_t mu) { size_t indexmufull = indexMu_Full(site, mu); return gSiteMu(site, indexmufull, mu); } - HOST_DEVICE inline static gSiteMu getSiteMu(int x, int y, int z, int t, size_t mu){ + __host__ __device__ inline static gSiteMu getSiteMu(int x, int y, int z, int t, size_t mu){ return getSiteMu(getSite(x, y, z, t), mu); } - HOST_DEVICE inline static gSiteMu getSiteSpatialMu(size_t isite, size_t mu) { + __host__ __device__ inline static gSiteMu getSiteSpatialMu(size_t isite, size_t mu) { gSite site(getSiteSpatial(isite)); size_t indexmufull = indexMu_Full(site, mu); return gSiteMu(site, indexmufull, mu); } /// FULL (WITH HALOS) - HOST_DEVICE inline static gSiteMu getSiteMuFull(size_t isiteFull, size_t mu) { + __host__ __device__ inline static gSiteMu getSiteMuFull(size_t isiteFull, size_t mu) { gSite site(getSiteFull(isiteFull)); size_t indexmufull = indexMu_Full(site, mu); return gSiteMu(site, indexmufull, mu); } - HOST_DEVICE inline static gSiteMu getSiteMuFull(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx, size_t mu){ + __host__ __device__ inline static gSiteMu getSiteMuFull(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx, size_t mu){ return getSiteMuFull(_blockDim.x * _blockIdx.x + _threadIdx.x, mu); } - HOST_DEVICE inline static gSiteMu getSiteMuFull(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx){ + __host__ __device__ inline static gSiteMu getSiteMuFull(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx){ //!get the mu index from the y direction of the block. 
return getSiteMuFull(_blockDim.x * _blockIdx.x + _threadIdx.x, _threadIdx.y); } - HOST_DEVICE inline static gSiteMu getSiteMuFull(int x, int y, int z, int t, size_t mu){ + __host__ __device__ inline static gSiteMu getSiteMuFull(int x, int y, int z, int t, size_t mu){ return getSiteMu(getSiteFull(x, y, z, t), mu); } /// --------------------------------------------------------------------------------------------------- getSiteStack /// BULK (NO HALOS) - HOST_DEVICE inline static gSiteStack getSiteStack(const gSite& site, const size_t stack){ + __host__ __device__ inline static gSiteStack getSiteStack(const gSite& site, const size_t stack){ size_t isiteStack; size_t isiteStackFull; if (LatLayout == All) { @@ -563,24 +563,24 @@ class GIndexer { gSiteStack ret(site, isiteStack, isiteStackFull, stack); return ret; } - HOST_DEVICE inline static gSiteStack getSiteStack(const size_t isite, const size_t stack){ + __host__ __device__ inline static gSiteStack getSiteStack(const size_t isite, const size_t stack){ return getSiteStack(getSite(isite), stack); } - HOST_DEVICE inline static gSiteStack getSiteStack(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx){ + __host__ __device__ inline static gSiteStack getSiteStack(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx){ return getSiteStack(_blockDim.x * _blockIdx.x + _threadIdx.x, _threadIdx.y); } - HOST_DEVICE inline static gSiteStack getSiteStack(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx, const size_t stack){ + __host__ __device__ inline static gSiteStack getSiteStack(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx, const size_t stack){ return getSiteStack(_blockDim.x * _blockIdx.x + _threadIdx.x, stack); } - HOST_DEVICE inline static gSiteStack getSiteStack(int x, int y, int z, int t, int stack) { + __host__ __device__ inline static gSiteStack getSiteStack(int x, int y, int z, int t, int stack) { return getSiteStack(getSite(x, y, z, 
t), stack); } - HOST_DEVICE inline static gSiteStack getSiteStack(sitexyzt coord, int stack) { + __host__ __device__ inline static gSiteStack getSiteStack(sitexyzt coord, int stack) { return getSiteStack(getSite(coord.x, coord.y, coord.z, coord.t), stack); } - HOST_DEVICE inline static gSiteStack getSiteStackOdd(const gSite& site, const size_t stack){ + __host__ __device__ inline static gSiteStack getSiteStackOdd(const gSite& site, const size_t stack){ size_t isiteStack; size_t isiteStackFull; if (LatLayout == All) { @@ -593,60 +593,60 @@ class GIndexer { gSiteStack ret(site, isiteStack, isiteStackFull, stack); return ret; } - HOST_DEVICE inline static gSiteStack getSiteStackOdd(const size_t isite, const size_t stack){ + __host__ __device__ inline static gSiteStack getSiteStackOdd(const size_t isite, const size_t stack){ return getSiteStackOdd(getSite(isite), stack); } - HOST_DEVICE inline static gSiteStack getSiteStackOdd(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx){ + __host__ __device__ inline static gSiteStack getSiteStackOdd(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx){ return getSiteStackOdd(_blockDim.x * _blockIdx.x + _threadIdx.x, _threadIdx.y); } - HOST_DEVICE inline static gSiteStack getSiteStackOdd(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx, const size_t stack){ + __host__ __device__ inline static gSiteStack getSiteStackOdd(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx, const size_t stack){ return getSiteStackOdd(_blockDim.x * _blockIdx.x + _threadIdx.x, stack); } /// FULL (WITH HALOS) - HOST_DEVICE inline static gSiteStack getSiteStackFull(const size_t isiteFull, const size_t stack){ + __host__ __device__ inline static gSiteStack getSiteStackFull(const size_t isiteFull, const size_t stack){ return getSiteStack(getSiteFull(isiteFull), stack); } - HOST_DEVICE inline static gSiteStack getSiteStackFull(const dim3& _blockDim, const uint3& _blockIdx, 
const uint3& _threadIdx){ + __host__ __device__ inline static gSiteStack getSiteStackFull(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx){ gSiteStack ret = getSiteStackFull(_blockDim.x * _blockIdx.x + _threadIdx.x, _threadIdx.y); return ret; } - HOST_DEVICE inline static gSiteStack getSiteStackFull(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx, const size_t stack){ + __host__ __device__ inline static gSiteStack getSiteStackFull(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx, const size_t stack){ gSiteStack ret = getSiteStackFull(_blockDim.x * _blockIdx.x + _threadIdx.x, stack); return ret; } - HOST_DEVICE inline static gSiteStack getSiteStackFull(int x, int y, int z, int t, int stack) { + __host__ __device__ inline static gSiteStack getSiteStackFull(int x, int y, int z, int t, int stack) { return getSiteStack(getSiteFull(x, y, z, t), stack); } /// ----------------------------------------------------------------------------------- CONVERT BETWEEN EVEN AND ODD - template HOST_DEVICE inline static gSite convertSite(const gSite& site){ + template __host__ __device__ inline static gSite convertSite(const gSite& site){ return GIndexer::getSite(site.coord.x, site.coord.y, site.coord.z, site.coord.t); } - template HOST_DEVICE inline static gSiteMu convertSite(const gSiteMu& site){ + template __host__ __device__ inline static gSiteMu convertSite(const gSiteMu& site){ return GIndexer::getSiteMu(site.coord.x, site.coord.y, site.coord.z, site.coord.t, site.mu); } - template HOST_DEVICE inline static gSiteStack convertSite(const gSiteStack& site){ + template __host__ __device__ inline static gSiteStack convertSite(const gSiteStack& site){ return GIndexer::getSiteStack(site.coord.x, site.coord.y, site.coord.z, site.coord.t, site.stack); } //! Given an Even/Odd gSite object, this returns an All gSite object. 
- HOST_DEVICE inline static gSite convertToAll(gSite& site) { + __host__ __device__ inline static gSite convertToAll(gSite& site) { size_t isite = site.isite + (LatLayout == Odd)*getLatData().sizeh; size_t isiteFull = site.isiteFull + (LatLayout == Odd)*getLatData().sizehFull; return gSite(isite, isiteFull, site.coord, site.coordFull); } /// ------------------------------------------------ CONVERT BETWEEN BULK SPACETIME COORDINATES AND FULL COORDINATES - HOST_DEVICE inline static sitexyzt coordToFullCoord(sitexyzt coord) { + __host__ __device__ inline static sitexyzt coordToFullCoord(sitexyzt coord) { coord.x += getLatData().HaloDepth[0]; coord.y += getLatData().HaloDepth[1]; coord.z += getLatData().HaloDepth[2]; coord.t += getLatData().HaloDepth[3]; return coord; } - HOST_DEVICE inline static sitexyzt fullCoordToCoord(sitexyzt fullCoord) { + __host__ __device__ inline static sitexyzt fullCoordToCoord(sitexyzt fullCoord) { fullCoord.x -= getLatData().HaloDepth[0]; fullCoord.y -= getLatData().HaloDepth[1]; fullCoord.z -= getLatData().HaloDepth[2]; @@ -654,7 +654,7 @@ class GIndexer { return fullCoord; } - HOST_DEVICE inline static sitexyzt globalCoordToLocalCoord(sitexyzt coord) { + __host__ __device__ inline static sitexyzt globalCoordToLocalCoord(sitexyzt coord) { coord.x -= getLatData().gPosX; coord.y -= getLatData().gPosY; coord.z -= getLatData().gPosZ; @@ -664,47 +664,47 @@ class GIndexer { /// -------------------------------------------------------------------- CONVERT SPACETIME COORDINATES TO DATA INDEX /// BULK (NO HALOS) - HOST_DEVICE inline static size_t coordToIndex_Bulk(const sitexyzt coord) { + __host__ __device__ inline static size_t coordToIndex_Bulk(const sitexyzt coord) { return (((coord.x + coord.y*getLatData().vol1 + coord.z*getLatData().vol2 + coord.t*getLatData().vol3) >> 0x1) // integer division by two +getLatData().sizeh * ((coord.x + coord.y + coord.z + coord.t) & 0x1)); // 0 if x+y+z+t is even, 1 if it is odd } - HOST_DEVICE inline static 
size_t coordToIndex_Bulk_eo(const sitexyzt coord) { + __host__ __device__ inline static size_t coordToIndex_Bulk_eo(const sitexyzt coord) { return ((coord.x + coord.y*getLatData().vol1 + coord.z*getLatData().vol2 + coord.t*getLatData().vol3) >> 0x1); } - HOST_DEVICE inline static size_t coordToIndex_SpatialBulk(const sitexyzt coord) { + __host__ __device__ inline static size_t coordToIndex_SpatialBulk(const sitexyzt coord) { return (((coord.x + coord.y*getLatData().vol1 + coord.z*getLatData().vol2) >> 0x1) + getLatData().vol3h*((coord.x + coord.y + coord.z) & 0x1)); } - HOST_DEVICE inline static size_t coordToIndex_SpatialBulk_eo(const sitexyzt coord) { + __host__ __device__ inline static size_t coordToIndex_SpatialBulk_eo(const sitexyzt coord) { return ((coord.x + coord.y*getLatData().vol1 + coord.z*getLatData().vol2) >> 0x1); } /// FULL (WITH HALOS) - HOST_DEVICE inline static size_t coordToIndex_Full(const sitexyzt coordFull) { + __host__ __device__ inline static size_t coordToIndex_Full(const sitexyzt coordFull) { return (((coordFull.x + coordFull.y*getLatData().vol1Full + coordFull.z*getLatData().vol2Full + coordFull.t*getLatData().vol3Full) >> 0x1) + getLatData().sizehFull*((coordFull.x + coordFull.y + coordFull.z + coordFull.t) & 0x1)); } - HOST_DEVICE inline static size_t coordToIndex_Full_eo(const sitexyzt coordFull) { + __host__ __device__ inline static size_t coordToIndex_Full_eo(const sitexyzt coordFull) { return ((coordFull.x + coordFull.y * getLatData().vol1Full + coordFull.z * getLatData().vol2Full + coordFull.t * getLatData().vol3Full) >> 0x1); } - HOST_DEVICE inline static size_t coordToIndex_SpatialFull(const sitexyzt coordFull) { + __host__ __device__ inline static size_t coordToIndex_SpatialFull(const sitexyzt coordFull) { return (((coordFull.x + coordFull.y*getLatData().vol1Full + coordFull.z*getLatData().vol2Full) >> 0x1) + getLatData().vol3hFull*((coordFull.x + coordFull.y + coordFull.z) & 0x1)); } - HOST_DEVICE inline static size_t 
coordToIndex_SpatialFull_eo(const sitexyzt coordFull) { + __host__ __device__ inline static size_t coordToIndex_SpatialFull_eo(const sitexyzt coordFull) { return ((coordFull.x + coordFull.y*getLatData().vol1Full + coordFull.z*getLatData().vol2Full) >> 0x1); } - SQCD_HOST inline static size_t localCoordToGlobalIndex(LatticeDimensions coord) { + __host__ inline static size_t localCoordToGlobalIndex(LatticeDimensions coord) { LatticeData lat = GIndexer::getLatData(); LatticeDimensions globCoord = lat.globalPos(coord); return (globCoord[0] + globCoord[1] * lat.globLX + globCoord[2] * lat.globLX * lat.globLY + @@ -713,7 +713,7 @@ class GIndexer { /// -------------------------------------------------------------------- CONVERT DATA INDEX TO SPACETIME COORDINATES /// BULK (NO HALOS) - HOST_DEVICE inline static sitexyzt indexToCoord(const size_t site) { + __host__ __device__ inline static sitexyzt indexToCoord(const size_t site) { int x, y, z, t; int par, normInd, tmp; @@ -751,7 +751,7 @@ class GIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt indexToCoord_eo(const size_t site, int par) { + __host__ __device__ inline static sitexyzt indexToCoord_eo(const size_t site, int par) { int x, y, z, t; int tmp; // double site @@ -769,7 +769,7 @@ class GIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt indexToCoord_Spatial(const size_t site) { + __host__ __device__ inline static sitexyzt indexToCoord_Spatial(const size_t site) { int x, y, z, t; int par, normInd, tmp; @@ -788,7 +788,7 @@ class GIndexer { return sitexyzt(x,y,z,t); } - HOST_DEVICE inline static sitexyzt indexToCoord_Spatial_eo(const size_t site, int par) { + __host__ __device__ inline static sitexyzt indexToCoord_Spatial_eo(const size_t site, int par) { int x, y, z, t; int tmp; size_t sited = site << 0x1; @@ -806,7 +806,7 @@ class GIndexer { return sitexyzt(x, y, z, t); } /// FULL (WITH HALOS) - HOST_DEVICE inline static sitexyzt indexToCoord_Full(const size_t 
siteFull) { + __host__ __device__ inline static sitexyzt indexToCoord_Full(const size_t siteFull) { int x, y, z, t; int par, normInd, tmp; @@ -826,7 +826,7 @@ class GIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt indexToCoord_SpatialFull(const size_t siteFull) { + __host__ __device__ inline static sitexyzt indexToCoord_SpatialFull(const size_t siteFull) { int x, y, z, t; int par, normInd, tmp; @@ -846,7 +846,7 @@ class GIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt indexToCoord_Full_eo(const size_t siteFull, int par) { + __host__ __device__ inline static sitexyzt indexToCoord_Full_eo(const size_t siteFull, int par) { int x, y, z, t; int tmp; @@ -863,7 +863,7 @@ class GIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt indexToCoord_SpatialFull_eo(const size_t siteFull, int par) { + __host__ __device__ inline static sitexyzt indexToCoord_SpatialFull_eo(const size_t siteFull, int par) { int x, y, z, t; int tmp; @@ -884,7 +884,7 @@ class GIndexer { //! This function is needed when one wants to have the sites time ordered. For example if one wants to reduce only //! values on each timeslice. - HOST_DEVICE inline static size_t siteTimeOrdered(const gSite &site) { + __host__ __device__ inline static size_t siteTimeOrdered(const gSite &site) { sitexyzt c = site.coord; return c.x + c.y*getLatData().vol1 + c.z*getLatData().vol2 + c.t*getLatData().vol3; } @@ -896,19 +896,19 @@ class GIndexer { //! time, this means you cannot pass these functions a dynamic argument. 
/// --------------------------------------------------------------------------------------- site_move: ONE DIRECTION - template HOST_DEVICE inline static gSite site_move(const gSite &s, const int mu) { + template __host__ __device__ inline static gSite site_move(const gSite &s, const int mu) { sitexyzt tmp = site_move(s.coordFull, mu); return getSiteFull(tmp.x, tmp.y, tmp.z, tmp.t); } - template HOST_DEVICE inline static gSiteMu site_move(const gSiteMu &s, const int mu) { + template __host__ __device__ inline static gSiteMu site_move(const gSiteMu &s, const int mu) { sitexyzt tmp = site_move(s.coordFull, mu); return getSiteMuFull(tmp.x, tmp.y, tmp.z, tmp.t, s.mu); } - template HOST_DEVICE inline static gSiteStack site_move(const gSiteStack &s, const int mu) { + template __host__ __device__ inline static gSiteStack site_move(const gSiteStack &s, const int mu) { sitexyzt tmp = site_move(s.coordFull, mu); return getSiteStackFull(tmp.x, tmp.y, tmp.z, tmp.t, s.stack); } - template HOST_DEVICE inline static sitexyzt site_move(sitexyzt s, const int mu) { + template __host__ __device__ inline static sitexyzt site_move(sitexyzt s, const int mu) { int x = s.x; int y = s.y; @@ -975,19 +975,19 @@ class GIndexer { } /// -------------------------------------------------------------------------------------- site_move: TWO DIRECTIONS - template HOST_DEVICE inline static gSite site_move(const gSite &s, const int mu, const int nu) { + template __host__ __device__ inline static gSite site_move(const gSite &s, const int mu, const int nu) { sitexyzt tmp = site_move(s.coordFull, mu, nu); return getSiteFull(tmp.x, tmp.y, tmp.z, tmp.t); } - template HOST_DEVICE inline static gSiteMu site_move(const gSiteMu &s, const int mu, const int nu) { + template __host__ __device__ inline static gSiteMu site_move(const gSiteMu &s, const int mu, const int nu) { sitexyzt tmp = site_move(s.coordFull, mu, nu); return getSiteMuFull(tmp.x, tmp.y, tmp.z, tmp.t, s.mu); } - template HOST_DEVICE inline static 
gSiteStack site_move(const gSiteStack &s, const int mu, const int nu) { + template __host__ __device__ inline static gSiteStack site_move(const gSiteStack &s, const int mu, const int nu) { sitexyzt tmp = site_move(s.coordFull, mu, nu); return getSiteStackFull(tmp.x, tmp.y, tmp.z, tmp.t, s.stack); } - template HOST_DEVICE inline static sitexyzt site_move(const sitexyzt &s, const int mu, const int nu) { + template __host__ __device__ inline static sitexyzt site_move(const sitexyzt &s, const int mu, const int nu) { int x = s.x; int y = s.y; int z = s.z; @@ -1110,22 +1110,22 @@ class GIndexer { /// ------------------------------------------------------------------------------------ site_move: THREE DIRECTIONS template - HOST_DEVICE inline static gSite site_move(const gSite &s, const int mu, const int nu, const int rho) { + __host__ __device__ inline static gSite site_move(const gSite &s, const int mu, const int nu, const int rho) { sitexyzt tmp = site_move(s.coordFull, mu, nu, rho); return getSiteFull(tmp.x, tmp.y, tmp.z, tmp.t); } template - HOST_DEVICE inline static gSiteMu site_move(const gSiteMu &s, const int mu, const int nu, const int rho) { + __host__ __device__ inline static gSiteMu site_move(const gSiteMu &s, const int mu, const int nu, const int rho) { sitexyzt tmp = site_move(s.coordFull, mu, nu, rho); return getSiteMuFull(tmp.x, tmp.y, tmp.z, tmp.t, s.mu); } template - HOST_DEVICE inline static gSiteStack site_move(const gSiteStack &s, const int mu, const int nu, const int rho) { + __host__ __device__ inline static gSiteStack site_move(const gSiteStack &s, const int mu, const int nu, const int rho) { sitexyzt tmp = site_move(s.coordFull, mu, nu, rho); return getSiteStackFull(tmp.x, tmp.y, tmp.z, tmp.t, s.stack); } template - HOST_DEVICE inline static sitexyzt site_move(const sitexyzt &s, const int mu, const int nu, const int rho) { + __host__ __device__ inline static sitexyzt site_move(const sitexyzt &s, const int mu, const int nu, const int rho) { int x = 
s.x; int y = s.y; int z = s.z; @@ -1304,22 +1304,22 @@ class GIndexer { /// ------------------------------------------------------------------------------------- site_move: FOUR DIRECTIONS template - HOST_DEVICE inline static gSite site_move(const gSite &s, const int mu, const int nu, const int rho, const int sig) { + __host__ __device__ inline static gSite site_move(const gSite &s, const int mu, const int nu, const int rho, const int sig) { sitexyzt tmp = site_move(s.coordFull, mu, nu, rho, sig); return getSiteFull(tmp.x, tmp.y, tmp.z, tmp.t); } template - HOST_DEVICE inline static gSiteMu site_move(const gSiteMu &s, const int mu, const int nu, const int rho, const int sig) { + __host__ __device__ inline static gSiteMu site_move(const gSiteMu &s, const int mu, const int nu, const int rho, const int sig) { sitexyzt tmp = site_move(s.coordFull, mu, nu, rho, sig); return getSiteMuFull(tmp.x, tmp.y, tmp.z, tmp.t, s.mu); } template - HOST_DEVICE inline static gSiteStack site_move(const gSiteStack &s, const int mu, const int nu, const int rho, const int sig) { + __host__ __device__ inline static gSiteStack site_move(const gSiteStack &s, const int mu, const int nu, const int rho, const int sig) { sitexyzt tmp = site_move(s.coordFull, mu, nu, rho, sig); return getSiteStackFull(tmp.x, tmp.y, tmp.z, tmp.t, s.stack); } template - HOST_DEVICE inline static sitexyzt site_move(const sitexyzt &s, const int mu, const int nu, const int rho, const int sig) { + __host__ __device__ inline static sitexyzt site_move(const sitexyzt &s, const int mu, const int nu, const int rho, const int sig) { int x = s.x; int y = s.y; int z = s.z; @@ -1553,55 +1553,55 @@ class GIndexer { } /// ------------------------------------------------------------------------------------------------ site_up and site_dn - template HOST_DEVICE inline static T site_up(const T &s, const int mu) { + template __host__ __device__ inline static T site_up(const T &s, const int mu) { return site_move<1>(s, mu); } - 
template HOST_DEVICE inline static T site_dn(const T &s, const int mu) { + template __host__ __device__ inline static T site_dn(const T &s, const int mu) { return site_move<-1>(s, mu); } - template HOST_DEVICE inline static T site_up_up(const T &s, const int mu, const int nu) { + template __host__ __device__ inline static T site_up_up(const T &s, const int mu, const int nu) { #ifdef __GPU_ARCH__ return site_move<1, 1>(s, mu, nu); #else return site_up(site_up(s, mu), nu); #endif } - template HOST_DEVICE inline static T site_up_dn(const T &s, const int mu, const int nu) { + template __host__ __device__ inline static T site_up_dn(const T &s, const int mu, const int nu) { #ifdef __GPU_ARCH__ return site_move<1, -1>(s, mu, nu); #else return site_dn(site_up(s, mu), nu); #endif } - template HOST_DEVICE inline static T site_dn_dn(const T &s, const int mu, const int nu) { + template __host__ __device__ inline static T site_dn_dn(const T &s, const int mu, const int nu) { #ifdef __GPU_ARCH__ return site_move<-1, -1>(s, mu, nu); #else return site_dn(site_dn(s, mu), nu); #endif } - template HOST_DEVICE inline static T site_up_up_up(const T &s, const int mu, const int nu, const int rho) { + template __host__ __device__ inline static T site_up_up_up(const T &s, const int mu, const int nu, const int rho) { #ifdef __GPU_ARCH__ return site_move<1, 1, 1>(s, mu, nu, rho); #else return site_up(site_up_up(s, mu, nu), rho); #endif } - template HOST_DEVICE inline static T site_up_up_dn(const T &s, const int mu, const int nu, const int rho) { + template __host__ __device__ inline static T site_up_up_dn(const T &s, const int mu, const int nu, const int rho) { #ifdef __GPU_ARCH__ return site_move<1, 1, -1>(s, mu, nu, rho); #else return site_dn(site_up_up(s, mu, nu), rho); #endif } - template HOST_DEVICE inline static T site_up_dn_dn(const T &s, const int mu, const int nu, const int rho) { + template __host__ __device__ inline static T site_up_dn_dn(const T &s, const int mu, const int nu, 
const int rho) { #ifdef __GPU_ARCH__ return site_move<1, -1, -1>(s, mu, nu, rho); #else return site_dn(site_up_dn(s, mu, nu), rho); #endif } - template HOST_DEVICE inline static T site_dn_dn_dn(const T &s, const int mu, const int nu, const int rho) { + template __host__ __device__ inline static T site_dn_dn_dn(const T &s, const int mu, const int nu, const int rho) { #ifdef __GPU_ARCH__ return site_move<-1, -1, -1>(s, mu, nu, rho); #else @@ -1609,70 +1609,70 @@ class GIndexer { #endif } //! The following are currently unused but can be commented in if needed: - template HOST_DEVICE inline static T site_up_up_up_up(const T &s, const int mu, const int nu, const int rho, const int sig) { + template __host__ __device__ inline static T site_up_up_up_up(const T &s, const int mu, const int nu, const int rho, const int sig) { #ifdef __GPU_ARCH__ return site_move<1, 1, 1, 1>(s, mu, nu, rho, sig); #else return site_up(site_up_up_up(s, mu, nu, rho), sig); #endif } - template HOST_DEVICE inline static T site_up_up_up_dn(const T &s, const int mu, const int nu, const int rho, const int sig) { + template __host__ __device__ inline static T site_up_up_up_dn(const T &s, const int mu, const int nu, const int rho, const int sig) { #ifdef __GPU_ARCH__ return site_move<1, 1, 1, -1>(s, mu, nu, rho, sig); #else return site_dn(site_up_up_up(s, mu, nu, rho), sig); #endif } - template HOST_DEVICE inline static T site_up_up_dn_dn(const T &s, const int mu, const int nu, const int rho, const int sig) { + template __host__ __device__ inline static T site_up_up_dn_dn(const T &s, const int mu, const int nu, const int rho, const int sig) { #ifdef __GPU_ARCH__ return site_move<1, 1, -1, -1>(s, mu, nu, rho, sig); #else return site_dn(site_up_up_dn(s, mu, nu, rho), sig); #endif } - template HOST_DEVICE inline static T site_up_dn_dn_dn(const T &s, const int mu, const int nu, const int rho, const int sig) { + template __host__ __device__ inline static T site_up_dn_dn_dn(const T &s, const int mu, const 
int nu, const int rho, const int sig) { #ifdef __GPU_ARCH__ return site_move<1, -1, -1, -1>(s, mu, nu, rho, sig); #else return site_dn(site_up_dn_dn(s, mu, nu, rho), sig); #endif } - template HOST_DEVICE inline static T site_dn_dn_dn_dn(const T &s, const int mu, const int nu, const int rho, const int sig) { + template __host__ __device__ inline static T site_dn_dn_dn_dn(const T &s, const int mu, const int nu, const int rho, const int sig) { #ifdef __GPU_ARCH__ return site_move<-1, -1, -1, -1>(s, mu, nu, rho, sig); #else return site_dn(site_dn_dn_dn(s, mu, nu, rho), sig); #endif } - template HOST_DEVICE inline static T site_2up_up(const T &s, const int mu, const int nu) { + template __host__ __device__ inline static T site_2up_up(const T &s, const int mu, const int nu) { #ifdef __GPU_ARCH__ return site_move<2, 1>(s, mu, nu); #else return site_up_up_up(s, mu, mu, nu); #endif } - template HOST_DEVICE inline static T site_2up_dn(const T &s, const int mu, const int nu) { + template __host__ __device__ inline static T site_2up_dn(const T &s, const int mu, const int nu) { #ifdef __GPU_ARCH__ return site_move<2, -1>(s, mu, nu); #else return site_up_up_dn(s, mu, mu, nu); #endif } - template HOST_DEVICE inline static T site_up_2dn(const T &s, const int mu, const int nu) { + template __host__ __device__ inline static T site_up_2dn(const T &s, const int mu, const int nu) { #ifdef __GPU_ARCH__ return site_move<1, -2>(s, mu, nu); #else return site_up_dn_dn(s, mu, mu, nu); #endif } - template HOST_DEVICE inline static T site_2up(const T &s, const int mu) { + template __host__ __device__ inline static T site_2up(const T &s, const int mu) { #ifdef __GPU_ARCH__ return site_move<2>(s, mu); #else return site_up_up(s, mu, mu); #endif } - template HOST_DEVICE inline static T site_2dn(const T &s, const int mu) { + template __host__ __device__ inline static T site_2dn(const T &s, const int mu) { #ifdef __GPU_ARCH__ return site_move<-2>(s, mu); #else @@ -1684,7 +1684,7 @@ class GIndexer { 
//! Unlike the above implementation of site_move, this can be used in a for loop. Presumably it is slower? //! Currently unused but can be commented in if needed: - HOST_DEVICE inline static sitexyzt dynamic_move(sitexyzt s, const int mu, int mu_steps) { + __host__ __device__ inline static sitexyzt dynamic_move(sitexyzt s, const int mu, int mu_steps) { int x = s.x; int y = s.y; int z = s.z; @@ -1747,7 +1747,7 @@ class GIndexer { } return sitexyzt(x, y, z, t); } - __attribute__((unused)) HOST_DEVICE inline static gSite dynamic_move(const gSite &s, const int mu, int mu_steps) { + __attribute__((unused)) __host__ __device__ inline static gSite dynamic_move(const gSite &s, const int mu, int mu_steps) { sitexyzt tmp = dynamic_move(s.coordFull, mu, mu_steps); return getSiteFull(tmp.x, tmp.y, tmp.z, tmp.t); } diff --git a/src/base/indexer/HaloIndexer.h b/src/base/indexer/HaloIndexer.h index b0346706..d746e7a0 100644 --- a/src/base/indexer/HaloIndexer.h +++ b/src/base/indexer/HaloIndexer.h @@ -99,9 +99,9 @@ struct HaloData { size_t h_offsetsHalf[80]; - HOST_DEVICE HaloData() {} + __host__ __device__ HaloData() {} - HOST_DEVICE HaloData(size_t lx, size_t ly, size_t lz, size_t lt, size_t halo_depth, unsigned int Nodes[4]) { + __host__ __device__ HaloData(size_t lx, size_t ly, size_t lz, size_t lt, size_t halo_depth, unsigned int Nodes[4]) { h_HaloDepth[0] = Nodes[0] != 1 ? halo_depth : 0; @@ -208,7 +208,7 @@ struct HaloData { } - HOST_DEVICE size_t getBufferSize(Layout LatLayout) { + __host__ __device__ size_t getBufferSize(Layout LatLayout) { if (LatLayout == All)return h_summed_buffer[15]; else return h_summed_bufferHalf[15]; } @@ -218,7 +218,7 @@ struct HaloData { /// This function returns the size of these sub_Halos. /// The argument is the number of the Sub-Halo! 
- HOST_DEVICE inline size_t get_SubHaloSize(const short number, Layout LatLayout) const { + __host__ __device__ inline size_t get_SubHaloSize(const short number, Layout LatLayout) const { size_t EvenFactor = 1; if (LatLayout != All) EvenFactor = 2; @@ -245,7 +245,7 @@ struct HaloData { private: /// The argument is the number of the Halo Type! It returns the size of an All Halo Type! - HOST_DEVICE inline size_t get_SubHaloSizeFromType(const short number) const { + __host__ __device__ inline size_t get_SubHaloSizeFromType(const short number) const { if (number == 0) return h_YZTH; if (number == 1) return h_XZTH; if (number == 2) return h_XYTH; @@ -286,7 +286,7 @@ class HaloIndexer { private: - HOST_DEVICE inline static size_t _getHaloNumber(size_t index, size_t *LocHalIndex) { + __host__ __device__ inline static size_t _getHaloNumber(size_t index, size_t *LocHalIndex) { if (LatLayout == All) { for (int i = 1; i < 80; i++) { if (getHalData().h_offsets[i] > index) { @@ -314,7 +314,7 @@ class HaloIndexer { return 0; }; - HOST_DEVICE inline static size_t _getHaloNumberReduced(size_t index, size_t *LocHalIndex) { + __host__ __device__ inline static size_t _getHaloNumberReduced(size_t index, size_t *LocHalIndex) { if (LatLayout == All) { for (int i = 1; i < 80; i++) { if (getHalDataReduced().h_offsets[i] > index) { @@ -343,11 +343,11 @@ class HaloIndexer { }; public: - HOST_DEVICE HaloIndexer(); + __host__ __device__ HaloIndexer(); - HOST_DEVICE ~HaloIndexer() {}; + __host__ __device__ ~HaloIndexer() {}; - HOST_DEVICE inline static HaloData getHalData() { + __host__ __device__ inline static HaloData getHalData() { #if defined(__GPU_ARCH__) return globHalDataGPU[HaloDepth]; #else @@ -355,7 +355,7 @@ class HaloIndexer { #endif } - HOST_DEVICE inline static HaloData getHalDataReduced() { + __host__ __device__ inline static HaloData getHalDataReduced() { #if defined(__GPU_ARCH__) return globHalDataGPUReduced[HaloDepth]; #else @@ -363,31 +363,31 @@ class HaloIndexer { #endif } 
- HOST_DEVICE inline static size_t getBufferSize() { + __host__ __device__ inline static size_t getBufferSize() { if (LatLayout == All)return getHalData().h_summed_buffer[15]; else return getHalData().h_summed_bufferHalf[15]; } - HOST_DEVICE inline static size_t get_SubHaloOffset(const short number) { + __host__ __device__ inline static size_t get_SubHaloOffset(const short number) { if (LatLayout == All)return getHalData().h_offsets[number]; else return getHalData().h_offsetsHalf[number]; } - HOST_DEVICE inline static size_t get_SubHaloSize(const short number) { + __host__ __device__ inline static size_t get_SubHaloSize(const short number) { return getHalData().get_SubHaloSize(number, LatLayout); } - HOST_DEVICE inline static size_t get_ReducedSubHaloSize(const short number) { + __host__ __device__ inline static size_t get_ReducedSubHaloSize(const short number) { return getHalDataReduced().get_SubHaloSize(number, LatLayout); } - HOST_DEVICE inline static void getCoord_eo(size_t &x, size_t &y, size_t &z, size_t &t, + __host__ __device__ inline static void getCoord_eo(size_t &x, size_t &y, size_t &z, size_t &t, const size_t index, const size_t vol1, const size_t vol2, const size_t vol3, const bool par) { @@ -407,7 +407,7 @@ class HaloIndexer { ++x; } - HOST_DEVICE inline static void getCoord(size_t &x, size_t &y, size_t &z, size_t &t, + __host__ __device__ inline static void getCoord(size_t &x, size_t &y, size_t &z, size_t &t, const size_t index, const size_t vol1, const size_t vol2, const size_t vol3) { @@ -426,20 +426,20 @@ class HaloIndexer { } - HOST_DEVICE inline static void + __host__ __device__ inline static void getHypPlanePos(size_t number, size_t &pos_a, size_t &pos_b) { pos_a = number * 2; pos_b = number * 2 + 1; } - HOST_DEVICE inline static void + __host__ __device__ inline static void getPlanePos(size_t number, size_t dir, size_t &pos_a, size_t &pos_b) { number -= 4; pos_a = 8 + number * 4 + dir; pos_b = 8 + number * 4 + dir + (3 - 2 * dir); } - 
HOST_DEVICE inline static void + __host__ __device__ inline static void getStripePos(size_t number, size_t dir, size_t &pos_a, size_t &pos_b) { number -= 10; @@ -447,7 +447,7 @@ class HaloIndexer { pos_b = 32 + number * 8 + dir + (7 - 2 * dir); } - HOST_DEVICE inline static void + __host__ __device__ inline static void getCornerPos(size_t number, size_t dir, size_t &pos_a, size_t &pos_b) { number -= 14; @@ -456,7 +456,7 @@ class HaloIndexer { } - HOST_DEVICE inline static HaloSegment mapIntToHSeg(int bits) { + __host__ __device__ inline static HaloSegment mapIntToHSeg(int bits) { if (bits == 1) return X; if (bits == 2) return Y; if (bits == 4) return Z; @@ -480,7 +480,7 @@ class HaloIndexer { return X; } - HOST_DEVICE inline static HaloSegment getHSeg(sitexyzt coord) { + __host__ __device__ inline static HaloSegment getHSeg(sitexyzt coord) { int bits = 0; @@ -499,7 +499,7 @@ class HaloIndexer { return mapIntToHSeg(bits); } - HOST_DEVICE inline static short getlr(sitexyzt coord) { + __host__ __device__ inline static short getlr(sitexyzt coord) { short lr = 0; HaloSegment hseg = getHSeg(coord); @@ -560,15 +560,15 @@ class HaloIndexer { } - HOST_DEVICE inline static size_t getOuterHaloSize() { + __host__ __device__ inline static size_t getOuterHaloSize() { return getHalData().getBufferSize(LatLayout); } - HOST_DEVICE inline static size_t getInnerHaloSize() { + __host__ __device__ inline static size_t getInnerHaloSize() { return getHalDataReduced().getBufferSize(LatLayout); } - HOST_DEVICE inline static size_t getCenterSize() { + __host__ __device__ inline static size_t getCenterSize() { return GIndexer::getLatData().vol4 - getInnerHaloSize(); } @@ -590,7 +590,7 @@ class HaloIndexer { /// |______________| /// - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getInnerHaloCoord(size_t HalIndex, size_t &HalNumber, size_t &LocHalIndex) { @@ -685,7 +685,7 @@ class HaloIndexer { /// |______________| /// - HOST_DEVICE inline static sitexyzt + 
__host__ __device__ inline static sitexyzt getOuterHaloCoord(size_t HalIndex, size_t &HalNumber, size_t &LocHalIndex) { HalNumber = _getHaloNumber(HalIndex, &LocHalIndex); @@ -782,7 +782,7 @@ class HaloIndexer { /// However if one does that by templating it, the compiler is not smart enough to optimize it away, /// so that this indexer become slower... - HOST_DEVICE inline static sitexyzt getInnerCoord(size_t HalIndex) { + __host__ __device__ inline static sitexyzt getInnerCoord(size_t HalIndex) { size_t HalNumber = 0, LocHalIndex = 0; HalNumber = _getHaloNumberReduced(HalIndex, &LocHalIndex); @@ -878,7 +878,7 @@ class HaloIndexer { /// +++++++++++++++++++++++++++ 8 HYPERPLANES +++++++++++++++++++++++++++++++ /// lr = 0,1 - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getInnerHaloCoord_Hyperplane_X(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -892,7 +892,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getInnerHaloCoord_Hyperplane_Y(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -906,7 +906,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getInnerHaloCoord_Hyperplane_Z(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -920,7 +920,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getInnerHaloCoord_Hyperplane_T(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -936,7 +936,7 @@ class HaloIndexer { /// +++++++++++++++++++++++++++++ 24 PLANES +++++++++++++++++++++++++++++++++ /// lr = 0-3 - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getInnerHaloCoord_Plane_XY(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(z, t, x, y, LocHalIndex, getHalData().h_LZi, getHalData().h_ZT, getHalData().h_ZTH); @@ -951,7 
+951,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getInnerHaloCoord_Plane_XZ(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(y, t, x, z, LocHalIndex, getHalData().h_LYi, getHalData().h_YT, getHalData().h_YTH); @@ -965,7 +965,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getInnerHaloCoord_Plane_XT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(y, z, x, t, LocHalIndex, getHalData().h_LYi, getHalData().h_YZ, getHalData().h_YZH); @@ -979,7 +979,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getInnerHaloCoord_Plane_YZ(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(x, t, y, z, LocHalIndex, getHalData().h_LXi, getHalData().h_XT, getHalData().h_XTH); @@ -993,7 +993,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getInnerHaloCoord_Plane_YT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(x, z, y, t, LocHalIndex, getHalData().h_LXi, getHalData().h_XZ, getHalData().h_XZH); @@ -1007,7 +1007,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getInnerHaloCoord_Plane_ZT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(x, y, z, t, LocHalIndex, getHalData().h_LXi, getHalData().h_XY, getHalData().h_XYH); @@ -1023,7 +1023,7 @@ class HaloIndexer { /// ++++++++++++++++++++++++++++ 32 STRIPES +++++++++++++++++++++++++++++++++ /// lr = 0-7 - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getInnerHaloCoord_Stripe_XYZ(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(t, x, y, z, LocHalIndex, getHalData().h_LTi, getHalData().h_TH, 
getHalData().h_THH); @@ -1038,7 +1038,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getInnerHaloCoord_Stripe_XYT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(z, x, y, t, LocHalIndex, getHalData().h_LZi, getHalData().h_ZH, getHalData().h_ZHH); @@ -1053,7 +1053,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getInnerHaloCoord_Stripe_XZT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(y, x, z, t, LocHalIndex, getHalData().h_LYi, getHalData().h_YH, getHalData().h_YHH); @@ -1068,7 +1068,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getInnerHaloCoord_Stripe_YZT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(x, y, z, t, LocHalIndex, getHalData().h_LXi, getHalData().h_XH, getHalData().h_XHH); @@ -1085,7 +1085,7 @@ class HaloIndexer { /// ++++++++++++++++++++++++++++ 16 CORNERS +++++++++++++++++++++++++++++++++ /// lr = 0-15 - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getInnerHaloCoord_Corner(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -1124,7 +1124,7 @@ class HaloIndexer { /// +++++++++++++++++++++++++++ 8 HYPERPLANES +++++++++++++++++++++++++++++++ /// lr = 0,1 - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getOuterHaloCoord_Hyperplane_X(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -1137,7 +1137,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getOuterHaloCoord_Hyperplane_Y(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -1150,7 +1150,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt 
getOuterHaloCoord_Hyperplane_Z(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -1163,7 +1163,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getOuterHaloCoord_Hyperplane_T(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -1178,7 +1178,7 @@ class HaloIndexer { /// +++++++++++++++++++++++++++++ 24 PLANES +++++++++++++++++++++++++++++++++ /// lr = 0-3 - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getOuterHaloCoord_Plane_XY(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(z, t, x, y, LocHalIndex, getHalData().h_LZi, getHalData().h_ZT, getHalData().h_ZTH); @@ -1191,7 +1191,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getOuterHaloCoord_Plane_XZ(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(y, t, x, z, LocHalIndex, getHalData().h_LYi, getHalData().h_YT, getHalData().h_YTH); @@ -1203,7 +1203,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getOuterHaloCoord_Plane_XT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(y, z, x, t, LocHalIndex, getHalData().h_LYi, getHalData().h_YZ, getHalData().h_YZH); @@ -1215,7 +1215,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getOuterHaloCoord_Plane_YZ(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(x, t, y, z, LocHalIndex, getHalData().h_LXi, getHalData().h_XT, getHalData().h_XTH); @@ -1227,7 +1227,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getOuterHaloCoord_Plane_YT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(x, z, y, t, LocHalIndex, getHalData().h_LXi, getHalData().h_XZ, 
getHalData().h_XZH); @@ -1239,7 +1239,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getOuterHaloCoord_Plane_ZT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(x, y, z, t, LocHalIndex, getHalData().h_LXi, getHalData().h_XY, getHalData().h_XYH); @@ -1253,7 +1253,7 @@ class HaloIndexer { /// ++++++++++++++++++++++++++++ 32 STRIPES +++++++++++++++++++++++++++++++++ /// lr = 0-7 - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getOuterHaloCoord_Stripe_XYZ(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(t, x, y, z, LocHalIndex, getHalData().h_LTi, getHalData().h_TH, getHalData().h_THH); @@ -1265,7 +1265,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getOuterHaloCoord_Stripe_XYT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(z, x, y, t, LocHalIndex, getHalData().h_LZi, getHalData().h_ZH, getHalData().h_ZHH); @@ -1277,7 +1277,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getOuterHaloCoord_Stripe_XZT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(y, x, z, t, LocHalIndex, getHalData().h_LYi, getHalData().h_YH, getHalData().h_YHH); @@ -1289,7 +1289,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getOuterHaloCoord_Stripe_YZT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(x, y, z, t, LocHalIndex, getHalData().h_LXi, getHalData().h_XH, getHalData().h_XHH); @@ -1303,7 +1303,7 @@ class HaloIndexer { /// ++++++++++++++++++++++++++++ 16 CORNERS +++++++++++++++++++++++++++++++++ /// lr = 0-15 - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getOuterHaloCoord_Corner(size_t LocHalIndex, short lr) { 
size_t x, y, z, t; @@ -1340,7 +1340,7 @@ class HaloIndexer { /// +++++++++++++++++++++++++++ 8 HYPERPLANES +++++++++++++++++++++++++++++++ /// lr = 0,1 - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getInnerCoord_Hyperplane_X(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -1354,7 +1354,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getInnerCoord_Hyperplane_Y(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -1368,7 +1368,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getInnerCoord_Hyperplane_Z(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -1382,7 +1382,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getInnerCoord_Hyperplane_T(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -1398,7 +1398,7 @@ class HaloIndexer { /// +++++++++++++++++++++++++++++ 24 PLANES +++++++++++++++++++++++++++++++++ /// lr = 0-3 - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getInnerCoord_Plane_XY(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(z, t, x, y, LocHalIndex, getHalDataReduced().h_LZi, getHalDataReduced().h_ZT, @@ -1411,7 +1411,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getInnerCoord_Plane_XZ(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(y, t, x, z, LocHalIndex, getHalDataReduced().h_LYi, getHalDataReduced().h_YT, @@ -1424,7 +1424,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getInnerCoord_Plane_XT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(y, z, x, t, LocHalIndex, getHalDataReduced().h_LYi, 
getHalDataReduced().h_YZ, @@ -1437,7 +1437,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getInnerCoord_Plane_YZ(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(x, t, y, z, LocHalIndex, getHalDataReduced().h_LXi, getHalDataReduced().h_XT, @@ -1450,7 +1450,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getInnerCoord_Plane_YT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(x, z, y, t, LocHalIndex, getHalDataReduced().h_LXi, getHalDataReduced().h_XZ, @@ -1463,7 +1463,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getInnerCoord_Plane_ZT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(x, y, z, t, LocHalIndex, getHalDataReduced().h_LXi, getHalDataReduced().h_XY, @@ -1478,7 +1478,7 @@ class HaloIndexer { /// ++++++++++++++++++++++++++++ 32 STRIPES +++++++++++++++++++++++++++++++++ /// lr = 0-7 - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getInnerCoord_Stripe_XYZ(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(t, x, y, z, LocHalIndex, getHalDataReduced().h_LTi, getHalDataReduced().h_TH, @@ -1491,7 +1491,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getInnerCoord_Stripe_XYT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(z, x, y, t, LocHalIndex, getHalDataReduced().h_LZi, getHalDataReduced().h_ZH, @@ -1504,7 +1504,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getInnerCoord_Stripe_XZT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(y, x, z, t, LocHalIndex, getHalDataReduced().h_LYi, getHalDataReduced().h_YH, @@ -1517,7 
+1517,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getInnerCoord_Stripe_YZT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(x, y, z, t, LocHalIndex, getHalDataReduced().h_LXi, getHalDataReduced().h_XH, @@ -1532,7 +1532,7 @@ class HaloIndexer { /// ++++++++++++++++++++++++++++ 16 CORNERS +++++++++++++++++++++++++++++++++ /// lr = 0-15 - HOST_DEVICE inline static sitexyzt + __host__ __device__ inline static sitexyzt getInnerCoord_Corner(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -1566,7 +1566,7 @@ class HaloIndexer { /// - HOST_DEVICE inline static sitexyzt getCenterCoord(size_t CenterIndex) { + __host__ __device__ inline static sitexyzt getCenterCoord(size_t CenterIndex) { size_t x = 0, y = 0, z = 0, t = 0; diff --git a/src/base/math/correlators.h b/src/base/math/correlators.h index 4743261f..08574434 100644 --- a/src/base/math/correlators.h +++ b/src/base/math/correlators.h @@ -35,37 +35,37 @@ /// Initialize the correlator to zero, regardless of type. ---------------------------------- FUNCTIONS FOR CORRELATIONS template -HOST_DEVICE void inline initCorrToZero(int &corr) { +__host__ __device__ void inline initCorrToZero(int &corr) { corr = 0; } template -HOST_DEVICE void inline initCorrToZero(floatT &corr) { +__host__ __device__ void inline initCorrToZero(floatT &corr) { corr = 0.; } template -HOST_DEVICE void inline initCorrToZero(GSU3 &corr) { +__host__ __device__ void inline initCorrToZero(GSU3 &corr) { corr = gsu3_zero(); } template -HOST_DEVICE void inline initCorrToZero(GCOMPLEX(floatT) &corr) { +__host__ __device__ void inline initCorrToZero(GCOMPLEX(floatT) &corr) { corr = GPUcomplex(0., 0.); } /// Initialize the correlator to one, regardless of type. 
template -HOST_DEVICE void inline initCorrToOne(int &corr) { +__host__ __device__ void inline initCorrToOne(int &corr) { corr = 1; } template -HOST_DEVICE void inline initCorrToOne(floatT &corr) { +__host__ __device__ void inline initCorrToOne(floatT &corr) { corr = 1.; } template -HOST_DEVICE void inline initCorrToOne(GSU3 &corr) { +__host__ __device__ void inline initCorrToOne(GSU3 &corr) { corr = gsu3_one(); } template -HOST_DEVICE void inline initCorrToOne(GCOMPLEX(floatT) &corr) { +__host__ __device__ void inline initCorrToOne(GCOMPLEX(floatT) &corr) { corr = GPUcomplex(1., 0.); } @@ -229,14 +229,14 @@ class CorrelatorTools { void readNorm(std::string domain, Correlator &normalization, std::string normFileDir); /// Displacement vector de-indexing. - inline HOST_DEVICE void indexToSpaceTimeDisplacement(size_t dindex, int &dx, int &dy, int &dz, int &dt) { + inline __host__ __device__ void indexToSpaceTimeDisplacement(size_t dindex, int &dx, int &dy, int &dz, int &dt) { int rem2, rem1; divmod(dindex,svol3,dt,rem2); divmod(rem2 ,svol2,dz,rem1); divmod(rem1 ,svol1,dy,dx); } - inline HOST_DEVICE void indexToSpatialDisplacement(size_t dindex, int &dx, int &dy, int &dz) { + inline __host__ __device__ void indexToSpatialDisplacement(size_t dindex, int &dx, int &dy, int &dz) { int rem; divmod(dindex,svol2,dz,rem); divmod(rem ,svol1,dy,dx); @@ -375,7 +375,7 @@ class CorrelationDegeneracies : public LatticeContainer, public /// Trivial read index, in case you need/want to do indexing inside the Kernel. TODO: Probably should be in indexer? struct PassIndex { - inline HOST_DEVICE size_t operator()(const dim3& blockDim, const uint3& blockIdx, const uint3& threadIdx) { + inline __host__ __device__ size_t operator()(const dim3& blockDim, const uint3& blockIdx, const uint3& threadIdx) { return blockDim.x * blockIdx.x + threadIdx.x; } }; @@ -383,7 +383,7 @@ struct PassIndex { /// For fields that depend on x. 
template struct ReadIndexSpacetime { - inline HOST_DEVICE gSite operator()(const dim3& blockDim, const uint3& blockIdx, const uint3& threadIdx) { + inline __host__ __device__ gSite operator()(const dim3& blockDim, const uint3& blockIdx, const uint3& threadIdx) { size_t i = blockDim.x * blockIdx.x + threadIdx.x; typedef GIndexer GInd; gSite site = GInd::getSite(i); @@ -394,7 +394,7 @@ struct ReadIndexSpacetime { /// For fields that depend on spatial x. template struct ReadIndexSpatial { - inline HOST_DEVICE gSite operator()(const dim3& blockDim, const uint3& blockIdx, const uint3& threadIdx) { + inline __host__ __device__ gSite operator()(const dim3& blockDim, const uint3& blockIdx, const uint3& threadIdx) { size_t i = blockDim.x * blockIdx.x + threadIdx.x; typedef GIndexer GInd; gSite site = GInd::getSiteSpatial(i); @@ -416,13 +416,13 @@ struct ReadIndexSpatial { template class AxB { public: - HOST_DEVICE floatT inline orrelate(floatT A, floatT B) { + __host__ __device__ floatT inline orrelate(floatT A, floatT B) { return A*B; } - HOST_DEVICE GCOMPLEX(floatT) inline orrelate(GCOMPLEX(floatT) A, GCOMPLEX(floatT) B) { + __host__ __device__ GCOMPLEX(floatT) inline orrelate(GCOMPLEX(floatT) A, GCOMPLEX(floatT) B) { return A*B; } - HOST_DEVICE GCOMPLEX(floatT) inline orrelate(GSU3 A, GSU3 B) { + __host__ __device__ GCOMPLEX(floatT) inline orrelate(GSU3 A, GSU3 B) { return tr_c(A*B); } }; @@ -430,7 +430,7 @@ class AxB { template class trAxtrBt { public: - HOST_DEVICE GCOMPLEX(floatT) inline orrelate(GSU3 A, GSU3 B) { + __host__ __device__ GCOMPLEX(floatT) inline orrelate(GSU3 A, GSU3 B) { return tr_c(A)*tr_c(dagger(B)); } }; @@ -438,7 +438,7 @@ class trAxtrBt { template class trReAxtrReB { public: - HOST_DEVICE floatT inline orrelate(GSU3 A, GSU3 B) { + __host__ __device__ floatT inline orrelate(GSU3 A, GSU3 B) { return tr_d(A)*tr_d(B); } }; @@ -446,7 +446,7 @@ class trReAxtrReB { template class trImAxtrImB { public: - HOST_DEVICE floatT inline orrelate(GSU3 A, GSU3 B) { 
+ __host__ __device__ floatT inline orrelate(GSU3 A, GSU3 B) { return tr_i(A)*tr_i(B); } }; @@ -454,7 +454,7 @@ class trImAxtrImB { template class trAxBt { public: - HOST_DEVICE GCOMPLEX(floatT) inline orrelate(GSU3 A, GSU3 B) { + __host__ __device__ GCOMPLEX(floatT) inline orrelate(GSU3 A, GSU3 B) { return tr_c(A*dagger(B)); } }; @@ -462,21 +462,21 @@ class trAxBt { template class polCorrAVG { public: - HOST_DEVICE floatT inline orrelate(GSU3 A, GSU3 B) { + __host__ __device__ floatT inline orrelate(GSU3 A, GSU3 B) { return real(tr_c(A)*tr_c(dagger(B)))/9.; } }; template class polCorrSIN { public: - HOST_DEVICE floatT inline orrelate(GSU3 A, GSU3 B) { + __host__ __device__ floatT inline orrelate(GSU3 A, GSU3 B) { return tr_d(A,dagger(B))/3.; } }; template class polCorrOCT { public: - HOST_DEVICE floatT inline orrelate(GSU3 A, GSU3 B) { + __host__ __device__ floatT inline orrelate(GSU3 A, GSU3 B) { floatT avg = real(tr_c(A)*tr_c(dagger(B))); floatT sin = tr_d(A,dagger(B)); return (0.125*avg - 0.04166666666*sin); @@ -499,7 +499,7 @@ struct SpacetimePairKernelSymm : CorrelatorTools { SpacetimePairKernelSymm(LatticeContainerAccessor field1, LatticeContainerAccessor field2, LatticeContainerAccessor field1Xfield2, size_t dindex) : _field1(field1), _field2(field2), _field1Xfield2(field1Xfield2), _dindex(dindex), CorrelatorTools() {} - HOST_DEVICE void operator()(gSite site) { + __host__ __device__ void operator()(gSite site) { typedef GIndexer GInd; size_t m, n; fieldType field1m, field2n; @@ -568,7 +568,7 @@ struct SpacetimePairKernel : CorrelatorTools { SpacetimePairKernel(LatticeContainerAccessor field1, LatticeContainerAccessor field2, LatticeContainerAccessor field1Xfield2, size_t dindex) : _field1(field1), _field2(field2), _field1Xfield2(field1Xfield2), _dindex(dindex), CorrelatorTools() {} - HOST_DEVICE void operator()(gSite site) { + __host__ __device__ void operator()(gSite site) { typedef GIndexer GInd; size_t m, n; fieldType field1m, field2n; @@ -670,7 +670,7 
@@ struct SpatialPairKernelSymm : CorrelatorTools { SpatialPairKernelSymm(LatticeContainerAccessor field1, LatticeContainerAccessor field2, LatticeContainerAccessor field1Xfield2, size_t dindex) : _field1(field1), _field2(field2), _field1Xfield2(field1Xfield2), _dindex(dindex), CorrelatorTools() {} - HOST_DEVICE void operator()(gSite site) { + __host__ __device__ void operator()(gSite site) { typedef GIndexer GInd; size_t m, n; fieldType field1m, field2n; @@ -722,7 +722,7 @@ struct SpatialPairKernel : CorrelatorTools { SpatialPairKernel(LatticeContainerAccessor field1, LatticeContainerAccessor field2, LatticeContainerAccessor field1Xfield2, size_t dindex) : _field1(field1), _field2(field2), _field1Xfield2(field1Xfield2), _dindex(dindex), CorrelatorTools() {} - HOST_DEVICE void operator()(gSite site) { + __host__ __device__ void operator()(gSite site) { typedef GIndexer GInd; size_t m, n; fieldType field1m, field2n; @@ -797,7 +797,7 @@ struct RestrictedOffAxisKernel : CorrelatorTools { /// direction. (Backward correlations will be counted from the forward correlation of some other m.) A possible /// displacement is (1,0,0); therefore some on-axis correlations are computed already in the off-axis kernel. This /// is taken into account in the on-axis kernel. 
- HOST_DEVICE void operator()(size_t dindex) { /// dindex indexes displacement vector + __host__ __device__ void operator()(size_t dindex) { /// dindex indexes displacement vector typedef GIndexer GInd; size_t m,n1,n2,n3,n4; @@ -855,7 +855,7 @@ struct RestrictedOnAxisKernel : CorrelatorTools { : _field1(field1), _field2(field2), _field1Xfield2off(field1Xfield2off), _field1Xfield2on(field1Xfield2on), CorrelatorTools() {} - HOST_DEVICE void operator()(size_t dx){ /// Now dx corresponds to a separation, rather than a displacement + __host__ __device__ void operator()(size_t dx){ /// Now dx corresponds to a separation, rather than a displacement typedef GIndexer GInd; size_t m,n1,n2,n3; diff --git a/src/base/math/floatComparison.h b/src/base/math/floatComparison.h index 4233bfdf..4d492c93 100644 --- a/src/base/math/floatComparison.h +++ b/src/base/math/floatComparison.h @@ -16,7 +16,7 @@ /// This can be used on the GPU. template -HOST_DEVICE bool cmp_rel(const T a, const T b, const double rel, const double prec) { +__host__ __device__ bool cmp_rel(const T a, const T b, const double rel, const double prec) { if (abs(a-b) / abs(a+b) < rel && abs(a-b) < prec) { return true; } @@ -26,7 +26,7 @@ HOST_DEVICE bool cmp_rel(const T a, const T b, const double rel, const double pr /// Implements relative method - do not use for comparing with zero. Use this most of the time, tolerance needs to /// be meaningful in your context. 
template -HOST_DEVICE static bool isApproximatelyEqual(const TReal a, const TReal b, const TReal tolerance = std::numeric_limits::epsilon()) +__host__ __device__ static bool isApproximatelyEqual(const TReal a, const TReal b, const TReal tolerance = std::numeric_limits::epsilon()) { TReal diff = std::fabs(a - b); if (diff <= tolerance) diff --git a/src/base/math/gaugeAccessor.h b/src/base/math/gaugeAccessor.h index bf291048..3cae434c 100644 --- a/src/base/math/gaugeAccessor.h +++ b/src/base/math/gaugeAccessor.h @@ -9,7 +9,7 @@ #define BACKWARD_CONST 16 -HOST_DEVICE inline int Back(const int i) { +__host__ __device__ inline int Back(const int i) { return i + BACKWARD_CONST; } @@ -23,62 +23,62 @@ class gaugeAccessor : public GaugeConstructor { : GaugeConstructor(elements) {} /// Constructor for one memory chunk, where all entries are separated by object_count - HOST_DEVICE explicit gaugeAccessor(GCOMPLEX(floatT_memory) *elementsBase, size_t object_count) + __host__ __device__ explicit gaugeAccessor(GCOMPLEX(floatT_memory) *elementsBase, size_t object_count) : GaugeConstructor(elementsBase, object_count) {} explicit gaugeAccessor() : GaugeConstructor() {} template - HOST_DEVICE inline GSU3 getElement(const gSiteMu &siteMu) const { + __host__ __device__ inline GSU3 getElement(const gSiteMu &siteMu) const { return static_cast>(this->reconstruct(siteMu)); } template - HOST_DEVICE inline size_t getIndexComm(size_t isiteFull, size_t mu) const { + __host__ __device__ inline size_t getIndexComm(size_t isiteFull, size_t mu) const { gSiteMu siteMu = GIndexer::getSiteMuFull(isiteFull, mu); return siteMu.indexMuFull; } template - HOST_DEVICE inline GSU3 getElementComm(size_t isiteFull, size_t mu) const { + __host__ __device__ inline GSU3 getElementComm(size_t isiteFull, size_t mu) const { gSiteMu siteMu = GIndexer::getSiteMuFull(isiteFull, mu); return getElement(siteMu); } template - HOST_DEVICE inline void setElementComm(size_t isiteFull, const GSU3& mat) { + __host__ __device__ 
inline void setElementComm(size_t isiteFull, const GSU3& mat) { gSiteMu siteMu; siteMu.indexMuFull = isiteFull; setElement(siteMu, mat); } template - HOST_DEVICE inline void setElement(const gSiteMu &siteMu, const GSU3 &mat) { + __host__ __device__ inline void setElement(const gSiteMu &siteMu, const GSU3 &mat) { this->construct(siteMu, static_cast>(mat)); } template - HOST_DEVICE inline GSU3 getLink(const gSiteMu &siteMu) const { + __host__ __device__ inline GSU3 getLink(const gSiteMu &siteMu) const { return static_cast>(this->reconstruct(siteMu)); } template - HOST_DEVICE inline GSU3 getLinkDagger(const gSiteMu &siteMu) const { + __host__ __device__ inline GSU3 getLinkDagger(const gSiteMu &siteMu) const { return static_cast>(this->reconstructDagger(siteMu)); } template - HOST_DEVICE inline void setLink(const gSiteMu &siteMu, GSU3 mat) { + __host__ __device__ inline void setLink(const gSiteMu &siteMu, GSU3 mat) { this->construct(siteMu, static_cast>(mat)); } template - HOST_DEVICE inline GSU3 operator()(const gSiteMu &siteMu) const { + __host__ __device__ inline GSU3 operator()(const gSiteMu &siteMu) const { return static_cast>(this->reconstruct(siteMu)); } template - HOST_DEVICE inline GSU3 getLinkPath(gSite &site, int dir) const { + __host__ __device__ inline GSU3 getLinkPath(gSite &site, int dir) const { typedef GIndexer GInd; @@ -103,7 +103,7 @@ class gaugeAccessor : public GaugeConstructor { template - HOST_DEVICE inline GSU3 getLinkPath(gSite &site, int dir, Args... args) const { + __host__ __device__ inline GSU3 getLinkPath(gSite &site, int dir, Args... args) const { typedef GIndexer GInd; @@ -128,7 +128,7 @@ class gaugeAccessor : public GaugeConstructor { } template - HOST_DEVICE inline GSU3 getLinkPath(gSiteMu &siteMu, int dir, Args... args) const { + __host__ __device__ inline GSU3 getLinkPath(gSiteMu &siteMu, int dir, Args... 
args) const { typedef GIndexer GInd; gSite site = siteMu; diff --git a/src/base/math/gaugeConstructor.h b/src/base/math/gaugeConstructor.h index be1cfb1c..e7ddd6c6 100644 --- a/src/base/math/gaugeConstructor.h +++ b/src/base/math/gaugeConstructor.h @@ -31,7 +31,7 @@ struct GaugeConstructor : public GeneralAccessor::count >(elements) { } /// Constructor for one memory chunk, where all entries are separated by object_count - HOST_DEVICE explicit GaugeConstructor(GCOMPLEX(floatT_memory) *elementsBase, size_t object_count) + __host__ __device__ explicit GaugeConstructor(GCOMPLEX(floatT_memory) *elementsBase, size_t object_count) : GeneralAccessor::count >(elementsBase, object_count){ } explicit GaugeConstructor() : GeneralAccessor::count >(){ } @@ -44,12 +44,12 @@ struct GaugeConstructor : public GeneralAccessor::count]) : GeneralAccessor::count>(elements) {} - HOST_DEVICE explicit GaugeConstructor(GCOMPLEX(floatT_memory) *elementsBase, size_t object_count) + __host__ __device__ explicit GaugeConstructor(GCOMPLEX(floatT_memory) *elementsBase, size_t object_count) : GeneralAccessor::count >(elementsBase, object_count) {} explicit GaugeConstructor() : GeneralAccessor::count>() {} - HOST_DEVICE inline void setEntriesComm(GaugeConstructor &src_acc, + __host__ __device__ inline void setEntriesComm(GaugeConstructor &src_acc, size_t setIndex, size_t getIndex) { this->template setElementEntry(setIndex, src_acc.template getElementEntry(getIndex)); this->template setElementEntry(setIndex, src_acc.template getElementEntry(getIndex)); @@ -62,7 +62,7 @@ struct GaugeConstructor : public GeneralAccessortemplate setElementEntry(setIndex, src_acc.template getElementEntry(getIndex)); } - HOST_DEVICE inline void construct(const gSiteMu& idx, const GSU3 &mat) { + __host__ __device__ inline void construct(const gSiteMu& idx, const GSU3 &mat) { this->template setElementEntry(idx.indexMuFull, mat.getLink00()); this->template setElementEntry(idx.indexMuFull, mat.getLink01()); this->template 
setElementEntry(idx.indexMuFull, mat.getLink02()); @@ -74,7 +74,7 @@ struct GaugeConstructor : public GeneralAccessortemplate setElementEntry(idx.indexMuFull, mat.getLink22()); } - HOST_DEVICE inline GSU3 reconstruct(const gSiteMu& idx) const { + __host__ __device__ inline GSU3 reconstruct(const gSiteMu& idx) const { GSU3 ret( this->template getElementEntry(idx.indexMuFull), this->template getElementEntry(idx.indexMuFull), @@ -88,7 +88,7 @@ struct GaugeConstructor : public GeneralAccessor reconstructDagger(const gSiteMu& idx) const { + __host__ __device__ inline GSU3 reconstructDagger(const gSiteMu& idx) const { return GSU3(conj(this->template getElementEntry(idx.indexMuFull)), conj(this->template getElementEntry(idx.indexMuFull)), conj(this->template getElementEntry(idx.indexMuFull)), @@ -108,13 +108,13 @@ struct GaugeConstructor : public GeneralAccessor::count]) : GeneralAccessor::count >(elements) { } - HOST_DEVICE explicit GaugeConstructor(GCOMPLEX(floatT_memory) *elementsBase, size_t object_count) + __host__ __device__ explicit GaugeConstructor(GCOMPLEX(floatT_memory) *elementsBase, size_t object_count) : GeneralAccessor::count >(elementsBase, object_count){ } explicit GaugeConstructor() : GeneralAccessor::count >(){ } - HOST_DEVICE inline void setEntriesComm(GaugeConstructor &src_acc, + __host__ __device__ inline void setEntriesComm(GaugeConstructor &src_acc, size_t setIndex, size_t getIndex) { this->template setElementEntry(setIndex, src_acc.template getElementEntry(getIndex)); this->template setElementEntry(setIndex, src_acc.template getElementEntry(getIndex)); @@ -125,7 +125,7 @@ struct GaugeConstructor : public GeneralAccessortemplate setElementEntry(setIndex, src_acc.template getElementEntry(getIndex)); } - HOST_DEVICE inline void construct(const gSiteMu& idx, const GSU3 &mat) { + __host__ __device__ inline void construct(const gSiteMu& idx, const GSU3 &mat) { this->template setElementEntry(idx.indexMuFull, mat.getLink00()); this->template 
setElementEntry(idx.indexMuFull, mat.getLink01()); this->template setElementEntry(idx.indexMuFull, mat.getLink02()); @@ -135,7 +135,7 @@ struct GaugeConstructor : public GeneralAccessortemplate setElementEntry(idx.indexMuFull, det(mat)); } - HOST_DEVICE inline GSU3 reconstruct(const gSiteMu& idx) const { + __host__ __device__ inline GSU3 reconstruct(const gSiteMu& idx) const { GSU3 ret( this->template getElementEntry(idx.indexMuFull), this->template getElementEntry(idx.indexMuFull), @@ -150,7 +150,7 @@ struct GaugeConstructor : public GeneralAccessor reconstructDagger(const gSiteMu& idx) const { + __host__ __device__ inline GSU3 reconstructDagger(const gSiteMu& idx) const { GSU3 tmp = GSU3( conj(this->template getElementEntry(idx.indexMuFull)), conj(this->template getElementEntry(idx.indexMuFull)), @@ -175,12 +175,12 @@ struct GaugeConstructor : public GeneralAccessor::count]) : GeneralAccessor::count >(elements) { } - HOST_DEVICE explicit GaugeConstructor(GCOMPLEX(floatT_memory) *elementsBase, size_t object_count) + __host__ __device__ explicit GaugeConstructor(GCOMPLEX(floatT_memory) *elementsBase, size_t object_count) : GeneralAccessor::count >(elementsBase, object_count) { } explicit GaugeConstructor() : GeneralAccessor::count>() { } - HOST_DEVICE inline void setEntriesComm(GaugeConstructor &src_acc, + __host__ __device__ inline void setEntriesComm(GaugeConstructor &src_acc, size_t setIndex, size_t getIndex) { this->template setElementEntry(setIndex, src_acc.template getElementEntry(getIndex)); this->template setElementEntry(setIndex, src_acc.template getElementEntry(getIndex)); @@ -191,7 +191,7 @@ struct GaugeConstructor : public GeneralAccessortemplate setElementEntry(setIndex, src_acc.template getElementEntry(getIndex)); } - HOST_DEVICE inline void construct(const gSiteMu& idx, const GSU3 &mat) { + __host__ __device__ inline void construct(const gSiteMu& idx, const GSU3 &mat) { this->template setElementEntry(idx.indexMuFull, mat.getLink00()); this->template 
setElementEntry(idx.indexMuFull, mat.getLink01()); this->template setElementEntry(idx.indexMuFull, mat.getLink02()); @@ -201,7 +201,7 @@ struct GaugeConstructor : public GeneralAccessortemplate setElementEntry(idx.indexMuFull, det(mat)); } - HOST_DEVICE inline GSU3 reconstruct(const gSiteMu& idx) const { + __host__ __device__ inline GSU3 reconstruct(const gSiteMu& idx) const { GSU3 ret( this->template getElementEntry(idx.indexMuFull), this->template getElementEntry(idx.indexMuFull), @@ -216,7 +216,7 @@ struct GaugeConstructor : public GeneralAccessor reconstructDagger(const gSiteMu& idx) const { + __host__ __device__ inline GSU3 reconstructDagger(const gSiteMu& idx) const { GSU3 tmp = GSU3( conj(this->template getElementEntry(idx.indexMuFull)), conj(this->template getElementEntry(idx.indexMuFull)), @@ -240,12 +240,12 @@ struct GaugeConstructor : public GeneralAccessor::count]) : GeneralAccessor::count >(elements) { } - HOST_DEVICE explicit GaugeConstructor(GCOMPLEX(floatT_memory) *elementsBase, size_t object_count) + __host__ __device__ explicit GaugeConstructor(GCOMPLEX(floatT_memory) *elementsBase, size_t object_count) : GeneralAccessor::count>(elementsBase, object_count) { } explicit GaugeConstructor() : GeneralAccessor::count>() { } - HOST_DEVICE inline void setEntriesComm(GaugeConstructor &src_acc, + __host__ __device__ inline void setEntriesComm(GaugeConstructor &src_acc, size_t setIndex, size_t getIndex) { this->template setElementEntry(setIndex, src_acc.template getElementEntry(getIndex)); this->template setElementEntry(setIndex, src_acc.template getElementEntry(getIndex)); @@ -255,7 +255,7 @@ struct GaugeConstructor : public GeneralAccessortemplate setElementEntry(setIndex, src_acc.template getElementEntry(getIndex)); } - HOST_DEVICE inline void construct(const gSiteMu& idx, const GSU3 &mat) { + __host__ __device__ inline void construct(const gSiteMu& idx, const GSU3 &mat) { this->template setElementEntry(idx.indexMuFull, mat.getLink00()); this->template 
setElementEntry(idx.indexMuFull, mat.getLink01()); this->template setElementEntry(idx.indexMuFull, mat.getLink02()); @@ -264,7 +264,7 @@ struct GaugeConstructor : public GeneralAccessortemplate setElementEntry(idx.indexMuFull, mat.getLink12()); } - HOST_DEVICE inline GSU3 reconstruct(const gSiteMu& idx) const { + __host__ __device__ inline GSU3 reconstruct(const gSiteMu& idx) const { GSU3 ret( this->template getElementEntry(idx.indexMuFull), this->template getElementEntry(idx.indexMuFull), @@ -278,7 +278,7 @@ struct GaugeConstructor : public GeneralAccessor reconstructDagger(const gSiteMu& idx) const { + __host__ __device__ inline GSU3 reconstructDagger(const gSiteMu& idx) const { GSU3 tmp = GSU3( conj(this->template getElementEntry(idx.indexMuFull)), conj(this->template getElementEntry(idx.indexMuFull)), @@ -309,12 +309,12 @@ struct GaugeConstructor : public GeneralAccessor::count>(elements) { throw std::runtime_error(stdLogger.fatal("STAGG_R12 should not be used at the moment")); } - HOST_DEVICE explicit GaugeConstructor(GCOMPLEX(floatT_memory) *elementsBase, size_t object_count) + __host__ __device__ explicit GaugeConstructor(GCOMPLEX(floatT_memory) *elementsBase, size_t object_count) : GeneralAccessor::count >(elementsBase, object_count) { } explicit GaugeConstructor() : GeneralAccessor::count>() { } - HOST_DEVICE inline void setEntriesComm(GaugeConstructor &src_acc, + __host__ __device__ inline void setEntriesComm(GaugeConstructor &src_acc, size_t setIndex, size_t getIndex) { this->template setElementEntry(setIndex, src_acc.template getElementEntry(getIndex)); this->template setElementEntry(setIndex, src_acc.template getElementEntry(getIndex)); @@ -324,7 +324,7 @@ struct GaugeConstructor : public GeneralAccessortemplate setElementEntry(setIndex, src_acc.template getElementEntry(getIndex)); } - HOST_DEVICE inline void construct(const gSiteMu& idx, const GSU3 &mat) { + __host__ __device__ inline void construct(const gSiteMu& idx, const GSU3 &mat) { 
this->template setElementEntry(idx.indexMuFull, mat.getLink00()); this->template setElementEntry(idx.indexMuFull, mat.getLink01()); this->template setElementEntry(idx.indexMuFull, mat.getLink02()); @@ -333,7 +333,7 @@ struct GaugeConstructor : public GeneralAccessortemplate setElementEntry(idx.indexMuFull, mat.getLink12()); } - HOST_DEVICE inline GSU3 reconstruct(const gSiteMu& idx) const { + __host__ __device__ inline GSU3 reconstruct(const gSiteMu& idx) const { GSU3 ret( this->template getElementEntry(idx.indexMuFull), this->template getElementEntry(idx.indexMuFull), @@ -348,7 +348,7 @@ struct GaugeConstructor : public GeneralAccessor reconstructDagger(const gSiteMu& idx) const { + __host__ __device__ inline GSU3 reconstructDagger(const gSiteMu& idx) const { GSU3 tmp = GSU3( conj(this->template getElementEntry(idx.indexMuFull)), conj(this->template getElementEntry(idx.indexMuFull)), diff --git a/src/base/math/gcomplex.h b/src/base/math/gcomplex.h index aad25f6c..53571d34 100644 --- a/src/base/math/gcomplex.h +++ b/src/base/math/gcomplex.h @@ -56,13 +56,13 @@ class GPUcomplex { /** * Default constructor, leave values uninitialized. */ - HOST_DEVICE GPUcomplex(){}; + __host__ __device__ GPUcomplex(){}; constexpr GPUcomplex(const GPUcomplex &) = default; /** * Utility constructor, creates class from given real and imaginary value */ - HOST_DEVICE GPUcomplex(const floatT &real, const floatT &imag) { + __host__ __device__ GPUcomplex(const floatT &real, const floatT &imag) { cREAL = real; cIMAG = imag; }; @@ -71,72 +71,72 @@ class GPUcomplex { * Utility constructor, creates class from real value, assumes imaginary value * to be zero. 
*/ - HOST_DEVICE GPUcomplex(const floatT &real) { + __host__ __device__ GPUcomplex(const floatT &real) { cREAL = real; cIMAG = 0.0f; }; - SQCD_HOST GPUcomplex(const std::complex &orig) { + __host__ GPUcomplex(const std::complex &orig) { cREAL = std::real(orig); cIMAG = std::imag(orig); } - SQCD_HOST GPUcomplex(const std::complex &orig) { + __host__ GPUcomplex(const std::complex &orig) { cREAL = std::real(orig); cIMAG = std::imag(orig); } - HOST_DEVICE GPUcomplex &operator=(const GPUcomplex &orig) { + __host__ __device__ GPUcomplex &operator=(const GPUcomplex &orig) { this->c = static_cast(orig.c); return *this; } - HOST_DEVICE GPUcomplex &operator=(const GPUcomplex &orig) { + __host__ __device__ GPUcomplex &operator=(const GPUcomplex &orig) { this->c = static_cast(orig.c); return *this; } - HOST_DEVICE GPUcomplex &operator=(const floatT &orig) { + __host__ __device__ GPUcomplex &operator=(const floatT &orig) { this->cREAL = orig; this->cIMAG = 0.0f; return *this; } - HOST_DEVICE GPUcomplex &operator+=(const GPUcomplex &op) { + __host__ __device__ GPUcomplex &operator+=(const GPUcomplex &op) { this->cREAL += op.cREAL; this->cIMAG += op.cIMAG; return *this; } - HOST_DEVICE GPUcomplex &operator+=(const floatT &op) { + __host__ __device__ GPUcomplex &operator+=(const floatT &op) { this->cREAL += op; return *this; } - HOST_DEVICE GPUcomplex &operator-=(const GPUcomplex &op) { + __host__ __device__ GPUcomplex &operator-=(const GPUcomplex &op) { this->cREAL -= op.cREAL; this->cIMAG -= op.cIMAG; return *this; } - HOST_DEVICE GPUcomplex &operator-=(const floatT &op) { + __host__ __device__ GPUcomplex &operator-=(const floatT &op) { this->cREAL -= op; return *this; } - HOST_DEVICE GPUcomplex &operator*=(const GPUcomplex &op) { + __host__ __device__ GPUcomplex &operator*=(const GPUcomplex &op) { floatT newReal = this->cREAL * op.cREAL - this->cIMAG * op.cIMAG; this->cIMAG = this->cREAL * op.cIMAG + this->cIMAG * op.cREAL; this->cREAL = newReal; return *this; } - HOST_DEVICE 
GPUcomplex &operator*=(const floatT &op) { + __host__ __device__ GPUcomplex &operator*=(const floatT &op) { this->cREAL *= op; this->cIMAG *= op; return *this; } - HOST_DEVICE GPUcomplex &operator/=(const floatT &op) { + __host__ __device__ GPUcomplex &operator/=(const floatT &op) { this->cREAL /= op; this->cIMAG /= op; return *this; @@ -144,7 +144,7 @@ class GPUcomplex { /// Note: You should not use this operator to compare with zero, because /// cmp_rel breaks down in that case. - HOST_DEVICE bool operator==(const GPUcomplex &op) { + __host__ __device__ bool operator==(const GPUcomplex &op) { ////TODO:: THAT PRECISION HAS TO BE CHANGED!! return (cmp_rel(this->cREAL, op.cREAL, 1.e-6, 1.e-6) && cmp_rel(this->cIMAG, op.cIMAG, 1.e-6, 1.e-6)); @@ -152,58 +152,58 @@ class GPUcomplex { //isApproximatelyEqual(this->cIMAG, op.cIMAG, 1.e-14)); } - HOST_DEVICE friend GPUcomplex operator+(const GPUcomplex &left, + __host__ __device__ friend GPUcomplex operator+(const GPUcomplex &left, const GPUcomplex &right) { return GPUcomplex(left.cREAL + right.cREAL, left.cIMAG + right.cIMAG); } - HOST_DEVICE friend GPUcomplex operator+(const GPUcomplex &left, + __host__ __device__ friend GPUcomplex operator+(const GPUcomplex &left, const floatT &right) { return GPUcomplex(left.cREAL + right, left.cIMAG); } - HOST_DEVICE friend GPUcomplex operator+(const floatT &left, + __host__ __device__ friend GPUcomplex operator+(const floatT &left, const GPUcomplex &right) { return GPUcomplex(left + right.cREAL, right.cIMAG); } - HOST_DEVICE friend GPUcomplex operator-(const GPUcomplex &op) { + __host__ __device__ friend GPUcomplex operator-(const GPUcomplex &op) { return GPUcomplex(-op.cREAL, -op.cIMAG); } - HOST_DEVICE friend GPUcomplex operator-(const GPUcomplex &left, + __host__ __device__ friend GPUcomplex operator-(const GPUcomplex &left, const GPUcomplex &right) { return GPUcomplex(left.cREAL - right.cREAL, left.cIMAG - right.cIMAG); } - HOST_DEVICE friend GPUcomplex operator-(const 
GPUcomplex &left, + __host__ __device__ friend GPUcomplex operator-(const GPUcomplex &left, const floatT &right) { return GPUcomplex(left.cREAL - right, left.cIMAG); } - HOST_DEVICE friend GPUcomplex operator-(const floatT &left, + __host__ __device__ friend GPUcomplex operator-(const floatT &left, const GPUcomplex &right) { return GPUcomplex(left - right.cREAL, -right.cIMAG); } - HOST_DEVICE friend GPUcomplex operator*(const GPUcomplex &left, + __host__ __device__ friend GPUcomplex operator*(const GPUcomplex &left, const GPUcomplex &right) { floatT newReal = left.cREAL * right.cREAL - left.cIMAG * right.cIMAG; floatT newImag = left.cREAL * right.cIMAG + left.cIMAG * right.cREAL; return GPUcomplex(newReal, newImag); } - HOST_DEVICE friend GPUcomplex operator*(const GPUcomplex &left, + __host__ __device__ friend GPUcomplex operator*(const GPUcomplex &left, const floatT &right) { return GPUcomplex(left.cREAL * right, left.cIMAG * right); } - HOST_DEVICE friend GPUcomplex operator*(const floatT &left, + __host__ __device__ friend GPUcomplex operator*(const floatT &left, const GPUcomplex &right) { return GPUcomplex(left * right.cREAL, left * right.cIMAG); } - HOST_DEVICE friend GPUcomplex + __host__ __device__ friend GPUcomplex fma(const GPUcomplex &x, const GPUcomplex &y, const GPUcomplex &d) { floatT real_res; floatT imag_res; @@ -217,7 +217,7 @@ class GPUcomplex { return GPUcomplex(real_res, imag_res); } - HOST_DEVICE friend GPUcomplex fma(const floatT x, const GPUcomplex &y, + __host__ __device__ friend GPUcomplex fma(const floatT x, const GPUcomplex &y, const GPUcomplex &d) { floatT real_res; floatT imag_res; @@ -228,7 +228,7 @@ class GPUcomplex { return GPUcomplex(real_res, imag_res); } - HOST_DEVICE void addProduct(const GPUcomplex &x, + __host__ __device__ void addProduct(const GPUcomplex &x, const GPUcomplex &y) { this->cREAL = (x.cREAL * y.cREAL) + this->cREAL; this->cIMAG = (x.cREAL * y.cIMAG) + this->cIMAG; @@ -239,7 +239,7 @@ class GPUcomplex { return; } - 
HOST_DEVICE void addProduct(const floatT &x, const GPUcomplex &y) { + __host__ __device__ void addProduct(const floatT &x, const GPUcomplex &y) { this->cREAL = (x * y.cREAL) + this->cREAL; this->cIMAG = (x * y.cIMAG) + this->cIMAG; @@ -247,27 +247,27 @@ class GPUcomplex { } template - HOST_DEVICE friend GPUcomplex operator/(const GPUcomplex &left, + __host__ __device__ friend GPUcomplex operator/(const GPUcomplex &left, const T &right) { return GPUcomplex(left.cREAL / right, left.cIMAG / right); } template - HOST_DEVICE friend GPUcomplex operator/(const T &left, + __host__ __device__ friend GPUcomplex operator/(const T &left, const GPUcomplex &right) { return GPUcomplex( left * right.cREAL / (right.cREAL * right.cREAL + right.cIMAG * right.cIMAG), -left * right.cIMAG / (right.cREAL * right.cREAL + right.cIMAG * right.cIMAG)); } - HOST_DEVICE inline static GPUcomplex invalid(); + __host__ __device__ inline static GPUcomplex invalid(); // These are needed to make sure that dp_complex may be part in general // operators src/math/operators.h - HOST_DEVICE GPUcomplex getAccessor() const { return *this; } + __host__ __device__ GPUcomplex getAccessor() const { return *this; } template - HOST_DEVICE GPUcomplex operator()(const Index) const { + __host__ __device__ GPUcomplex operator()(const Index) const { return *this; } }; @@ -277,56 +277,56 @@ class GPUcomplex { template <> class GPUcomplex<__half> { public: __half2 c; - HOST_DEVICE GPUcomplex(){}; + __host__ __device__ GPUcomplex(){}; - HOST_DEVICE GPUcomplex(const __half &real, const __half &imag) { + __host__ __device__ GPUcomplex(const __half &real, const __half &imag) { cREAL = real; cIMAG = imag; }; - HOST_DEVICE GPUcomplex(const __half &real) { + __host__ __device__ GPUcomplex(const __half &real) { cREAL = real; cIMAG = __float2half(0.0f); }; - HOST_DEVICE GPUcomplex(const __half2 &vec_type) { c = vec_type; }; + __host__ __device__ GPUcomplex(const __half2 &vec_type) { c = vec_type; }; - HOST_DEVICE GPUcomplex 
&operator=(const GPUcomplex &orig) { + __host__ __device__ GPUcomplex &operator=(const GPUcomplex &orig) { this->c = __float22half2_rn(orig.c); return *this; } - HOST_DEVICE GPUcomplex &operator=(const GPUcomplex &orig) { + __host__ __device__ GPUcomplex &operator=(const GPUcomplex &orig) { __half realpart = __double2half(orig.cREAL); __half imagpart = __double2half(orig.cIMAG); this->c = __halves2half2(realpart, imagpart); return *this; } - HOST_DEVICE GPUcomplex &operator=(const GPUcomplex<__half> orig) { + __host__ __device__ GPUcomplex &operator=(const GPUcomplex<__half> orig) { this->c = static_cast<__half2>(orig.c); return *this; } - HOST_DEVICE GPUcomplex &operator=(const __half &orig) { + __host__ __device__ GPUcomplex &operator=(const __half &orig) { this->cREAL = orig; this->cIMAG = 0.0f; return *this; } - HOST_DEVICE GPUcomplex &operator+=(const __half &op) { + __host__ __device__ GPUcomplex &operator+=(const __half &op) { this->cREAL += op; return *this; } - HOST_DEVICE GPUcomplex &operator+=(const GPUcomplex &op) { + __host__ __device__ GPUcomplex &operator+=(const GPUcomplex &op) { this->c += op.c; return *this; } - HOST_DEVICE GPUcomplex &operator-=(const GPUcomplex &op) { + __host__ __device__ GPUcomplex &operator-=(const GPUcomplex &op) { this->c -= op.c; return *this; } - HOST_DEVICE GPUcomplex &operator*=(const GPUcomplex &op) { + __host__ __device__ GPUcomplex &operator*=(const GPUcomplex &op) { const __half2 a_re = __half2half2(this->cREAL); __half2 acc = __hfma2(a_re, op.c, __float2half2_rn(0.0)); @@ -339,53 +339,53 @@ template <> class GPUcomplex<__half> { return *this; } - HOST_DEVICE GPUcomplex &operator*=(const __half &op) { + __host__ __device__ GPUcomplex &operator*=(const __half &op) { __half2 temp = __half2half2(op); this->c *= temp; return *this; } - HOST_DEVICE GPUcomplex &operator/=(const __half &op) { + __host__ __device__ GPUcomplex &operator/=(const __half &op) { __half2 temp = __half2half2(op); this->c /= temp; return *this; } - 
HOST_DEVICE friend GPUcomplex operator+(const GPUcomplex left, + __host__ __device__ friend GPUcomplex operator+(const GPUcomplex left, const GPUcomplex right) { return GPUcomplex(left.c + right.c); } - HOST_DEVICE friend GPUcomplex operator+(const GPUcomplex &left, + __host__ __device__ friend GPUcomplex operator+(const GPUcomplex &left, const __half &right) { return GPUcomplex(left.cREAL + right, left.cIMAG); } - HOST_DEVICE friend GPUcomplex operator+(const __half &left, + __host__ __device__ friend GPUcomplex operator+(const __half &left, const GPUcomplex &right) { return GPUcomplex(left + right.cREAL, right.cIMAG); } - HOST_DEVICE friend GPUcomplex operator-(const GPUcomplex &op) { + __host__ __device__ friend GPUcomplex operator-(const GPUcomplex &op) { return GPUcomplex(-op.c); } - HOST_DEVICE friend GPUcomplex operator-(const GPUcomplex &left, + __host__ __device__ friend GPUcomplex operator-(const GPUcomplex &left, const GPUcomplex &right) { return GPUcomplex(left.c - right.c); } - HOST_DEVICE friend GPUcomplex operator-(const GPUcomplex &left, + __host__ __device__ friend GPUcomplex operator-(const GPUcomplex &left, const __half &right) { return GPUcomplex(left.cREAL - right, left.cIMAG); } - HOST_DEVICE friend GPUcomplex operator-(const __half &left, + __host__ __device__ friend GPUcomplex operator-(const __half &left, const GPUcomplex &right) { return GPUcomplex(left - right.cREAL, -right.cIMAG); } - HOST_DEVICE friend GPUcomplex operator*(const GPUcomplex &left, + __host__ __device__ friend GPUcomplex operator*(const GPUcomplex &left, const GPUcomplex &right) { const __half2 a_re = __half2half2(left.cREAL); @@ -399,17 +399,17 @@ template <> class GPUcomplex<__half> { return GPUcomplex(acc); } - HOST_DEVICE friend GPUcomplex operator*(const GPUcomplex &left, + __host__ __device__ friend GPUcomplex operator*(const GPUcomplex &left, const __half &right) { return GPUcomplex(left.cREAL * right, left.cIMAG * right); } - HOST_DEVICE friend GPUcomplex 
operator*(const __half &left, + __host__ __device__ friend GPUcomplex operator*(const __half &left, const GPUcomplex &right) { return GPUcomplex(left * right.cREAL, left * right.cIMAG); } - HOST_DEVICE friend GPUcomplex + __host__ __device__ friend GPUcomplex fma(const GPUcomplex &a, const GPUcomplex &b, const GPUcomplex &d) { const __half2 a_re = __half2half2(a.cREAL); __half2 acc = __hfma2(a_re, b.c, d.c); @@ -420,13 +420,13 @@ template <> class GPUcomplex<__half> { return GPUcomplex(acc); } - HOST_DEVICE friend GPUcomplex fma(const __half x, const GPUcomplex &y, + __host__ __device__ friend GPUcomplex fma(const __half x, const GPUcomplex &y, const GPUcomplex &d) { __half2 xh2 = __half2half2(x); return GPUcomplex(__hfma2(xh2, y.c, d.c)); } - HOST_DEVICE void addProduct(const GPUcomplex &a, + __host__ __device__ void addProduct(const GPUcomplex &a, const GPUcomplex &b) { const __half2 a_re = __half2half2(a.cREAL); __half2 acc = __hfma2(a_re, b.c, this->c); @@ -438,19 +438,19 @@ template <> class GPUcomplex<__half> { return; } - HOST_DEVICE void addProduct(const __half &x, const GPUcomplex &y) { + __host__ __device__ void addProduct(const __half &x, const GPUcomplex &y) { __half2 xh2 = __half2half2(x); this->c = __hfma2(xh2, y.c, this->c); return; } template - HOST_DEVICE friend GPUcomplex operator/(const GPUcomplex &left, + __host__ __device__ friend GPUcomplex operator/(const GPUcomplex &left, const T &right) { return GPUcomplex(left.cREAL / right, left.cIMAG / right); } - HOST_DEVICE friend GPUcomplex operator/(const GPUcomplex &left, + __host__ __device__ friend GPUcomplex operator/(const GPUcomplex &left, const __half &right) { __half2 right2 = __half2half2(right); @@ -458,19 +458,19 @@ template <> class GPUcomplex<__half> { } template - HOST_DEVICE friend GPUcomplex operator/(const T &left, + __host__ __device__ friend GPUcomplex operator/(const T &left, const GPUcomplex &right) { return GPUcomplex( left * right.cREAL / (right.cREAL * right.cREAL + 
right.cIMAG * right.cIMAG), -left * right.cIMAG / (right.cREAL * right.cREAL + right.cIMAG * right.cIMAG)); } - HOST_DEVICE inline static GPUcomplex invalid(); + __host__ __device__ inline static GPUcomplex invalid(); - HOST_DEVICE GPUcomplex getAccessor() const { return *this; } + __host__ __device__ GPUcomplex getAccessor() const { return *this; } template - HOST_DEVICE GPUcomplex operator()(const Index) const { + __host__ __device__ GPUcomplex operator()(const Index) const { return *this; } }; @@ -478,46 +478,46 @@ template <> class GPUcomplex<__half> { #endif template -HOST_DEVICE inline floatT real(const GPUcomplex &op) { +__host__ __device__ inline floatT real(const GPUcomplex &op) { return op.cREAL; } template -HOST_DEVICE inline floatT imag(const GPUcomplex &op) { +__host__ __device__ inline floatT imag(const GPUcomplex &op) { return op.cIMAG; } template -HOST_DEVICE inline floatT abs(const GPUcomplex &op) { +__host__ __device__ inline floatT abs(const GPUcomplex &op) { floatT square = op.cREAL * op.cREAL + op.cIMAG * op.cIMAG; return sqrtf(square); } template -HOST_DEVICE inline floatT abs2(const GPUcomplex &op) { +__host__ __device__ inline floatT abs2(const GPUcomplex &op) { return op.cREAL * op.cREAL + op.cIMAG * op.cIMAG; } template -HOST_DEVICE inline GPUcomplex +__host__ __device__ inline GPUcomplex conj(const GPUcomplex &op) { return GPUcomplex(op.cREAL, -op.cIMAG); } template -HOST_DEVICE inline floatT arg(const GPUcomplex &op) { +__host__ __device__ inline floatT arg(const GPUcomplex &op) { return atan2(op.cIMAG, op.cREAL); } template -HOST_DEVICE inline GPUcomplex +__host__ __device__ inline GPUcomplex cupow(const GPUcomplex &base, const floatT &exp) { return GPUcomplex(pow(abs(base), exp) * cos(arg(base) * exp), pow(abs(base), exp) * sin(arg(base) * exp)); } template -HOST_DEVICE inline GPUcomplex +__host__ __device__ inline GPUcomplex cusqrt(const GPUcomplex &base) { return GPUcomplex(sqrt(abs(base)) * cos(arg(base) * 0.5), sqrt(abs(base)) * 
sin(arg(base) * 0.5)); @@ -527,19 +527,19 @@ template const GPUcomplex GPUcomplex_invalid(nanf(" "), nanf(" ")); template -SQCD_HOST inline std::ostream &operator<<(std::ostream &s, +__host__ inline std::ostream &operator<<(std::ostream &s, GPUcomplex z) { return s << '(' << real(z) << ',' << imag(z) << ')'; } template -HOST_DEVICE inline GPUcomplex +__host__ __device__ inline GPUcomplex GPUcomplex::invalid() { return GPUcomplex_invalid; } template -HOST_DEVICE inline bool +__host__ __device__ inline bool compareGCOMPLEX(GPUcomplex a, GPUcomplex b, floatT tol) { floatT diffRe = abs(real(a) - real(b)); floatT diffIm = abs(imag(a) - imag(b)); diff --git a/src/base/math/generalAccessor.h b/src/base/math/generalAccessor.h index 7334f1b2..af21198b 100644 --- a/src/base/math/generalAccessor.h +++ b/src/base/math/generalAccessor.h @@ -30,12 +30,12 @@ class GeneralAccessor { public: template - HOST_DEVICE inline object_memory getElementEntry(const size_t idx) const { + __host__ __device__ inline object_memory getElementEntry(const size_t idx) const { return (_elements[elem][idx]); } template - HOST_DEVICE inline void setElementEntry(const size_t idx, object_memory entry) { + __host__ __device__ inline void setElementEntry(const size_t idx, object_memory entry) { _elements[elem][idx] = static_cast(entry); } @@ -46,7 +46,7 @@ class GeneralAccessor { } /// Constructor for one memory chunk, where all entries are separated by object_count - HOST_DEVICE explicit GeneralAccessor(object_memory *elementsBase, size_t object_count) { + __host__ __device__ explicit GeneralAccessor(object_memory *elementsBase, size_t object_count) { for (size_t i = 0; i < Nentries; i++) { _elements[i] = elementsBase + i * object_count; } diff --git a/src/base/math/grnd.cpp b/src/base/math/grnd.cpp index b8a0f815..7df5e1de 100644 --- a/src/base/math/grnd.cpp +++ b/src/base/math/grnd.cpp @@ -151,7 +151,7 @@ void grnd_state::make_rng_state(unsigned int seed){ template -HOST_DEVICE uint4* 
grnd_state::getElement(gSite site){ +__host__ __device__ uint4* grnd_state::getElement(gSite site){ return &state[site.isite]; } diff --git a/src/base/math/grnd.h b/src/base/math/grnd.h index 2711b58c..79d0b69e 100644 --- a/src/base/math/grnd.h +++ b/src/base/math/grnd.h @@ -33,20 +33,20 @@ -template HOST_DEVICE inline floatT minVal(); -template<> HOST_DEVICE inline float minVal(){ return FLT_MIN; } -template<> HOST_DEVICE inline double minVal(){ return DBL_MIN; } +template __host__ __device__ inline floatT minVal(); +template<> __host__ __device__ inline float minVal(){ return FLT_MIN; } +template<> __host__ __device__ inline double minVal(){ return DBL_MIN; } /** * internal functions, should only be called from get_rand! */ -HOST_DEVICE inline unsigned taus_step( unsigned &z, int S1, int S2, int S3, unsigned M) +__host__ __device__ inline unsigned taus_step( unsigned &z, int S1, int S2, int S3, unsigned M) { unsigned b=((((z<>S2); return z=((((z &M)< -HOST_DEVICE inline floatT get_rand(uint4* state) +__host__ __device__ inline floatT get_rand(uint4* state) { return 2.3283064365386963e-10*( taus_step( state->x, 13, 19, 12, 4294967294ul)^ taus_step( state->y, 2, 25, 4, 4294967288ul)^ @@ -64,7 +64,7 @@ HOST_DEVICE inline floatT get_rand(uint4* state) /// A random variable in (0,1]. 
template -HOST_DEVICE inline floatT get_rand_excl0(uint4* state) +__host__ __device__ inline floatT get_rand_excl0(uint4* state) { floatT xR = get_rand(state); return xR + (1.0-xR)*minVal(); @@ -168,7 +168,7 @@ class grnd_state ~grnd_state(){} void make_rng_state(unsigned int seed); - HOST_DEVICE uint4* getElement(gSite site); + __host__ __device__ uint4* getElement(gSite site); gMemoryPtr& getMemPtr(){ return memory; diff --git a/src/base/math/gsu2.h b/src/base/math/gsu2.h index 9a11f787..33ff017b 100644 --- a/src/base/math/gsu2.h +++ b/src/base/math/gsu2.h @@ -16,40 +16,40 @@ template class GSU2 { public: - HOST_DEVICE GSU2() { }; + __host__ __device__ GSU2() { }; GCOMPLEX(floatT) _e11,_e12; - HOST_DEVICE GSU2(GCOMPLEX(floatT) e11, GCOMPLEX(floatT) e12) : _e11(e11), _e12(e12) {} + __host__ __device__ GSU2(GCOMPLEX(floatT) e11, GCOMPLEX(floatT) e12) : _e11(e11), _e12(e12) {} - HOST_DEVICE friend GSU2 operator+(const GSU2 &x,const GSU2 &y) { + __host__ __device__ friend GSU2 operator+(const GSU2 &x,const GSU2 &y) { return GSU2 (x._e11+y._e11,x._e12+y._e12); } - HOST_DEVICE friend GSU2 operator-(const GSU2 &x,const GSU2 &y) { + __host__ __device__ friend GSU2 operator-(const GSU2 &x,const GSU2 &y) { return GSU2 (x._e11-y._e11,x._e12-y._e12); } - HOST_DEVICE friend GSU2 operator*(const GSU2 &x,const GCOMPLEX(floatT) &y) { + __host__ __device__ friend GSU2 operator*(const GSU2 &x,const GCOMPLEX(floatT) &y) { return GSU2 (x._e11*y,x._e12*y); } - HOST_DEVICE friend GSU2 operator*(const GCOMPLEX(floatT) &x,const GSU2 &y) { + __host__ __device__ friend GSU2 operator*(const GCOMPLEX(floatT) &x,const GSU2 &y) { return GSU2 (x*y._e11,x*y._e12); } - HOST_DEVICE friend GSU2 operator*(const GSU2 &x,const floatT &y) { + __host__ __device__ friend GSU2 operator*(const GSU2 &x,const floatT &y) { return GSU2 (x._e11*y,x._e12*y); } - HOST_DEVICE friend GSU2 operator*(const floatT &x,const GSU2 &y) { + __host__ __device__ friend GSU2 operator*(const floatT &x,const GSU2 &y) { return 
GSU2 (x*y._e11,x*y._e12); } - HOST_DEVICE friend GSU2 operator/(const GSU2 &x,const floatT &y) { + __host__ __device__ friend GSU2 operator/(const GSU2 &x,const floatT &y) { return GSU2 (x._e11/y,x._e12/y); } - HOST_DEVICE friend GSU2 operator*(const GSU2 &x,const GSU2 &y) { + __host__ __device__ friend GSU2 operator*(const GSU2 &x,const GSU2 &y) { GCOMPLEX(floatT) tmp1,tmp2; tmp1=y._e12; tmp2=y._e11; @@ -58,48 +58,48 @@ class GSU2 { return GSU2 (tmp1,tmp2); } - HOST_DEVICE GSU2 &operator =(const GSU2 &y) { + __host__ __device__ GSU2 &operator =(const GSU2 &y) { _e11=y._e11; _e12=y._e12; return *this; } - HOST_DEVICE GSU2 &operator+=(const GSU2 &y) { + __host__ __device__ GSU2 &operator+=(const GSU2 &y) { _e11+=y._e11; _e12+=y._e12; return *this; } - HOST_DEVICE GSU2 &operator-=(const GSU2 &y) { + __host__ __device__ GSU2 &operator-=(const GSU2 &y) { _e11-=y._e11; _e12-=y._e12; return *this; } - HOST_DEVICE GSU2 &operator*=(const GSU2 &y) { + __host__ __device__ GSU2 &operator*=(const GSU2 &y) { *this=*this*y; return *this; } - HOST_DEVICE GSU2 &operator*=(const GCOMPLEX(floatT) &y) { + __host__ __device__ GSU2 &operator*=(const GCOMPLEX(floatT) &y) { _e11*=y; _e12*=y; return *this; } - HOST_DEVICE GSU2 &operator*=(const floatT &y) { + __host__ __device__ GSU2 &operator*=(const floatT &y) { *this=*this*y; return *this; } - HOST_DEVICE GSU2 &operator/=(const floatT &y) { + __host__ __device__ GSU2 &operator/=(const floatT &y) { *this=*this/y; return *this; } - HOST_DEVICE floatT tr2() { + __host__ __device__ floatT tr2() { return( real(_e11) ); } - HOST_DEVICE GCOMPLEX(floatT) det() { + __host__ __device__ GCOMPLEX(floatT) det() { return( real(_e11) ); } - HOST_DEVICE void unitarize() { + __host__ __device__ void unitarize() { floatT res; res = real(_e11)*real(_e11) + imag(_e11)*imag(_e11) + @@ -110,7 +110,7 @@ class GSU2 { _e12=_e12*res; } - HOST_DEVICE GSU2 dagger() const { + __host__ __device__ GSU2 dagger() const { GSU2 tmp; tmp._e11 = conj(_e11); @@ -119,7 
+119,7 @@ class GSU2 { return tmp; } - HOST_DEVICE floatT norm2() const { + __host__ __device__ floatT norm2() const { return (real(_e11)*real(_e11) + real(_e12)*real(_e12) + imag(_e11)*imag(_e11) + imag(_e12)*imag(_e12)); } @@ -133,7 +133,7 @@ class GSU2 { }; template -HOST_DEVICE inline GSU2 dagger(const GSU2 &x) { +__host__ __device__ inline GSU2 dagger(const GSU2 &x) { GSU2 tmp; tmp._e11 = conj(x._e11); tmp._e12 = - x._e12; @@ -141,13 +141,13 @@ HOST_DEVICE inline GSU2 dagger(const GSU2 &x) { } template -HOST_DEVICE inline floatT norm2(const GSU2 &x) { +__host__ __device__ inline floatT norm2(const GSU2 &x) { return ( real(x._e11)*real(x._e11) + real(x._e12)*real(x._e12) + imag(x._e11)*imag(x._e11) + imag(x._e12)*imag(x._e12) ); } template -HOST_DEVICE inline GSU2 sub12 (const GSU3 &u, const GSU3 &v) { +__host__ __device__ inline GSU2 sub12 (const GSU3 &u, const GSU3 &v) { GCOMPLEX(floatT) r00,r01,r10,r11; r00 = u.getLink00()*v.getLink00() + u.getLink01()*v.getLink10() + u.getLink02()*v.getLink20(); @@ -159,7 +159,7 @@ HOST_DEVICE inline GSU2 sub12 (const GSU3 &u, const GSU3 -HOST_DEVICE inline GSU2 sub13(const GSU3 &u, const GSU3 &v) { +__host__ __device__ inline GSU2 sub13(const GSU3 &u, const GSU3 &v) { GCOMPLEX(floatT) r00,r01,r10,r11; r00 = u.getLink00()*v.getLink00() + u.getLink01()*v.getLink10() + u.getLink02()*v.getLink20(); @@ -171,7 +171,7 @@ HOST_DEVICE inline GSU2 sub13(const GSU3 &u, const GSU3 } template -HOST_DEVICE inline GSU2 sub23(const GSU3 &u, const GSU3 &v) { +__host__ __device__ inline GSU2 sub23(const GSU3 &u, const GSU3 &v) { GCOMPLEX(floatT) r00,r01,r10,r11; r00 = u.getLink10()*v.getLink01() + u.getLink11()*v.getLink11() + u.getLink12()*v.getLink21(); @@ -183,7 +183,7 @@ HOST_DEVICE inline GSU2 sub23(const GSU3 &u, const GSU3 } template -HOST_DEVICE inline GSU3 sub12(const GSU2 &u, +__host__ __device__ inline GSU3 sub12(const GSU2 &u, const GSU3 &v) { return GSU3 (u._e11 *v.getLink00() + u._e12 *v.getLink10(), u._e11 *v.getLink01() + 
u._e12 *v.getLink11(), @@ -197,7 +197,7 @@ HOST_DEVICE inline GSU3 sub12(const GSU2 &u, } template -HOST_DEVICE inline GSU3 sub13(const GSU2 &u, const GSU3 &v) { +__host__ __device__ inline GSU3 sub13(const GSU2 &u, const GSU3 &v) { return GSU3 (u._e11 *v.getLink00() + u._e12 *v.getLink20(), u._e11 *v.getLink01() + u._e12 *v.getLink21(), u._e11 *v.getLink02() + u._e12 *v.getLink22(), @@ -210,7 +210,7 @@ HOST_DEVICE inline GSU3 sub13(const GSU2 &u, const GSU3 } template -HOST_DEVICE inline GSU3 sub23(const GSU2 &u, const GSU3 &v) { +__host__ __device__ inline GSU3 sub23(const GSU2 &u, const GSU3 &v) { return GSU3 (v.getLink00(), v.getLink01(), v.getLink02(), @@ -223,7 +223,7 @@ HOST_DEVICE inline GSU3 sub23(const GSU2 &u, const GSU3 } template -HOST_DEVICE inline floatT realtrace(const GSU3 &x) { +__host__ __device__ inline floatT realtrace(const GSU3 &x) { return ( real(x.getLink00() + x.getLink11() + x.getLink22()) ); } diff --git a/src/base/math/gsu3.h b/src/base/math/gsu3.h index 4c23728f..7be8fa71 100644 --- a/src/base/math/gsu3.h +++ b/src/base/math/gsu3.h @@ -24,73 +24,73 @@ template class GSU3; template -SQCD_HOST std::ostream &operator<<(std::ostream &, const GSU3 &); +__host__ std::ostream &operator<<(std::ostream &, const GSU3 &); template -SQCD_HOST std::istream &operator>>(std::istream &, GSU3 &); +__host__ std::istream &operator>>(std::istream &, GSU3 &); template -HOST_DEVICE inline GSU3 operator+(const GSU3 &, const GSU3 &); +__host__ __device__ inline GSU3 operator+(const GSU3 &, const GSU3 &); template -HOST_DEVICE inline GSU3 operator-(const GSU3 &, const GSU3 &); +__host__ __device__ inline GSU3 operator-(const GSU3 &, const GSU3 &); template -HOST_DEVICE inline GSU3 operator*(const GCOMPLEX(floatT) &, const GSU3 &); +__host__ __device__ inline GSU3 operator*(const GCOMPLEX(floatT) &, const GSU3 &); template -HOST_DEVICE inline GSU3 operator*(const GSU3 &, const GCOMPLEX(floatT) &); +__host__ __device__ inline GSU3 operator*(const GSU3 &, const 
GCOMPLEX(floatT) &); template -HOST_DEVICE inline GSU3 operator*(const floatT &, const GSU3 &); +__host__ __device__ inline GSU3 operator*(const floatT &, const GSU3 &); template -HOST_DEVICE inline GSU3 operator*(const GSU3 &, const floatT &); +__host__ __device__ inline GSU3 operator*(const GSU3 &, const floatT &); template -HOST_DEVICE inline GSU3 operator*(const GSU3 &, const GSU3 &); +__host__ __device__ inline GSU3 operator*(const GSU3 &, const GSU3 &); template -HOST_DEVICE inline GSU3 operator/(const GSU3 &, const floatT &); +__host__ __device__ inline GSU3 operator/(const GSU3 &, const floatT &); template -HOST_DEVICE floatT tr_d(const GSU3 &); +__host__ __device__ floatT tr_d(const GSU3 &); template -HOST_DEVICE floatT tr_i(const GSU3 &); +__host__ __device__ floatT tr_i(const GSU3 &); template -HOST_DEVICE floatT tr_d(const GSU3 &, const GSU3 &); +__host__ __device__ floatT tr_d(const GSU3 &, const GSU3 &); template -HOST_DEVICE GCOMPLEX(floatT) tr_c(const GSU3 &); +__host__ __device__ GCOMPLEX(floatT) tr_c(const GSU3 &); template -HOST_DEVICE GCOMPLEX(floatT) tr_c(const GSU3 &, const GSU3 &); +__host__ __device__ GCOMPLEX(floatT) tr_c(const GSU3 &, const GSU3 &); template -HOST_DEVICE GSU3 dagger(const GSU3 &); +__host__ __device__ GSU3 dagger(const GSU3 &); template -HOST_DEVICE GCOMPLEX(floatT) det(const GSU3 &X); +__host__ __device__ GCOMPLEX(floatT) det(const GSU3 &X); template -HOST_DEVICE floatT realdet(const GSU3 &X); +__host__ __device__ floatT realdet(const GSU3 &X); template -HOST_DEVICE floatT infnorm(const GSU3 &X); +__host__ __device__ floatT infnorm(const GSU3 &X); template -HOST_DEVICE GSU3 su3_exp(GSU3); +__host__ __device__ GSU3 su3_exp(GSU3); template -HOST_DEVICE gVect3 operator*(const GSU3 &, const gVect3 &); +__host__ __device__ gVect3 operator*(const GSU3 &, const gVect3 &); template -HOST_DEVICE GSU3 tensor_prod(const gVect3 &, const gVect3 &); +__host__ __device__ GSU3 tensor_prod(const gVect3 &, const gVect3 &); template 
-HOST_DEVICE inline bool compareGSU3(GSU3 a, GSU3 b, floatT tol=1e-13); +__host__ __device__ inline bool compareGSU3(GSU3 a, GSU3 b, floatT tol=1e-13); template class GSU3 { @@ -102,9 +102,9 @@ class GSU3 { public: constexpr GSU3(const GSU3&) = default; - HOST_DEVICE GSU3() {}; + __host__ __device__ GSU3() {}; - HOST_DEVICE GSU3(const floatT x) { + __host__ __device__ GSU3(const floatT x) { _e00 = x; _e01 = x; _e02 = x; @@ -116,7 +116,7 @@ class GSU3 { _e22 = x; }; - HOST_DEVICE GSU3(GCOMPLEX(floatT) e00, GCOMPLEX(floatT) e01, GCOMPLEX(floatT) e02, + __host__ __device__ GSU3(GCOMPLEX(floatT) e00, GCOMPLEX(floatT) e01, GCOMPLEX(floatT) e02, GCOMPLEX(floatT) e10, GCOMPLEX(floatT) e11, GCOMPLEX(floatT) e12, GCOMPLEX(floatT) e20, GCOMPLEX(floatT) e21, GCOMPLEX(floatT) e22) : _e00(e00), _e01(e01), _e02(e02), @@ -126,48 +126,48 @@ class GSU3 { #if (!defined __GPUCC__) - SQCD_HOST friend std::ostream& operator<< <> (std::ostream&, const GSU3 &); + __host__ friend std::ostream& operator<< <> (std::ostream&, const GSU3 &); #endif - SQCD_HOST friend std::istream &operator>><>(std::istream &, GSU3 &); + __host__ friend std::istream &operator>><>(std::istream &, GSU3 &); // matrix operations - HOST_DEVICE friend GSU3 operator+<>(const GSU3 &, const GSU3 &); + __host__ __device__ friend GSU3 operator+<>(const GSU3 &, const GSU3 &); - HOST_DEVICE friend GSU3 operator-<>(const GSU3 &, const GSU3 &); + __host__ __device__ friend GSU3 operator-<>(const GSU3 &, const GSU3 &); - HOST_DEVICE friend GSU3 operator*<>(const GCOMPLEX(floatT) &x, const GSU3 &y); + __host__ __device__ friend GSU3 operator*<>(const GCOMPLEX(floatT) &x, const GSU3 &y); - HOST_DEVICE friend GSU3 operator*<>(const GSU3 &x, const GCOMPLEX(floatT) &y); + __host__ __device__ friend GSU3 operator*<>(const GSU3 &x, const GCOMPLEX(floatT) &y); - HOST_DEVICE friend GSU3 operator*<>(const floatT &x, const GSU3 &y); + __host__ __device__ friend GSU3 operator*<>(const floatT &x, const GSU3 &y); - HOST_DEVICE friend GSU3 
operator*<>(const GSU3 &x, const floatT &y); + __host__ __device__ friend GSU3 operator*<>(const GSU3 &x, const floatT &y); - HOST_DEVICE friend GSU3 operator*<>(const GSU3 &, const GSU3 &); + __host__ __device__ friend GSU3 operator*<>(const GSU3 &, const GSU3 &); - HOST_DEVICE friend GSU3 operator/<>(const GSU3 &x, const floatT &y); + __host__ __device__ friend GSU3 operator/<>(const GSU3 &x, const floatT &y); - HOST_DEVICE bool operator==(const GSU3 &); + __host__ __device__ bool operator==(const GSU3 &); - HOST_DEVICE GSU3 &operator=(const GSU3 &); + __host__ __device__ GSU3 &operator=(const GSU3 &); - HOST_DEVICE GSU3 &operator+=(const GSU3 &); + __host__ __device__ GSU3 &operator+=(const GSU3 &); - HOST_DEVICE GSU3 &operator-=(const GSU3 &); + __host__ __device__ GSU3 &operator-=(const GSU3 &); - HOST_DEVICE GSU3 &operator*=(const floatT &); + __host__ __device__ GSU3 &operator*=(const floatT &); - HOST_DEVICE GSU3 &operator*=(const GCOMPLEX(floatT) &); + __host__ __device__ GSU3 &operator*=(const GCOMPLEX(floatT) &); - HOST_DEVICE GSU3 &operator*=(const GSU3 &); + __host__ __device__ GSU3 &operator*=(const GSU3 &); - HOST_DEVICE GSU3 &operator/=(const floatT &); + __host__ __device__ GSU3 &operator/=(const floatT &); // cast operations single <-> double precision template - HOST_DEVICE inline operator GSU3() const { + __host__ __device__ inline operator GSU3() const { return GSU3(GCOMPLEX(T)(_e00.cREAL, _e00.cIMAG), GCOMPLEX(T)(_e01.cREAL, _e01.cIMAG), GCOMPLEX(T)(_e02.cREAL, _e02.cIMAG), GCOMPLEX(T)(_e10.cREAL, _e10.cIMAG), GCOMPLEX(T)(_e11.cREAL, _e11.cIMAG), @@ -177,18 +177,18 @@ class GSU3 { } - HOST_DEVICE friend gVect3 + __host__ __device__ friend gVect3 operator*<>(const GSU3 &, const gVect3 &); // GSU3 * cvect3 multiplication - HOST_DEVICE friend GSU3 + __host__ __device__ friend GSU3 tensor_prod<>(const gVect3 &, const gVect3 &); // tensor product of two cvect3 - HOST_DEVICE friend bool + __host__ __device__ friend bool compareGSU3<>(GSU3 a, GSU3 b, 
floatT tol); - HOST_DEVICE void random(uint4 *state); // set links randomly - HOST_DEVICE void gauss(uint4 *state); // set links gauss - HOST_DEVICE void su3unitarize(); // project to su3 using first two rows of link - HOST_DEVICE void su3reconstruct12() // project to su3 using first two rows of link + __host__ __device__ void random(uint4 *state); // set links randomly + __host__ __device__ void gauss(uint4 *state); // set links gauss + __host__ __device__ void su3unitarize(); // project to su3 using first two rows of link + __host__ __device__ void su3reconstruct12() // project to su3 using first two rows of link { _e20 = GCOMPLEX(floatT)((_e01.cREAL * _e12.cREAL - _e01.cIMAG * _e12.cIMAG - (_e02.cREAL * _e11.cREAL - _e02.cIMAG * _e11.cIMAG)), @@ -206,7 +206,7 @@ class GSU3 { + (_e01.cIMAG * _e10.cREAL + _e01.cREAL * _e10.cIMAG))); } - HOST_DEVICE void su3reconstruct12Dagger() // project to su3 using first two rows of link + __host__ __device__ void su3reconstruct12Dagger() // project to su3 using first two rows of link { _e02 = GCOMPLEX(floatT)((_e10.cREAL * _e21.cREAL - _e10.cIMAG * _e21.cIMAG - (_e20.cREAL * _e11.cREAL - _e20.cIMAG * _e11.cIMAG)), @@ -224,7 +224,7 @@ class GSU3 { + (_e10.cIMAG * _e01.cREAL + _e10.cREAL * _e01.cIMAG))); } - HOST_DEVICE void u3reconstruct(const GCOMPLEX(floatT) phase) // project to u3 using first two rows of link + __host__ __device__ void u3reconstruct(const GCOMPLEX(floatT) phase) // project to u3 using first two rows of link { _e20 = GCOMPLEX(floatT)((_e01.cREAL * _e12.cREAL - _e01.cIMAG * _e12.cIMAG @@ -254,7 +254,7 @@ class GSU3 { _e22 *= phase; } - HOST_DEVICE void u3reconstructDagger(const GCOMPLEX(floatT) phase) // project to u3 using first two rows of link + __host__ __device__ void u3reconstructDagger(const GCOMPLEX(floatT) phase) // project to u3 using first two rows of link { _e02 = GCOMPLEX(floatT)((_e10.cREAL * _e21.cREAL - _e10.cIMAG * _e21.cIMAG @@ -282,7 +282,7 @@ class GSU3 { _e22 *= phase; } - HOST_DEVICE void 
reconstruct14(const GCOMPLEX(floatT) det) + __host__ __device__ void reconstruct14(const GCOMPLEX(floatT) det) { floatT amp = pow(abs(det), 1.0/3.0); GCOMPLEX(floatT) phase = det / abs(det); @@ -307,7 +307,7 @@ class GSU3 { _e22 *= phase/amp; } - HOST_DEVICE void reconstruct14Dagger(const GCOMPLEX(floatT) det) + __host__ __device__ void reconstruct14Dagger(const GCOMPLEX(floatT) det) { floatT amp = pow(abs(det), 1.0/3.0); @@ -331,43 +331,43 @@ class GSU3 { _e12 *= phase/amp; _e22 *= phase/amp; } - HOST_DEVICE void TA(); // traceless anti-hermitian of link - HOST_DEVICE friend floatT tr_d<>(const GSU3 &); // real part of trace of link - HOST_DEVICE friend floatT tr_i<>(const GSU3 &); // imaginary part of trace of link - HOST_DEVICE friend floatT + __host__ __device__ void TA(); // traceless anti-hermitian of link + __host__ __device__ friend floatT tr_d<>(const GSU3 &); // real part of trace of link + __host__ __device__ friend floatT tr_i<>(const GSU3 &); // imaginary part of trace of link + __host__ __device__ friend floatT tr_d<>(const GSU3 &, const GSU3 &); // real part of trace of link*link - HOST_DEVICE friend GCOMPLEX(floatT) tr_c<>(const GSU3 &); // trace of link - HOST_DEVICE friend GCOMPLEX(floatT) tr_c<>(const GSU3 &, + __host__ __device__ friend GCOMPLEX(floatT) tr_c<>(const GSU3 &); // trace of link + __host__ __device__ friend GCOMPLEX(floatT) tr_c<>(const GSU3 &, const GSU3 &); // trace of link*link - HOST_DEVICE friend GSU3 + __host__ __device__ friend GSU3 dagger<>(const GSU3 &); // hermitian conjugate - HOST_DEVICE friend GSU3 su3_exp<>(GSU3); // exp( link ) - HOST_DEVICE friend GCOMPLEX(floatT) det<>(const GSU3 &); - HOST_DEVICE friend floatT infnorm<>(const GSU3 &); + __host__ __device__ friend GSU3 su3_exp<>(GSU3); // exp( link ) + __host__ __device__ friend GCOMPLEX(floatT) det<>(const GSU3 &); + __host__ __device__ friend floatT infnorm<>(const GSU3 &); // accessors - HOST_DEVICE inline GCOMPLEX(floatT) getLink00() const; - HOST_DEVICE inline 
GCOMPLEX(floatT) getLink01() const; - HOST_DEVICE inline GCOMPLEX(floatT) getLink02() const; - HOST_DEVICE inline GCOMPLEX(floatT) getLink10() const; - HOST_DEVICE inline GCOMPLEX(floatT) getLink11() const; - HOST_DEVICE inline GCOMPLEX(floatT) getLink12() const; - HOST_DEVICE inline GCOMPLEX(floatT) getLink20() const; - HOST_DEVICE inline GCOMPLEX(floatT) getLink21() const; - HOST_DEVICE inline GCOMPLEX(floatT) getLink22() const; + __host__ __device__ inline GCOMPLEX(floatT) getLink00() const; + __host__ __device__ inline GCOMPLEX(floatT) getLink01() const; + __host__ __device__ inline GCOMPLEX(floatT) getLink02() const; + __host__ __device__ inline GCOMPLEX(floatT) getLink10() const; + __host__ __device__ inline GCOMPLEX(floatT) getLink11() const; + __host__ __device__ inline GCOMPLEX(floatT) getLink12() const; + __host__ __device__ inline GCOMPLEX(floatT) getLink20() const; + __host__ __device__ inline GCOMPLEX(floatT) getLink21() const; + __host__ __device__ inline GCOMPLEX(floatT) getLink22() const; // setters - HOST_DEVICE inline void setLink00(GCOMPLEX(floatT) x); - HOST_DEVICE inline void setLink01(GCOMPLEX(floatT) x); - HOST_DEVICE inline void setLink02(GCOMPLEX(floatT) x); - HOST_DEVICE inline void setLink10(GCOMPLEX(floatT) x); - HOST_DEVICE inline void setLink11(GCOMPLEX(floatT) x); - HOST_DEVICE inline void setLink12(GCOMPLEX(floatT) x); - HOST_DEVICE inline void setLink20(GCOMPLEX(floatT) x); - HOST_DEVICE inline void setLink21(GCOMPLEX(floatT) x); - HOST_DEVICE inline void setLink22(GCOMPLEX(floatT) x); - - HOST_DEVICE inline GCOMPLEX(floatT) &operator()(int i, int j) { + __host__ __device__ inline void setLink00(GCOMPLEX(floatT) x); + __host__ __device__ inline void setLink01(GCOMPLEX(floatT) x); + __host__ __device__ inline void setLink02(GCOMPLEX(floatT) x); + __host__ __device__ inline void setLink10(GCOMPLEX(floatT) x); + __host__ __device__ inline void setLink11(GCOMPLEX(floatT) x); + __host__ __device__ inline void setLink12(GCOMPLEX(floatT) 
x); + __host__ __device__ inline void setLink20(GCOMPLEX(floatT) x); + __host__ __device__ inline void setLink21(GCOMPLEX(floatT) x); + __host__ __device__ inline void setLink22(GCOMPLEX(floatT) x); + + __host__ __device__ inline GCOMPLEX(floatT) &operator()(int i, int j) { switch (i * 3 + j) { case 0: return _e00; @@ -392,7 +392,7 @@ class GSU3 { return _e00; } - SQCD_HOST inline const GCOMPLEX(floatT) &operator()(int i, int j) const { + __host__ inline const GCOMPLEX(floatT) &operator()(int i, int j) const { switch (i * 3 + j) { case 0: return _e00; @@ -416,112 +416,112 @@ class GSU3 { throw std::runtime_error(stdLogger.fatal("GSU3 access to element (", i, ",", j, ") not possible!")); } - HOST_DEVICE GSU3 getAccessor() const { + __host__ __device__ GSU3 getAccessor() const { return *this; } template - HOST_DEVICE GSU3 operator()(const Index) const { + __host__ __device__ GSU3 operator()(const Index) const { return *this; } }; // accessors template -HOST_DEVICE inline GCOMPLEX(floatT) GSU3::getLink00() const { +__host__ __device__ inline GCOMPLEX(floatT) GSU3::getLink00() const { return _e00; } template -HOST_DEVICE inline GCOMPLEX(floatT) GSU3::getLink01() const { +__host__ __device__ inline GCOMPLEX(floatT) GSU3::getLink01() const { return _e01; } template -HOST_DEVICE inline GCOMPLEX(floatT) GSU3::getLink02() const { +__host__ __device__ inline GCOMPLEX(floatT) GSU3::getLink02() const { return _e02; } template -HOST_DEVICE inline GCOMPLEX(floatT) GSU3::getLink10() const { +__host__ __device__ inline GCOMPLEX(floatT) GSU3::getLink10() const { return _e10; } template -HOST_DEVICE inline GCOMPLEX(floatT) GSU3::getLink11() const { +__host__ __device__ inline GCOMPLEX(floatT) GSU3::getLink11() const { return _e11; } template -HOST_DEVICE inline GCOMPLEX(floatT) GSU3::getLink12() const { +__host__ __device__ inline GCOMPLEX(floatT) GSU3::getLink12() const { return _e12; } template -HOST_DEVICE inline GCOMPLEX(floatT) GSU3::getLink20() const { +__host__ __device__ 
inline GCOMPLEX(floatT) GSU3::getLink20() const { return _e20; } template -HOST_DEVICE inline GCOMPLEX(floatT) GSU3::getLink21() const { +__host__ __device__ inline GCOMPLEX(floatT) GSU3::getLink21() const { return _e21; } template -HOST_DEVICE inline GCOMPLEX(floatT) GSU3::getLink22() const { +__host__ __device__ inline GCOMPLEX(floatT) GSU3::getLink22() const { return _e22; } // setters template -HOST_DEVICE inline void GSU3::setLink00(GCOMPLEX(floatT) x) { +__host__ __device__ inline void GSU3::setLink00(GCOMPLEX(floatT) x) { _e00 = x; } template -HOST_DEVICE inline void GSU3::setLink01(GCOMPLEX(floatT) x) { +__host__ __device__ inline void GSU3::setLink01(GCOMPLEX(floatT) x) { _e01 = x; } template -HOST_DEVICE inline void GSU3::setLink02(GCOMPLEX(floatT) x) { +__host__ __device__ inline void GSU3::setLink02(GCOMPLEX(floatT) x) { _e02 = x; } template -HOST_DEVICE inline void GSU3::setLink10(GCOMPLEX(floatT) x) { +__host__ __device__ inline void GSU3::setLink10(GCOMPLEX(floatT) x) { _e10 = x; } template -HOST_DEVICE inline void GSU3::setLink11(GCOMPLEX(floatT) x) { +__host__ __device__ inline void GSU3::setLink11(GCOMPLEX(floatT) x) { _e11 = x; } template -HOST_DEVICE inline void GSU3::setLink12(GCOMPLEX(floatT) x) { +__host__ __device__ inline void GSU3::setLink12(GCOMPLEX(floatT) x) { _e12 = x; } template -HOST_DEVICE inline void GSU3::setLink20(GCOMPLEX(floatT) x) { +__host__ __device__ inline void GSU3::setLink20(GCOMPLEX(floatT) x) { _e20 = x; } template -HOST_DEVICE inline void GSU3::setLink21(GCOMPLEX(floatT) x) { +__host__ __device__ inline void GSU3::setLink21(GCOMPLEX(floatT) x) { _e21 = x; } template -HOST_DEVICE inline void GSU3::setLink22(GCOMPLEX(floatT) x) { +__host__ __device__ inline void GSU3::setLink22(GCOMPLEX(floatT) x) { _e22 = x; } // some constant su3 matrices template -HOST_DEVICE inline GSU3 gsu3_one() { +__host__ __device__ inline GSU3 gsu3_one() { return GSU3(1, 0, 0, 0, 1, 0, 0, 0, 1); @@ -529,7 +529,7 @@ HOST_DEVICE inline GSU3 
gsu3_one() { #if ! defined(USE_HIP_AMD) && ! defined(USE_CPU_ONLY) template <> -HOST_DEVICE inline GSU3<__half> gsu3_one() { +__host__ __device__ inline GSU3<__half> gsu3_one() { GPUcomplex<__half> g_one(__float2half(1.0)); GPUcomplex<__half> g_zero(__float2half(0.0)); @@ -540,63 +540,63 @@ HOST_DEVICE inline GSU3<__half> gsu3_one() { #endif template -HOST_DEVICE inline GSU3 gsu3_zero() { +__host__ __device__ inline GSU3 gsu3_zero() { return GSU3(0, 0, 0, 0, 0, 0, 0, 0, 0); } template -HOST_DEVICE inline GSU3 glambda_1() { +__host__ __device__ inline GSU3 glambda_1() { return GSU3(0, 1, 0, 1, 0, 0, 0, 0, 0); } template -HOST_DEVICE inline GSU3 glambda_2() { +__host__ __device__ inline GSU3 glambda_2() { return GSU3(0 , -GCOMPLEX(floatT)(0, 1), 0, GCOMPLEX(floatT)(0, 1), 0 , 0, 0 , 0 , 0); } template -HOST_DEVICE inline GSU3 glambda_3() { +__host__ __device__ inline GSU3 glambda_3() { return GSU3(1, 0 , 0, 0, -1, 0, 0, 0 , 0); } template -HOST_DEVICE inline GSU3 glambda_4() { +__host__ __device__ inline GSU3 glambda_4() { return GSU3(0, 0, 1, 0, 0, 0, 1, 0, 0); } template -HOST_DEVICE inline GSU3 glambda_5() { +__host__ __device__ inline GSU3 glambda_5() { return GSU3(0 , 0, -GCOMPLEX(floatT)(0, 1), 0 , 0, 0, GCOMPLEX(floatT)(0, 1), 0, 0); } template -HOST_DEVICE inline GSU3 glambda_6() { +__host__ __device__ inline GSU3 glambda_6() { return GSU3(0, 0, 0, 0, 0, 1, 0, 1, 0); } template -HOST_DEVICE inline GSU3 glambda_7() { +__host__ __device__ inline GSU3 glambda_7() { return GSU3(0, 0 , 0, 0, 0 , -GCOMPLEX(floatT)(0, 1), 0, GCOMPLEX(floatT)(0, 1), 0); } template -HOST_DEVICE inline GSU3 glambda_8() { +__host__ __device__ inline GSU3 glambda_8() { return GSU3(1 / sqrt(3), 0 , 0, 0 , 1 / sqrt(3), 0, 0 , 0 , -2 / sqrt(3)); @@ -607,7 +607,7 @@ HOST_DEVICE inline GSU3 glambda_8() { // matrix operations template -HOST_DEVICE GSU3 operator+(const GSU3 &x, const GSU3 &y) { +__host__ __device__ GSU3 operator+(const GSU3 &x, const GSU3 &y) { return GSU3( x._e00 + y._e00, 
x._e01 + y._e01, x._e02 + y._e02, x._e10 + y._e10, x._e11 + y._e11, x._e12 + y._e12, @@ -615,7 +615,7 @@ HOST_DEVICE GSU3 operator+(const GSU3 &x, const GSU3 &y) } template -HOST_DEVICE GSU3 operator-(const GSU3 &x, const GSU3 &y) { +__host__ __device__ GSU3 operator-(const GSU3 &x, const GSU3 &y) { return GSU3( x._e00 - y._e00, x._e01 - y._e01, x._e02 - y._e02, x._e10 - y._e10, x._e11 - y._e11, x._e12 - y._e12, @@ -624,7 +624,7 @@ HOST_DEVICE GSU3 operator-(const GSU3 &x, const GSU3 &y) template -HOST_DEVICE GSU3 operator*(const GCOMPLEX(floatT) &x, const GSU3 &y) { +__host__ __device__ GSU3 operator*(const GCOMPLEX(floatT) &x, const GSU3 &y) { return GSU3( x * y._e00, x * y._e01, x * y._e02, x * y._e10, x * y._e11, x * y._e12, @@ -632,7 +632,7 @@ HOST_DEVICE GSU3 operator*(const GCOMPLEX(floatT) &x, const GSU3 } template -HOST_DEVICE GSU3 operator*(const GSU3 &x, const GCOMPLEX(floatT) &y) { +__host__ __device__ GSU3 operator*(const GSU3 &x, const GCOMPLEX(floatT) &y) { return GSU3( x._e00 * y, x._e01 * y, x._e02 * y, x._e10 * y, x._e11 * y, x._e12 * y, @@ -640,7 +640,7 @@ HOST_DEVICE GSU3 operator*(const GSU3 &x, const GCOMPLEX(floatT) } template -HOST_DEVICE GSU3 operator*(const floatT &x, const GSU3 &y) { +__host__ __device__ GSU3 operator*(const floatT &x, const GSU3 &y) { return GSU3( x * y._e00, x * y._e01, x * y._e02, x * y._e10, x * y._e11, x * y._e12, @@ -648,7 +648,7 @@ HOST_DEVICE GSU3 operator*(const floatT &x, const GSU3 &y) { } template -HOST_DEVICE GSU3 operator*(const GSU3 &x, const floatT &y) { +__host__ __device__ GSU3 operator*(const GSU3 &x, const floatT &y) { return GSU3( x._e00 * y, x._e01 * y, x._e02 * y, x._e10 * y, x._e11 * y, x._e12 * y, @@ -656,7 +656,7 @@ HOST_DEVICE GSU3 operator*(const GSU3 &x, const floatT &y) { } template -HOST_DEVICE GSU3 operator/(const GSU3 &x, const floatT &y) { +__host__ __device__ GSU3 operator/(const GSU3 &x, const floatT &y) { return GSU3( x._e00 / y, x._e01 / y, x._e02 / y, x._e10 / y, x._e11 / y, x._e12 / 
y, @@ -665,7 +665,7 @@ HOST_DEVICE GSU3 operator/(const GSU3 &x, const floatT &y) { template -HOST_DEVICE GSU3 operator*(const GSU3 &x, const GSU3 &y) { +__host__ __device__ GSU3 operator*(const GSU3 &x, const GSU3 &y) { GCOMPLEX(floatT) tmp00, tmp01, tmp02, tmp10, tmp11, tmp12, tmp20, tmp21, tmp22; @@ -688,7 +688,7 @@ HOST_DEVICE GSU3 operator*(const GSU3 &x, const GSU3 &y) // su3 * cvect3 multiplication template -HOST_DEVICE gVect3 operator*(const GSU3 &x, const gVect3 &y) { +__host__ __device__ gVect3 operator*(const GSU3 &x, const gVect3 &y) { GCOMPLEX(floatT) tmp0, tmp1, tmp2; tmp0 = x._e00 * y._v0 + x._e01 * y._v1 + x._e02 * y._v2; @@ -700,7 +700,7 @@ HOST_DEVICE gVect3 operator*(const GSU3 &x, const gVect3 template -HOST_DEVICE inline GSU3 &GSU3::operator=(const GSU3 &y) { +__host__ __device__ inline GSU3 &GSU3::operator=(const GSU3 &y) { _e00 = y._e00; _e01 = y._e01; _e02 = y._e02; @@ -714,7 +714,7 @@ HOST_DEVICE inline GSU3 &GSU3::operator=(const GSU3 &y) } template -HOST_DEVICE GSU3 &GSU3::operator+=(const GSU3 &y) { +__host__ __device__ GSU3 &GSU3::operator+=(const GSU3 &y) { _e00 += y._e00; _e01 += y._e01; _e02 += y._e02; @@ -728,7 +728,7 @@ HOST_DEVICE GSU3 &GSU3::operator+=(const GSU3 &y) { } template -HOST_DEVICE GSU3 &GSU3::operator-=(const GSU3 &y) { +__host__ __device__ GSU3 &GSU3::operator-=(const GSU3 &y) { _e00 -= y._e00; _e01 -= y._e01; _e02 -= y._e02; @@ -742,13 +742,13 @@ HOST_DEVICE GSU3 &GSU3::operator-=(const GSU3 &y) { } template -HOST_DEVICE GSU3 &GSU3::operator*=(const floatT &y) { +__host__ __device__ GSU3 &GSU3::operator*=(const floatT &y) { *this = *this * y; return *this; } template -HOST_DEVICE GSU3 &GSU3::operator*=(const GCOMPLEX(floatT) &y) { +__host__ __device__ GSU3 &GSU3::operator*=(const GCOMPLEX(floatT) &y) { _e00 *= y; _e01 *= y; _e02 *= y; @@ -762,13 +762,13 @@ HOST_DEVICE GSU3 &GSU3::operator*=(const GCOMPLEX(floatT) &y) { } template -HOST_DEVICE GSU3 &GSU3::operator*=(const GSU3 &y) { +__host__ __device__ GSU3 
&GSU3::operator*=(const GSU3 &y) { *this = *this * y; return *this; } template -HOST_DEVICE GSU3 &GSU3::operator/=(const floatT &y) { +__host__ __device__ GSU3 &GSU3::operator/=(const floatT &y) { *this = *this / y; return *this; } @@ -777,7 +777,7 @@ HOST_DEVICE GSU3 &GSU3::operator/=(const floatT &y) { /// tolerance for comparison. In that case please look to the compareGSU3 method. In case you are comparing with the /// zero matrix, you should use compareGSU3, as the present method seems not to work for that case. template -HOST_DEVICE bool GSU3::operator==(const GSU3 &y) { +__host__ __device__ bool GSU3::operator==(const GSU3 &y) { if (_e00 == y._e00 && _e01 == y._e01 && _e02 == y._e02 && @@ -792,7 +792,7 @@ HOST_DEVICE bool GSU3::operator==(const GSU3 &y) { } template -SQCD_HOST inline std::ostream &operator<<(std::ostream &s, const GSU3 &x) { +__host__ inline std::ostream &operator<<(std::ostream &s, const GSU3 &x) { return s << "\n" << x.getLink00() << x.getLink01() << x.getLink02() << "\n" << x.getLink10() << x.getLink11() << x.getLink12() << "\n" << x.getLink20() << x.getLink21() << x.getLink22() << "\n"; @@ -800,7 +800,7 @@ return s << "\n" << x.getLink00() << x.getLink01() << x.getLink02() << "\n" /// TODO: This presumably doesn't work template -SQCD_HOST inline std::istream &operator>>(std::istream &s, GSU3 &x) { +__host__ inline std::istream &operator>>(std::istream &s, GSU3 &x) { return s >> x._e00.cREAL >> x._e00.cIMAG >> x._e01.cREAL >> x._e01.cIMAG >> x._e02.cREAL >> x._e02.cIMAG >> x._e10.cREAL >> x._e10.cIMAG >> x._e11.cREAL >> x._e11.cIMAG >> x._e12.cREAL >> x._e12.cIMAG >> x._e20.cREAL >> x._e20.cIMAG >> x._e21.cREAL >> x._e21.cIMAG >> x._e22.cREAL >> x._e22.cIMAG; @@ -808,7 +808,7 @@ SQCD_HOST inline std::istream &operator>>(std::istream &s, GSU3 &x) { template -HOST_DEVICE void GSU3::random(uint4 *state) { +__host__ __device__ void GSU3::random(uint4 *state) { GCOMPLEX(floatT) rnd; @@ -837,7 +837,7 @@ HOST_DEVICE void GSU3::random(uint4 
*state) { template -HOST_DEVICE void GSU3::gauss(uint4 *state) { +__host__ __device__ void GSU3::gauss(uint4 *state) { #ifndef USE_CPU_ONLY if constexpr (!std::is_same::value) { #endif @@ -903,7 +903,7 @@ HOST_DEVICE void GSU3::gauss(uint4 *state) { // project to su3 using first two rows of link template -HOST_DEVICE void GSU3::su3unitarize() { +__host__ __device__ void GSU3::su3unitarize() { #ifndef USE_CPU_ONLY if constexpr (!std::is_same::value) { #endif @@ -1034,7 +1034,7 @@ HOST_DEVICE void GSU3::su3unitarize() { } template -HOST_DEVICE GCOMPLEX(floatT) det(const GSU3 &x) { +__host__ __device__ GCOMPLEX(floatT) det(const GSU3 &x) { GCOMPLEX(floatT) res; @@ -1046,12 +1046,12 @@ HOST_DEVICE GCOMPLEX(floatT) det(const GSU3 &x) { } template -HOST_DEVICE floatT realdet(const GSU3 &x) { +__host__ __device__ floatT realdet(const GSU3 &x) { return det(x).cREAL; } template -HOST_DEVICE floatT infnorm(const GSU3 &x) { +__host__ __device__ floatT infnorm(const GSU3 &x) { floatT res = x._e00.cREAL * x._e00.cREAL; res = x._e00.cIMAG * x._e00.cIMAG + res; res = x._e01.cREAL * x._e01.cREAL + res; @@ -1084,7 +1084,7 @@ HOST_DEVICE floatT infnorm(const GSU3 &x) { // traceless anti-hermitian of link template -HOST_DEVICE void GSU3::TA() { +__host__ __device__ void GSU3::TA() { GSU3 tmp; tmp._e00 = GCOMPLEX(floatT)(0, 0.6666666666666666 * _e00.cIMAG - 0.3333333333333333 * (_e11.cIMAG + _e22.cIMAG)); @@ -1102,19 +1102,19 @@ HOST_DEVICE void GSU3::TA() { // real part of trace of link template -HOST_DEVICE floatT tr_d(const GSU3 &x) { +__host__ __device__ floatT tr_d(const GSU3 &x) { return floatT(x._e00.cREAL + x._e11.cREAL + x._e22.cREAL); } // imaginary part of trace of link template -HOST_DEVICE floatT tr_i(const GSU3 &x) { +__host__ __device__ floatT tr_i(const GSU3 &x) { return floatT(x._e00.cIMAG + x._e11.cIMAG + x._e22.cIMAG); } // real part of trace of link*link template -HOST_DEVICE floatT tr_d(const GSU3 &x, const GSU3 &y) { +__host__ __device__ floatT tr_d(const GSU3 
&x, const GSU3 &y) { floatT res; res = (x._e00 * y._e00).cREAL + (x._e01 * y._e10).cREAL + (x._e02 * y._e20).cREAL + (x._e10 * y._e01).cREAL + (x._e11 * y._e11).cREAL + (x._e12 * y._e21).cREAL @@ -1125,13 +1125,13 @@ HOST_DEVICE floatT tr_d(const GSU3 &x, const GSU3 &y) { // trace of link template -HOST_DEVICE GCOMPLEX(floatT) tr_c(const GSU3 &x) { +__host__ __device__ GCOMPLEX(floatT) tr_c(const GSU3 &x) { return GCOMPLEX(floatT)(x._e00 + x._e11 + x._e22); } // trace of link*link template -HOST_DEVICE GCOMPLEX(floatT) tr_c(const GSU3 &x, const GSU3 &y) { +__host__ __device__ GCOMPLEX(floatT) tr_c(const GSU3 &x, const GSU3 &y) { GCOMPLEX(floatT) res; @@ -1144,7 +1144,7 @@ HOST_DEVICE GCOMPLEX(floatT) tr_c(const GSU3 &x, const GSU3 &y) // hermitian conjugate template -HOST_DEVICE GSU3 dagger(const GSU3 &x) { +__host__ __device__ GSU3 dagger(const GSU3 &x) { GSU3 tmp; tmp._e00 = conj(x._e00); @@ -1162,7 +1162,7 @@ HOST_DEVICE GSU3 dagger(const GSU3 &x) { // exp( link ) template -HOST_DEVICE GSU3 su3_exp(GSU3 u) { +__host__ __device__ GSU3 su3_exp(GSU3 u) { GSU3 res; res = gsu3_one() @@ -1177,7 +1177,7 @@ HOST_DEVICE GSU3 su3_exp(GSU3 u) { // tensor product of two cvect3 template -HOST_DEVICE GSU3 tensor_prod(const gVect3 &x, const gVect3 &y) { +__host__ __device__ GSU3 tensor_prod(const gVect3 &x, const gVect3 &y) { GSU3 res; res._e00 = x._v0 * y._v0; @@ -1194,7 +1194,7 @@ HOST_DEVICE GSU3 tensor_prod(const gVect3 &x, const gVect3 -HOST_DEVICE inline bool compareGSU3(GSU3 a, GSU3 b, floatT tol) { +__host__ __device__ inline bool compareGSU3(GSU3 a, GSU3 b, floatT tol) { for (int i = 0; i < 3; i++) for (int j = 0; j < 3; j++) { diff --git a/src/base/math/gvect3.h b/src/base/math/gvect3.h index e4bca291..f859c613 100644 --- a/src/base/math/gvect3.h +++ b/src/base/math/gvect3.h @@ -19,29 +19,29 @@ template class GSU3; template class gVect3; template class cVect3; template class gVect3array; -template SQCD_HOST std::ostream & operator<<(std::ostream &, const gVect3 &); 
-template SQCD_HOST std::istream & operator>>(std::istream &, gVect3 &); -template HOST_DEVICE GCOMPLEX(floatT) operator*(const gVect3 &,const gVect3 &); -template HOST_DEVICE GCOMPLEX(floatT) complex_product(const gVect3 &,const gVect3 &); -template HOST_DEVICE GCOMPLEX(floatT) complex_product_add(const gVect3 &,const gVect3 &, const GCOMPLEX(floatT) &); - - -template HOST_DEVICE gVect3 operator+(const gVect3 &,const gVect3 &); -template HOST_DEVICE gVect3 operator-(const gVect3 &,const gVect3 &); -template HOST_DEVICE gVect3 operator*(const floatT &,const gVect3 &); -template HOST_DEVICE gVect3 operator*(const GCOMPLEX(floatT) &,const gVect3 &); -template HOST_DEVICE gVect3 operator*(const gVect3 &,const floatT &); -template HOST_DEVICE gVect3 operator*(const gVect3 &,const GCOMPLEX(floatT) &); -template HOST_DEVICE gVect3 conj(const gVect3 &); -template HOST_DEVICE floatT norm2(const gVect3 &); -template HOST_DEVICE GCOMPLEX(floatT) dot_prod(const gVect3 &,const gVect3 &); -template HOST_DEVICE floatT re_dot_prod(const gVect3 &,const gVect3 &); -template HOST_DEVICE gVect3 operator*(const GSU3 &,const gVect3 &); -template HOST_DEVICE GSU3 tensor_prod(const gVect3 &,const gVect3 &); -template HOST_DEVICE inline floatT minVal(); +template __host__ std::ostream & operator<<(std::ostream &, const gVect3 &); +template __host__ std::istream & operator>>(std::istream &, gVect3 &); +template __host__ __device__ GCOMPLEX(floatT) operator*(const gVect3 &,const gVect3 &); +template __host__ __device__ GCOMPLEX(floatT) complex_product(const gVect3 &,const gVect3 &); +template __host__ __device__ GCOMPLEX(floatT) complex_product_add(const gVect3 &,const gVect3 &, const GCOMPLEX(floatT) &); + + +template __host__ __device__ gVect3 operator+(const gVect3 &,const gVect3 &); +template __host__ __device__ gVect3 operator-(const gVect3 &,const gVect3 &); +template __host__ __device__ gVect3 operator*(const floatT &,const gVect3 &); +template __host__ __device__ gVect3 
operator*(const GCOMPLEX(floatT) &,const gVect3 &); +template __host__ __device__ gVect3 operator*(const gVect3 &,const floatT &); +template __host__ __device__ gVect3 operator*(const gVect3 &,const GCOMPLEX(floatT) &); +template __host__ __device__ gVect3 conj(const gVect3 &); +template __host__ __device__ floatT norm2(const gVect3 &); +template __host__ __device__ GCOMPLEX(floatT) dot_prod(const gVect3 &,const gVect3 &); +template __host__ __device__ floatT re_dot_prod(const gVect3 &,const gVect3 &); +template __host__ __device__ gVect3 operator*(const GSU3 &,const gVect3 &); +template __host__ __device__ GSU3 tensor_prod(const gVect3 &,const gVect3 &); +template __host__ __device__ inline floatT minVal(); template -HOST_DEVICE inline floatT get_rand(uint4* state); +__host__ __device__ inline floatT get_rand(uint4* state); template class gVect3 @@ -52,43 +52,43 @@ class gVect3 public: - HOST_DEVICE gVect3() {}; - HOST_DEVICE gVect3(GCOMPLEX(floatT) v0) : _v0(v0), _v1(v0), _v2(v0) {}; - HOST_DEVICE gVect3(floatT v0) : _v0(v0), _v1(v0), _v2(v0) {}; - HOST_DEVICE gVect3(GCOMPLEX(floatT) v0, GCOMPLEX(floatT) v1, GCOMPLEX(floatT) v2) : _v0(v0), _v1(v1), _v2(v2) {}; + __host__ __device__ gVect3() {}; + __host__ __device__ gVect3(GCOMPLEX(floatT) v0) : _v0(v0), _v1(v0), _v2(v0) {}; + __host__ __device__ gVect3(floatT v0) : _v0(v0), _v1(v0), _v2(v0) {}; + __host__ __device__ gVect3(GCOMPLEX(floatT) v0, GCOMPLEX(floatT) v1, GCOMPLEX(floatT) v2) : _v0(v0), _v1(v1), _v2(v2) {}; #if (!defined __GPUCC__) - SQCD_HOST friend std::ostream &operator << <> (std::ostream &, const gVect3 &); + __host__ friend std::ostream &operator << <> (std::ostream &, const gVect3 &); #endif - SQCD_HOST friend std::istream &operator >> <> (std::istream &, gVect3 &); + __host__ friend std::istream &operator >> <> (std::istream &, gVect3 &); friend class gVect3array; friend class gVect3array; // vector operations - HOST_DEVICE gVect3 &operator =(const gVect3 &); - HOST_DEVICE gVect3 
&operator-=(const gVect3 &); - HOST_DEVICE gVect3 &operator+=(const gVect3 &); - HOST_DEVICE gVect3 &operator*=(const floatT &); - HOST_DEVICE gVect3 &operator*=(const GCOMPLEX(floatT) &); - HOST_DEVICE friend GCOMPLEX(floatT) operator* <> (const gVect3 &,const gVect3 &); - HOST_DEVICE friend GCOMPLEX(floatT) complex_product <> (const gVect3 &,const gVect3 &); - HOST_DEVICE friend GCOMPLEX(floatT) complex_product_add <> (const gVect3 &,const gVect3 &, const GCOMPLEX(floatT) & ); - HOST_DEVICE friend gVect3 operator+ <> (const gVect3 &,const gVect3 &); - HOST_DEVICE friend gVect3 operator- <> (const gVect3 &,const gVect3 &); - HOST_DEVICE friend gVect3 operator* <> (const floatT &,const gVect3 &); - HOST_DEVICE friend gVect3 operator* <> (const GCOMPLEX(floatT) &,const gVect3 &); - HOST_DEVICE friend gVect3 operator* <> (const gVect3 &,const floatT &); - HOST_DEVICE friend gVect3 operator* <> (const gVect3 &,const GCOMPLEX(floatT) &); - - HOST_DEVICE friend gVect3 conj <> (const gVect3 &); // complex conjugate - HOST_DEVICE friend floatT norm2 <> (const gVect3 &); // norm2 - HOST_DEVICE friend GCOMPLEX(floatT) dot_prod <> (const gVect3&, const gVect3&); // true complex dot product - HOST_DEVICE friend floatT re_dot_prod <> (const gVect3 &,const gVect3 &); // real part of dot product + __host__ __device__ gVect3 &operator =(const gVect3 &); + __host__ __device__ gVect3 &operator-=(const gVect3 &); + __host__ __device__ gVect3 &operator+=(const gVect3 &); + __host__ __device__ gVect3 &operator*=(const floatT &); + __host__ __device__ gVect3 &operator*=(const GCOMPLEX(floatT) &); + __host__ __device__ friend GCOMPLEX(floatT) operator* <> (const gVect3 &,const gVect3 &); + __host__ __device__ friend GCOMPLEX(floatT) complex_product <> (const gVect3 &,const gVect3 &); + __host__ __device__ friend GCOMPLEX(floatT) complex_product_add <> (const gVect3 &,const gVect3 &, const GCOMPLEX(floatT) & ); + __host__ __device__ friend gVect3 operator+ <> (const gVect3 &,const gVect3 
&); + __host__ __device__ friend gVect3 operator- <> (const gVect3 &,const gVect3 &); + __host__ __device__ friend gVect3 operator* <> (const floatT &,const gVect3 &); + __host__ __device__ friend gVect3 operator* <> (const GCOMPLEX(floatT) &,const gVect3 &); + __host__ __device__ friend gVect3 operator* <> (const gVect3 &,const floatT &); + __host__ __device__ friend gVect3 operator* <> (const gVect3 &,const GCOMPLEX(floatT) &); + + __host__ __device__ friend gVect3 conj <> (const gVect3 &); // complex conjugate + __host__ __device__ friend floatT norm2 <> (const gVect3 &); // norm2 + __host__ __device__ friend GCOMPLEX(floatT) dot_prod <> (const gVect3&, const gVect3&); // true complex dot product + __host__ __device__ friend floatT re_dot_prod <> (const gVect3 &,const gVect3 &); // real part of dot product template - HOST_DEVICE void random( rndstateT * const); // set gvect3 randomly - HOST_DEVICE void gauss( uint4 * state ) + __host__ __device__ void random( rndstateT * const); // set gvect3 randomly + __host__ __device__ void gauss( uint4 * state ) { #if ! defined(USE_HIP_AMD) && ! 
defined(USE_CPU_ONLY) if constexpr (!std::is_same::value) { @@ -142,58 +142,58 @@ class gVect3 // cast operations single <-> double precision template - HOST_DEVICE operator gVect3 () const { + __host__ __device__ operator gVect3 () const { return gVect3( GCOMPLEX(T)(_v0.cREAL, _v0.cIMAG), GCOMPLEX(T)(_v1.cREAL, _v1.cIMAG), GCOMPLEX(T)(_v2.cREAL, _v2.cIMAG) ); } - HOST_DEVICE friend gVect3 operator* <> (const GSU3 &,const gVect3 &); // gsu3 * gvect3 multiplication - HOST_DEVICE friend GSU3 tensor_prod <> (const gVect3 &,const gVect3 &); // tensor product of two gvect3 + __host__ __device__ friend gVect3 operator* <> (const GSU3 &,const gVect3 &); // gsu3 * gvect3 multiplication + __host__ __device__ friend GSU3 tensor_prod <> (const gVect3 &,const gVect3 &); // tensor product of two gvect3 - HOST_DEVICE inline GCOMPLEX(floatT) getElement0() const { + __host__ __device__ inline GCOMPLEX(floatT) getElement0() const { return _v0; }; - HOST_DEVICE inline GCOMPLEX(floatT) getElement1()const { + __host__ __device__ inline GCOMPLEX(floatT) getElement1()const { return _v1; }; - HOST_DEVICE inline GCOMPLEX(floatT) getElement2() const { + __host__ __device__ inline GCOMPLEX(floatT) getElement2() const { return _v2; }; - HOST_DEVICE inline void addtoElement0(const GCOMPLEX(floatT) a){ + __host__ __device__ inline void addtoElement0(const GCOMPLEX(floatT) a){ _v0 += a; } - HOST_DEVICE inline void addtoElement1(const GCOMPLEX(floatT) a){ + __host__ __device__ inline void addtoElement1(const GCOMPLEX(floatT) a){ _v1 += a; } - HOST_DEVICE inline void addtoElement2(const GCOMPLEX(floatT) a){ + __host__ __device__ inline void addtoElement2(const GCOMPLEX(floatT) a){ _v2 += a; } - HOST_DEVICE inline void setElement0(const GCOMPLEX(floatT)& a){ + __host__ __device__ inline void setElement0(const GCOMPLEX(floatT)& a){ _v0 = a; } - HOST_DEVICE inline void setElement1(const GCOMPLEX(floatT)& a){ + __host__ __device__ inline void setElement1(const GCOMPLEX(floatT)& a){ _v1 = a; } - 
HOST_DEVICE inline void setElement2(const GCOMPLEX(floatT)& a){ + __host__ __device__ inline void setElement2(const GCOMPLEX(floatT)& a){ _v2 = a; } - HOST_DEVICE inline void subfromElement0(const GCOMPLEX(floatT) a){ + __host__ __device__ inline void subfromElement0(const GCOMPLEX(floatT) a){ _v0 -= a; } - HOST_DEVICE inline void subfromElement1(const GCOMPLEX(floatT) a){ + __host__ __device__ inline void subfromElement1(const GCOMPLEX(floatT) a){ _v1 -= a; } - HOST_DEVICE inline void subfromElement2(const GCOMPLEX(floatT) a){ + __host__ __device__ inline void subfromElement2(const GCOMPLEX(floatT) a){ _v2 -= a; } - HOST_DEVICE inline GCOMPLEX(floatT)& operator() (int i) { + __host__ __device__ inline GCOMPLEX(floatT)& operator() (int i) { switch (i) { case 0: return _v0; @@ -207,12 +207,12 @@ class gVect3 } - HOST_DEVICE gVect3 getAccessor() const{ + __host__ __device__ gVect3 getAccessor() const{ return *this; } template - HOST_DEVICE gVect3 operator()(const Index) const { + __host__ __device__ gVect3 operator()(const Index) const { return *this; } }; @@ -220,7 +220,7 @@ class gVect3 // gvect3 = (1,0,0) or (0,1,0) or (0,0,1) template -HOST_DEVICE inline gVect3 gvect3_unity(const int& i) +__host__ __device__ inline gVect3 gvect3_unity(const int& i) { switch ( i ) { @@ -251,7 +251,7 @@ return gVect3<__half> (__float2half(1), __float2half(0), __float2half(0)); #endif // cvect3 = (1,1,1) template -HOST_DEVICE inline gVect3 gvect3_one() +__host__ __device__ inline gVect3 gvect3_one() { return gVect3 (1, 1, 1); } @@ -260,7 +260,7 @@ HOST_DEVICE inline gVect3 gvect3_one() // cvect3 = (0,0,0) template -HOST_DEVICE inline gVect3 gvect3_zero() +__host__ __device__ inline gVect3 gvect3_zero() { return gVect3 (0, 0, 0); } @@ -272,7 +272,7 @@ __device__ inline gVect3<__half> gvect3_zero() } #endif template -HOST_DEVICE gVect3 &gVect3::operator=(const gVect3 &y) +__host__ __device__ gVect3 &gVect3::operator=(const gVect3 &y) { _v0 = y._v0; _v1 = y._v1; @@ -281,7 +281,7 @@ 
HOST_DEVICE gVect3 &gVect3::operator=(const gVect3 &y) } template -HOST_DEVICE gVect3 &gVect3::operator-=(const gVect3 &y) +__host__ __device__ gVect3 &gVect3::operator-=(const gVect3 &y) { _v0-= y._v0; _v1-= y._v1; @@ -290,7 +290,7 @@ HOST_DEVICE gVect3 &gVect3::operator-=(const gVect3 &y) } template -HOST_DEVICE gVect3 &gVect3::operator+=(const gVect3 &y) +__host__ __device__ gVect3 &gVect3::operator+=(const gVect3 &y) { _v0+= y._v0; _v1+= y._v1; @@ -299,7 +299,7 @@ HOST_DEVICE gVect3 &gVect3::operator+=(const gVect3 &y) } template -HOST_DEVICE gVect3 &gVect3::operator*=(const floatT &y) +__host__ __device__ gVect3 &gVect3::operator*=(const floatT &y) { _v0*= y; _v1*= y; @@ -308,7 +308,7 @@ HOST_DEVICE gVect3 &gVect3::operator*=(const floatT &y) } template -HOST_DEVICE gVect3 &gVect3::operator*=(const GCOMPLEX(floatT) &y) +__host__ __device__ gVect3 &gVect3::operator*=(const GCOMPLEX(floatT) &y) { _v0*= y; _v1*= y; @@ -317,7 +317,7 @@ HOST_DEVICE gVect3 &gVect3::operator*=(const GCOMPLEX(floatT) &y } template -HOST_DEVICE GCOMPLEX(floatT) operator*(const gVect3 &x,const gVect3 &y) +__host__ __device__ GCOMPLEX(floatT) operator*(const gVect3 &x,const gVect3 &y) { GCOMPLEX(floatT) res = conj(x._v0) * y._v0; res += conj(x._v1) * y._v1; @@ -326,7 +326,7 @@ HOST_DEVICE GCOMPLEX(floatT) operator*(const gVect3 &x,const gVect3 -HOST_DEVICE GCOMPLEX(floatT) complex_product(const gVect3 &x,const gVect3 &y) +__host__ __device__ GCOMPLEX(floatT) complex_product(const gVect3 &x,const gVect3 &y) { // GCOMPLEX(floatT) res = x._v0 *(y._v0); // res += x._v1 * (y._v1); @@ -340,7 +340,7 @@ HOST_DEVICE GCOMPLEX(floatT) complex_product(const gVect3 &x,const gVect } template -HOST_DEVICE GCOMPLEX(floatT) complex_product_add(const gVect3 &x,const gVect3 &y, const GCOMPLEX(floatT) &d) +__host__ __device__ GCOMPLEX(floatT) complex_product_add(const gVect3 &x,const gVect3 &y, const GCOMPLEX(floatT) &d) { //GCOMPLEX(floatT) res = x._v0 *(y._v0); //res += x._v1 * (y._v1); @@ -352,7 +352,7 
@@ HOST_DEVICE GCOMPLEX(floatT) complex_product_add(const gVect3 &x,const g } template -HOST_DEVICE gVect3 operator+(const gVect3 &x,const gVect3 &y) +__host__ __device__ gVect3 operator+(const gVect3 &x,const gVect3 &y) { gVect3 z; z._v0 = x._v0 + y._v0; @@ -362,7 +362,7 @@ HOST_DEVICE gVect3 operator+(const gVect3 &x,const gVect3 -HOST_DEVICE gVect3 operator-(const gVect3 &x,const gVect3 &y) +__host__ __device__ gVect3 operator-(const gVect3 &x,const gVect3 &y) { gVect3 z; z._v0 = x._v0 - y._v0; @@ -372,7 +372,7 @@ HOST_DEVICE gVect3 operator-(const gVect3 &x,const gVect3 -HOST_DEVICE gVect3 operator*(const GCOMPLEX(floatT)& x,const gVect3& y) +__host__ __device__ gVect3 operator*(const GCOMPLEX(floatT)& x,const gVect3& y) { gVect3 z; z._v0 = x * y._v0; @@ -382,7 +382,7 @@ HOST_DEVICE gVect3 operator*(const GCOMPLEX(floatT)& x,const gVect3 -HOST_DEVICE gVect3 operator*(const floatT & x,const gVect3& y) +__host__ __device__ gVect3 operator*(const floatT & x,const gVect3& y) { gVect3 z; z._v0 = x * y._v0; @@ -392,7 +392,7 @@ HOST_DEVICE gVect3 operator*(const floatT & x,const gVect3& y) } template -HOST_DEVICE gVect3 operator*(const gVect3& x,const GCOMPLEX(floatT)& y) +__host__ __device__ gVect3 operator*(const gVect3& x,const GCOMPLEX(floatT)& y) { gVect3 z; z._v0 = x._v0 * y; @@ -402,7 +402,7 @@ HOST_DEVICE gVect3 operator*(const gVect3& x,const GCOMPLEX(floa } template -HOST_DEVICE gVect3 operator*(const gVect3& x,const floatT & y) +__host__ __device__ gVect3 operator*(const gVect3& x,const floatT & y) { gVect3 z; z._v0 = x._v0 * y; @@ -413,7 +413,7 @@ HOST_DEVICE gVect3 operator*(const gVect3& x,const floatT & y) //! 
complex dot product x*y = sum_i(v_i conj(w_i)) template -HOST_DEVICE GCOMPLEX(floatT) dot_prod(const gVect3 &x,const gVect3 &y) +__host__ __device__ GCOMPLEX(floatT) dot_prod(const gVect3 &x,const gVect3 &y) { floatT real = x._v0.cREAL*y._v0.cREAL + x._v0.cIMAG*y._v0.cIMAG; real += x._v1.cREAL*y._v1.cREAL + x._v1.cIMAG*y._v1.cIMAG; @@ -426,7 +426,7 @@ HOST_DEVICE GCOMPLEX(floatT) dot_prod(const gVect3 &x,const gVect3 -HOST_DEVICE floatT re_dot_prod(const gVect3 &x,const gVect3 &y) +__host__ __device__ floatT re_dot_prod(const gVect3 &x,const gVect3 &y) { floatT res = x._v0.cREAL*y._v0.cREAL + x._v0.cIMAG*y._v0.cIMAG; res += x._v1.cREAL*y._v1.cREAL + x._v1.cIMAG*y._v1.cIMAG; @@ -436,7 +436,7 @@ HOST_DEVICE floatT re_dot_prod(const gVect3 &x,const gVect3 &y) // norm2 of vector template -HOST_DEVICE floatT norm2(const gVect3 &x) +__host__ __device__ floatT norm2(const gVect3 &x) { floatT res = x._v0.cREAL*x._v0.cREAL + x._v0.cIMAG*x._v0.cIMAG; res += x._v1.cREAL*x._v1.cREAL + x._v1.cIMAG*x._v1.cIMAG; @@ -446,7 +446,7 @@ HOST_DEVICE floatT norm2(const gVect3 &x) // complex conjugate template -HOST_DEVICE gVect3 conj(const gVect3 &x) +__host__ __device__ gVect3 conj(const gVect3 &x) { gVect3 z; z._v0 = conj(x._v0); @@ -459,13 +459,13 @@ HOST_DEVICE gVect3 conj(const gVect3 &x) #ifdef __GPUCC__ template -SQCD_HOST std::ostream &operator << (std::ostream &s, const gVect3 &x) +__host__ std::ostream &operator << (std::ostream &s, const gVect3 &x) { return s << x.getElement0() << x.getElement1() << x.getElement2(); } template -SQCD_HOST std::istream &operator >> (std::istream &s, gVect3 &x) +__host__ std::istream &operator >> (std::istream &s, gVect3 &x) { return s >> x._v0.cREAL >> x._v0.cIMAG >> x._v1.cREAL >> x._v1.cIMAG >> x._v2.cREAL >> x._v2.cIMAG; } diff --git a/src/base/math/gvect3array.h b/src/base/math/gvect3array.h index 0a619eff..8f655b39 100644 --- a/src/base/math/gvect3array.h +++ b/src/base/math/gvect3array.h @@ -17,13 +17,13 @@ struct gVect3arrayAcc : public 
GeneralAccessor { : GeneralAccessor(elements) { } - HOST_DEVICE explicit gVect3arrayAcc(GCOMPLEX(floatT) *elementsBase, size_t object_count) + __host__ __device__ explicit gVect3arrayAcc(GCOMPLEX(floatT) *elementsBase, size_t object_count) : GeneralAccessor(elementsBase, object_count) { } explicit gVect3arrayAcc() : GeneralAccessor() { } template - HOST_DEVICE inline gVect3 getElement(const gSite &site) const { + __host__ __device__ inline gVect3 getElement(const gSite &site) const { return static_cast>(gVect3( this->template getElementEntry<0>(site.isiteFull), this->template getElementEntry<1>(site.isiteFull), @@ -31,14 +31,14 @@ struct gVect3arrayAcc : public GeneralAccessor { } template - HOST_DEVICE inline void setElement(const gSite &site, const gVect3 &vec) { + __host__ __device__ inline void setElement(const gSite &site, const gVect3 &vec) { this->template setElementEntry<0>(site.isiteFull, vec.getElement0()); this->template setElementEntry<1>(site.isiteFull, vec.getElement1()); this->template setElementEntry<2>(site.isiteFull, vec.getElement2()); } template - HOST_DEVICE inline gVect3 getElement(const gSiteStack &site) const { + __host__ __device__ inline gVect3 getElement(const gSiteStack &site) const { gVect3 ret( this->template getElementEntry<0>(site.isiteStackFull), this->template getElementEntry<1>(site.isiteStackFull), @@ -47,13 +47,13 @@ struct gVect3arrayAcc : public GeneralAccessor { } template - HOST_DEVICE inline void setElement(const gSiteStack &site, const gVect3 &vec) { + __host__ __device__ inline void setElement(const gSiteStack &site, const gVect3 &vec) { this->template setElementEntry<0>(site.isiteStackFull, vec.getElement0()); this->template setElementEntry<1>(site.isiteStackFull, vec.getElement1()); this->template setElementEntry<2>(site.isiteStackFull, vec.getElement2()); } - HOST_DEVICE inline void setEntriesComm(gVect3arrayAcc &src_acc, + __host__ __device__ inline void setEntriesComm(gVect3arrayAcc &src_acc, size_t setIndex, size_t 
getIndex) { this->template setElementEntry<0>(setIndex, src_acc.template getElementEntry<0>(getIndex)); this->template setElementEntry<1>(setIndex, src_acc.template getElementEntry<1>(getIndex)); @@ -61,20 +61,20 @@ struct gVect3arrayAcc : public GeneralAccessor { } template - HOST_DEVICE inline size_t getIndexComm(size_t isiteFull, size_t stack) const { + __host__ __device__ inline size_t getIndexComm(size_t isiteFull, size_t stack) const { gSiteStack site = GIndexer::getSiteStackFull(isiteFull, stack); return site.isiteStackFull; } template - HOST_DEVICE inline gVect3 getElementComm(size_t isiteFull, size_t stack) const { + __host__ __device__ inline gVect3 getElementComm(size_t isiteFull, size_t stack) const { gSiteStack site = GIndexer::getSiteStackFull(isiteFull, stack); return getElement(site); } template - HOST_DEVICE inline void setElementComm(size_t isiteFull, size_t stack, const gVect3 &vec) { + __host__ __device__ inline void setElementComm(size_t isiteFull, size_t stack, const gVect3 &vec) { gSiteStack site; site.isiteFull = isiteFull; site.isiteStackFull = isiteFull; @@ -82,12 +82,12 @@ struct gVect3arrayAcc : public GeneralAccessor { } template - HOST_DEVICE inline gVect3 operator()(const gSite &site) const { + __host__ __device__ inline gVect3 operator()(const gSite &site) const { return this->getElement(site); }; template - HOST_DEVICE inline gVect3 operator()(const gSiteStack &site) const { + __host__ __device__ inline gVect3 operator()(const gSiteStack &site) const { return this->getElement(site); }; }; diff --git a/src/base/math/matrix4x4.h b/src/base/math/matrix4x4.h index 47fbf5e6..0235aa60 100644 --- a/src/base/math/matrix4x4.h +++ b/src/base/math/matrix4x4.h @@ -18,14 +18,14 @@ struct Matrix4x4Sym { constexpr Matrix4x4Sym(const Matrix4x4Sym&) = default; - HOST_DEVICE Matrix4x4Sym(floatT a) : elems{a, a, a, a, a, a, a, a, a, a} {} - HOST_DEVICE Matrix4x4Sym() : elems{0, 0, 0, 0, 0, 0, 0, 0, 0, 0} {} + __host__ __device__ Matrix4x4Sym(floatT a) 
: elems{a, a, a, a, a, a, a, a, a, a} {} + __host__ __device__ Matrix4x4Sym() : elems{0, 0, 0, 0, 0, 0, 0, 0, 0, 0} {} - HOST_DEVICE Matrix4x4Sym(floatT e00, floatT e11, floatT e22, floatT e33, floatT e01, floatT e02, floatT e03, floatT e12, + __host__ __device__ Matrix4x4Sym(floatT e00, floatT e11, floatT e22, floatT e33, floatT e01, floatT e02, floatT e03, floatT e12, floatT e13, floatT e23) : elems{e00, e11, e22, e33, e01, e02, e03, e12, e13, e23} {} - HOST_DEVICE inline floatT operator()(int mu, int nu) { + __host__ __device__ inline floatT operator()(int mu, int nu) { if (mu == 0 && nu == 0) return elems[entry::e00]; if (mu == 1 && nu == 1) return elems[entry::e11]; if (mu == 2 && nu == 2) return elems[entry::e22]; @@ -47,7 +47,7 @@ struct Matrix4x4Sym { return 0; } - HOST_DEVICE inline void operator()(int mu, int nu, floatT value) { + __host__ __device__ inline void operator()(int mu, int nu, floatT value) { if (mu == 0 && nu == 0) elems[entry::e00] = value; if (mu == 1 && nu == 1) elems[entry::e11] = value; if (mu == 2 && nu == 2) elems[entry::e22] = value; @@ -68,21 +68,21 @@ struct Matrix4x4Sym { if (nu == 2 && mu == 3) elems[entry::e23] = value; } - /* HOST_DEVICE inline Matrix4x4Sym& operator=(const floatT &y) + /* __host__ __device__ inline Matrix4x4Sym& operator=(const floatT &y) { for(int i = 0; i<10;i++){ elems[i]=y; } return *this; }*/ - HOST_DEVICE inline Matrix4x4Sym& operator=(const Matrix4x4Sym &y) + __host__ __device__ inline Matrix4x4Sym& operator=(const Matrix4x4Sym &y) { for(int i = 0; i<10;i++){ elems[i]=y.elems[i]; } return *this; } - HOST_DEVICE inline Matrix4x4Sym& operator+=(const Matrix4x4Sym &y) + __host__ __device__ inline Matrix4x4Sym& operator+=(const Matrix4x4Sym &y) { for(int i = 0; i<10;i++){ @@ -91,7 +91,7 @@ struct Matrix4x4Sym { return *this; } - HOST_DEVICE inline Matrix4x4Sym& operator/=(floatT y) + __host__ __device__ inline Matrix4x4Sym& operator/=(floatT y) { for(int i = 0; i<10;i++){ elems[i]/=y; @@ -99,7 +99,7 @@ 
struct Matrix4x4Sym { return *this; } - HOST_DEVICE inline Matrix4x4Sym& operator*=(floatT y) + __host__ __device__ inline Matrix4x4Sym& operator*=(floatT y) { for(int i = 0; i<10;i++){ elems[i]*=y; @@ -111,7 +111,7 @@ struct Matrix4x4Sym { template -HOST_DEVICE inline Matrix4x4Sym operator+(const Matrix4x4Sym &x, const Matrix4x4Sym &y) { +__host__ __device__ inline Matrix4x4Sym operator+(const Matrix4x4Sym &x, const Matrix4x4Sym &y) { return Matrix4x4Sym(x.elems[0]+ y.elems[0], x.elems[1]+y.elems[1], x.elems[2]+y.elems[2], x.elems[3]+y.elems[3], x.elems[4]+y.elems[4], x.elems[5]+y.elems[5], x.elems[6]+y.elems[6], x.elems[7]+y.elems[7], x.elems[8]+y.elems[8], x.elems[9]+y.elems[9]); diff --git a/src/base/math/operators.h b/src/base/math/operators.h index 11d96f73..bc575f53 100644 --- a/src/base/math/operators.h +++ b/src/base/math/operators.h @@ -106,9 +106,9 @@ struct GeneralOperator< //Call the operator: Do the operation element wise. //This is what is called in another operator or in a Kernel which runs the operation template - inline HOST_DEVICE auto operator()(const Index i) const + inline __host__ __device__ auto operator()(const Index i) const { - //inline HOST_DEVICE auto operator()(const Index i) const { + //inline __host__ __device__ auto operator()(const Index i) const { auto rhs = _rhs(i); auto lhs = _lhs(i); return lhs + rhs; @@ -142,7 +142,7 @@ struct GeneralOperator - inline HOST_DEVICE auto operator()(const Index i) const + inline __host__ __device__ auto operator()(const Index i) const { auto lhs = _lhs(i); return lhs + _rhs; @@ -174,7 +174,7 @@ struct GeneralOperator - inline HOST_DEVICE auto operator()(const Index i) const + inline __host__ __device__ auto operator()(const Index i) const { auto rhs = _rhs(i); return _lhs + rhs; @@ -206,7 +206,7 @@ struct GeneralOperator - inline HOST_DEVICE auto operator()(const Index i) const + inline __host__ __device__ auto operator()(const Index i) const { auto rhs = _rhs(i); auto lhs = _lhs(i); @@ -239,7 
+239,7 @@ struct GeneralOperator - inline HOST_DEVICE auto operator()(const Index i) const + inline __host__ __device__ auto operator()(const Index i) const { auto lhs = _lhs(i); return lhs - _rhs; @@ -271,7 +271,7 @@ struct GeneralOperator - inline HOST_DEVICE auto operator()(const Index i) const + inline __host__ __device__ auto operator()(const Index i) const { auto rhs = _rhs(i); return _lhs - rhs; @@ -303,7 +303,7 @@ struct GeneralOperator - inline HOST_DEVICE auto operator()(const Index i) const + inline __host__ __device__ auto operator()(const Index i) const { auto rhs = _rhs(i); auto lhs = _lhs(i); @@ -336,7 +336,7 @@ struct GeneralOperator - inline HOST_DEVICE auto operator()(const Index i) const + inline __host__ __device__ auto operator()(const Index i) const { auto lhs = _lhs(i); return lhs * _rhs; @@ -370,7 +370,7 @@ struct GeneralOperator - inline HOST_DEVICE auto operator()(const Index i) const + inline __host__ __device__ auto operator()(const Index i) const { auto rhs = _rhs(i); return _lhs * rhs; @@ -402,7 +402,7 @@ struct GeneralOperator - inline HOST_DEVICE auto operator()(const Index i) const + inline __host__ __device__ auto operator()(const Index i) const { auto rhs = _rhs(i); auto lhs = _lhs(i); @@ -434,7 +434,7 @@ struct GeneralOperator - inline HOST_DEVICE auto operator()(const Index i) const + inline __host__ __device__ auto operator()(const Index i) const { auto lhs = _lhs(i); return lhs / _rhs; @@ -466,7 +466,7 @@ struct GeneralOperator - inline HOST_DEVICE auto operator()(const Index i) const + inline __host__ __device__ auto operator()(const Index i) const { auto rhs = _rhs(i); return _lhs / rhs; diff --git a/src/base/math/simpleArray.h b/src/base/math/simpleArray.h index 425fc562..a347fa3a 100644 --- a/src/base/math/simpleArray.h +++ b/src/base/math/simpleArray.h @@ -12,18 +12,18 @@ class SimpleArray{ public: - HOST_DEVICE T& operator[](size_t i){ + __host__ __device__ T& operator[](size_t i){ return values[i]; } - HOST_DEVICE 
inline auto operator()(gSiteStack site) const + __host__ __device__ inline auto operator()(gSiteStack site) const { return values[site.stack]; } - HOST_DEVICE inline auto operator()(gSiteMu site) const + __host__ __device__ inline auto operator()(gSiteMu site) const { return values[site.mu]; } @@ -31,32 +31,32 @@ class SimpleArray{ SimpleArray() = default; - HOST_DEVICE SimpleArray(const T& init){ + __host__ __device__ SimpleArray(const T& init){ for(size_t i = 0; i < N; i++){ values[i] = init; } } template - HOST_DEVICE SimpleArray(SimpleArray s_array) { + __host__ __device__ SimpleArray(SimpleArray s_array) { for(size_t i = 0; i < N; i++) { values[i] = s_array[i]; } } - HOST_DEVICE void operator=(SimpleArray vec){ + __host__ __device__ void operator=(SimpleArray vec){ for(size_t i = 0; i < N; i++){ values[i] = vec[i]; } } - SQCD_HOST void operator=(std::vector vec){ + __host__ void operator=(std::vector vec){ for(size_t i = 0; i < N; i++){ values[i] = vec.at(i); } } - HOST_DEVICE SimpleArray getAccessor() const { + __host__ __device__ SimpleArray getAccessor() const { return *this; } @@ -65,7 +65,7 @@ class SimpleArray{ template -HOST_DEVICE SimpleArray operator/(SimpleArray a, SimpleArray b){ +__host__ __device__ SimpleArray operator/(SimpleArray a, SimpleArray b){ SimpleArray ret; for(size_t i = 0; i < N; i++){ ret[i] = a[i] / b[i]; @@ -74,7 +74,7 @@ HOST_DEVICE SimpleArray operator/(SimpleArray a, SimpleArr } template -HOST_DEVICE SimpleArray operator*(SimpleArray a, SimpleArray b){ +__host__ __device__ SimpleArray operator*(SimpleArray a, SimpleArray b){ SimpleArray ret; for(size_t i = 0; i < N; i++){ ret[i] = a * b; @@ -83,7 +83,7 @@ HOST_DEVICE SimpleArray operator*(SimpleArray a, SimpleArr } template -HOST_DEVICE SimpleArray operator-(SimpleArray a, SimpleArray b){ +__host__ __device__ SimpleArray operator-(SimpleArray a, SimpleArray b){ SimpleArray ret; for(size_t i = 0; i < N; i++){ ret[i] = a[i] - b[i]; @@ -92,7 +92,7 @@ HOST_DEVICE SimpleArray 
operator-(SimpleArray a, SimpleArr } template -HOST_DEVICE SimpleArray operator+(SimpleArray a, SimpleArray b){ +__host__ __device__ SimpleArray operator+(SimpleArray a, SimpleArray b){ SimpleArray ret; for(size_t i = 0; i < N; i++){ ret[i] = a[i] + b[i]; @@ -101,7 +101,7 @@ HOST_DEVICE SimpleArray operator+(SimpleArray a, SimpleArr } template -HOST_DEVICE SimpleArray operator*(floatT a, SimpleArray b){ +__host__ __device__ SimpleArray operator*(floatT a, SimpleArray b){ SimpleArray ret; for(size_t i = 0; i < N; i++){ ret[i] = a * b[i]; @@ -110,7 +110,7 @@ HOST_DEVICE SimpleArray operator*(floatT a, SimpleArray b) } template - HOST_DEVICE SimpleArray operator/(SimpleArray a, floatT b){ + __host__ __device__ SimpleArray operator/(SimpleArray a, floatT b){ SimpleArray ret; for (size_t i = 0; i < N; i++) { ret[i] = a[i]/b; @@ -119,7 +119,7 @@ template } template -HOST_DEVICE floatT max(SimpleArray a){ +__host__ __device__ floatT max(SimpleArray a){ floatT ret = a[0]; for(size_t i = 1; i < N; i++){ if (a[i] > ret){ @@ -130,7 +130,7 @@ HOST_DEVICE floatT max(SimpleArray a){ } template -HOST_DEVICE SimpleArray real(SimpleArray c){ +__host__ __device__ SimpleArray real(SimpleArray c){ SimpleArray ret; for(size_t i = 0; i < N; i++){ ret[i] = c[i].cREAL; diff --git a/src/base/math/su3Exp.h b/src/base/math/su3Exp.h index 87a08990..6b9874fb 100644 --- a/src/base/math/su3Exp.h +++ b/src/base/math/su3Exp.h @@ -34,7 +34,7 @@ N = 25 by default due to an estimated error of order 10^(-26) */ template -HOST_DEVICE constexpr unsigned int countOfApproxInverseFak(){ +__host__ __device__ constexpr unsigned int countOfApproxInverseFak(){ unsigned int N = 1; floatT nominator = 1.0; @@ -48,7 +48,7 @@ HOST_DEVICE constexpr unsigned int countOfApproxInverseFak(){ // Algorithm from https://luscher.web.cern.ch/luscher/notes/su3fcts.pdf template -HOST_DEVICE inline void SU3Exp(const GSU3 inGSU3, GSU3 &outGSU3){ +__host__ __device__ inline void SU3Exp(const GSU3 inGSU3, GSU3 &outGSU3){ constexpr 
unsigned int N = countOfApproxInverseFak(); floatT c_i[N+1]; diff --git a/src/base/memoryManagement.h b/src/base/memoryManagement.h index f5fa72f0..9e4f9293 100644 --- a/src/base/memoryManagement.h +++ b/src/base/memoryManagement.h @@ -573,7 +573,7 @@ class MemoryAccessor { ~MemoryAccessor() = default; template - HOST_DEVICE inline void setValue(const size_t isite, const floatT value) { + __host__ __device__ inline void setValue(const size_t isite, const floatT value) { /// reinterpret_cast is a compile time directive telling the compiler to treat _Array as a floatT*. This is /// needed because _Array is treated as void* right now. auto *arr = reinterpret_cast(Array); @@ -581,7 +581,7 @@ class MemoryAccessor { } template - HOST_DEVICE inline void getValue(const size_t isite, floatT &value) { + __host__ __device__ inline void getValue(const size_t isite, floatT &value) { auto *arr = reinterpret_cast(Array); value = arr[isite]; } diff --git a/src/base/runFunctors.h b/src/base/runFunctors.h index 313a5118..6bdd0395 100644 --- a/src/base/runFunctors.h +++ b/src/base/runFunctors.h @@ -48,13 +48,13 @@ class RunFunctors { #ifdef USE_HIP_AMD -HOST_DEVICE static inline HIP_vector_type GetUint3(dim3 Idx){ +__host__ __device__ static inline HIP_vector_type GetUint3(dim3 Idx){ return HIP_vector_type(Idx.x, Idx.y, Idx.z); }; #elif defined USE_HIP_NVIDIA -HOST_DEVICE static dim3 GetUint3(dim3 Idx){ +__host__ __device__ static dim3 GetUint3(dim3 Idx){ return Idx; @@ -529,7 +529,7 @@ template struct CalcGSiteFull { template - inline HOST_DEVICE gSite operator()(Args... args) { + inline __host__ __device__ gSite operator()(Args... args) { gSite site = GIndexer::getSiteFull(args...); return site; } @@ -538,7 +538,7 @@ struct CalcGSiteFull { template struct CalcGSite { template - inline HOST_DEVICE gSite operator()(Args... args) { + inline __host__ __device__ gSite operator()(Args... 
args) { gSite site = GIndexer::getSite(args...); return site; } @@ -547,7 +547,7 @@ struct CalcGSite { template struct CalcGSiteSpatialFull { template - inline HOST_DEVICE gSite operator()(Args... args) { + inline __host__ __device__ gSite operator()(Args... args) { gSite site = GIndexer::getSiteSpatialFull(args...); return site; } @@ -556,7 +556,7 @@ struct CalcGSiteSpatialFull { template struct CalcGSiteSpatial { template - inline HOST_DEVICE gSite operator()(Args... args) { + inline __host__ __device__ gSite operator()(Args... args) { gSite site = GIndexer::getSiteSpatial(args...); return site; } @@ -565,7 +565,7 @@ struct CalcGSiteSpatial { template struct CalcGSiteStack { template - inline HOST_DEVICE gSiteStack operator()(Args... args) { + inline __host__ __device__ gSiteStack operator()(Args... args) { gSiteStack site = GIndexer::getSiteStack(args...); return site; } @@ -574,7 +574,7 @@ struct CalcGSiteStack { template struct CalcGSiteStackFull { template - inline HOST_DEVICE gSiteStack operator()(Args... args) { + inline __host__ __device__ gSiteStack operator()(Args... args) { gSiteStack site = GIndexer::getSiteStackFull(args...); return site; } @@ -583,7 +583,7 @@ struct CalcGSiteStackFull { template struct CalcGSiteAllMu { template - inline HOST_DEVICE gSiteMu operator()(Args... args) { + inline __host__ __device__ gSiteMu operator()(Args... args) { gSiteMu site = GIndexer::getSiteMu(args...); return site; } @@ -592,7 +592,7 @@ struct CalcGSiteAllMu { template struct CalcGSiteAtMu { template - inline HOST_DEVICE gSiteMu operator()(Args... args) { + inline __host__ __device__ gSiteMu operator()(Args... args) { gSiteMu site = GIndexer::getSiteMu(args..., mu); return site; } @@ -602,7 +602,7 @@ template struct CalcGSiteAllMuFull { template - inline HOST_DEVICE gSiteMu operator()(Args... args) { + inline __host__ __device__ gSiteMu operator()(Args... 
args) { gSiteMu site = GIndexer::getSiteMuFull(args...); return site; } @@ -611,7 +611,7 @@ struct CalcGSiteAllMuFull { template struct CalcGSiteAtMuFull { template - inline HOST_DEVICE gSiteMu operator()(Args... args) { + inline __host__ __device__ gSiteMu operator()(Args... args) { gSiteMu site = GIndexer::getSiteMuFull(args..., mu); return site; } @@ -620,7 +620,7 @@ struct CalcGSiteAtMuFull { template struct CalcGSiteAtStackFull { template - inline HOST_DEVICE gSiteStack operator()(Args... args) { + inline __host__ __device__ gSiteStack operator()(Args... args) { gSiteStack site = GIndexer::getSiteStackFull(args..., stack); return site; } @@ -629,7 +629,7 @@ struct CalcGSiteAtStackFull { template struct CalcGSiteAtStack { template - inline HOST_DEVICE gSiteStack operator()(Args... args) { + inline __host__ __device__ gSiteStack operator()(Args... args) { gSiteStack site = GIndexer::getSiteStack(args..., stack); return site; } @@ -639,7 +639,7 @@ struct CalcGSiteAtStack { template struct CalcOddGSiteAtStack { template - inline HOST_DEVICE gSiteStack operator()(Args... args) { + inline __host__ __device__ gSiteStack operator()(Args... args) { gSiteStack site = GIndexer::getSiteStackOdd(args..., stack); return site; } @@ -649,7 +649,7 @@ struct CalcOddGSiteAtStack { template struct CalcGSiteLoopMu { template - inline HOST_DEVICE gSite operator()(Args... args) { + inline __host__ __device__ gSite operator()(Args... args) { gSite site = GIndexer::getSite(args...); return site; } @@ -658,7 +658,7 @@ struct CalcGSiteLoopMu { template struct CalcGSiteLoopStack { template - inline HOST_DEVICE gSite operator()(Args... args) { + inline __host__ __device__ gSite operator()(Args... args) { gSite site = GIndexer::getSite(args...); return site; } @@ -667,7 +667,7 @@ struct CalcGSiteLoopStack { template struct CalcGSiteLoopMuFull { template - inline HOST_DEVICE gSite operator()(Args... args) { + inline __host__ __device__ gSite operator()(Args... 
args) { gSite site = GIndexer::getSiteFull(args...); return site; } @@ -676,7 +676,7 @@ struct CalcGSiteLoopMuFull { template struct CalcGSiteLoopStackFull { template - inline HOST_DEVICE gSite operator()(Args... args) { + inline __host__ __device__ gSite operator()(Args... args) { gSite site = GIndexer::getSiteFull(args...); return site; } @@ -685,39 +685,39 @@ struct CalcGSiteLoopStackFull { //! use this if you don't actually need to read in from any site, for example when initializing point sources template struct ReadDummy { - template inline HOST_DEVICE gSite operator()(__attribute__((unused)) Args... args) { + template inline __host__ __device__ gSite operator()(__attribute__((unused)) Args... args) { return GIndexer::getSite(99999,99999,99999,99999); } }; template struct WriteAtLoopMu { - inline HOST_DEVICE gSiteMu operator()(const gSite &site, size_t mu) { + inline __host__ __device__ gSiteMu operator()(const gSite &site, size_t mu) { return GIndexer::getSiteMu(site, mu); } }; template struct WriteAtLoopStack { - inline HOST_DEVICE gSiteStack operator()(const gSite &site, size_t stack) { + inline __host__ __device__ gSiteStack operator()(const gSite &site, size_t stack) { return GIndexer::getSiteStack(site, stack); } }; struct WriteAtRead { - inline HOST_DEVICE gSite operator()(const gSite &site) { + inline __host__ __device__ gSite operator()(const gSite &site) { return site; } }; struct WriteAtReadStack { - inline HOST_DEVICE gSiteStack operator()(const gSiteStack &site) { + inline __host__ __device__ gSiteStack operator()(const gSiteStack &site) { return site; } }; struct WriteAtReadMu { - inline HOST_DEVICE gSiteMu operator()(const gSiteMu &siteMu) { + inline __host__ __device__ gSiteMu operator()(const gSiteMu &siteMu) { return siteMu; } }; @@ -728,7 +728,7 @@ template struct WriteAtFixedSite { const gSite _fixed_site; explicit WriteAtFixedSite(const gSite mysite) : _fixed_site(mysite) {} - inline HOST_DEVICE gSite operator()(__attribute__((unused)) 
const gSite dummy) { + inline __host__ __device__ gSite operator()(__attribute__((unused)) const gSite dummy) { return _fixed_site; } }; diff --git a/src/define.h b/src/define.h index e4a3d7e1..1904fed5 100644 --- a/src/define.h +++ b/src/define.h @@ -29,10 +29,9 @@ #define AT __FILE__ ":" TOSTRING(__LINE__) #ifdef USE_CPU_ONLY -#define HOST_DEVICE -#define SQCD_HOST -#define DEVICE -#define CONSTANT const +#define __host__ +#define __device__ +#define __constant__ #define GPUERROR_T void* struct float2 { @@ -55,10 +54,6 @@ struct dim3 { constexpr operator uint3(void) const { return uint3{x, y, z}; } }; #else -#define HOST_DEVICE __host__ __device__ -#define SQCD_HOST __host__ -#define DEVICE __device__ -#define CONSTANT __constant__ #define GPUERROR_T gpuError_t #endif diff --git a/src/explicit_instantiation_macros.h b/src/explicit_instantiation_macros.h index d47f12c7..20f379d0 100644 --- a/src/explicit_instantiation_macros.h +++ b/src/explicit_instantiation_macros.h @@ -228,7 +228,7 @@ enum CompressionType { IF(BOOL(NO_GPU)) (FLOAT_LOOP_ALL(INIT_TEMPLATES, false)) #define INIT_ALL(INIT_TEMPLATES) \ - DEVICE_LOOP_ALL(INIT_TEMPLATES) + __device___LOOP_ALL(INIT_TEMPLATES) /// =================== Initialize Precision and Halos ========================================== diff --git a/src/gauge/GaugeAction.cpp b/src/gauge/GaugeAction.cpp index fd92f9d2..20984b98 100644 --- a/src/gauge/GaugeAction.cpp +++ b/src/gauge/GaugeAction.cpp @@ -65,7 +65,7 @@ MemoryAccessor GaugeAction::getRectangleField( template -SQCD_HOST floatT GaugeAction::barePlaquette() { +__host__ floatT GaugeAction::barePlaquette() { if (recompute) { _redBase.template iterateOverBulk( @@ -77,7 +77,7 @@ SQCD_HOST floatT GaugeAction::barePlaquette() } template -SQCD_HOST floatT GaugeAction::barePlaquetteSS() { +__host__ floatT GaugeAction::barePlaquetteSS() { // if (recompute) { _redBase.template iterateOverBulk( @@ -90,7 +90,7 @@ SQCD_HOST floatT GaugeAction::barePlaquetteSS( template -SQCD_HOST floatT 
GaugeAction::bareUtauMinusUsigma() { +__host__ floatT GaugeAction::bareUtauMinusUsigma() { if (recompute) { _redBase.template iterateOverBulk( @@ -103,7 +103,7 @@ SQCD_HOST floatT GaugeAction::bareUtauMinusUsi template -SQCD_HOST floatT GaugeAction::bareClover() { +__host__ floatT GaugeAction::bareClover() { if (recompute) { _redBase.template iterateOverBulk( @@ -115,7 +115,7 @@ SQCD_HOST floatT GaugeAction::bareClover() { } template -SQCD_HOST floatT GaugeAction::bareRectangle() { +__host__ floatT GaugeAction::bareRectangle() { if (recompute) { _redBase.template iterateOverBulk( diff --git a/src/gauge/GaugeAction.h b/src/gauge/GaugeAction.h index d544de87..11f8d73a 100644 --- a/src/gauge/GaugeAction.h +++ b/src/gauge/GaugeAction.h @@ -24,12 +24,12 @@ class GaugeAction { template MemoryAccessor getField(); - SQCD_HOST floatT barePlaquette(); - SQCD_HOST floatT bareUtauMinusUsigma(); - SQCD_HOST floatT bareClover(); - SQCD_HOST floatT bareRectangle(); + __host__ floatT barePlaquette(); + __host__ floatT bareUtauMinusUsigma(); + __host__ floatT bareClover(); + __host__ floatT bareRectangle(); - SQCD_HOST floatT barePlaquetteSS(); + __host__ floatT barePlaquetteSS(); public: diff --git a/src/gauge/constructs/PlaqConstructs.h b/src/gauge/constructs/PlaqConstructs.h index 1627abee..6d69248f 100644 --- a/src/gauge/constructs/PlaqConstructs.h +++ b/src/gauge/constructs/PlaqConstructs.h @@ -15,7 +15,7 @@ template -HOST_DEVICE GSU3 inline Plaq_P(gaugeAccessor gAcc, gSite site, int mu, int nu) { +__host__ __device__ GSU3 inline Plaq_P(gaugeAccessor gAcc, gSite site, int mu, int nu) { typedef GIndexer GInd; GSU3 temp; @@ -38,7 +38,7 @@ HOST_DEVICE GSU3 inline Plaq_P(gaugeAccessor gAcc, gSite si return temp; } template -HOST_DEVICE GSU3 inline Plaq_Q(gaugeAccessor gAcc, gSite site, int mu, int nu) { +__host__ __device__ GSU3 inline Plaq_Q(gaugeAccessor gAcc, gSite site, int mu, int nu) { typedef GIndexer GInd; GSU3 temp; @@ -62,7 +62,7 @@ HOST_DEVICE GSU3 inline 
Plaq_Q(gaugeAccessor gAcc, gSite si } template -HOST_DEVICE GSU3 inline Plaq_R(gaugeAccessor gAcc, gSite site, int mu, int nu) { +__host__ __device__ GSU3 inline Plaq_R(gaugeAccessor gAcc, gSite site, int mu, int nu) { typedef GIndexer GInd; GSU3 temp; @@ -85,7 +85,7 @@ HOST_DEVICE GSU3 inline Plaq_R(gaugeAccessor gAcc, gSite si return temp; } template -HOST_DEVICE GSU3 inline Plaq_S(gaugeAccessor gAcc, gSite site, int mu, int nu) { +__host__ __device__ GSU3 inline Plaq_S(gaugeAccessor gAcc, gSite site, int mu, int nu) { typedef GIndexer GInd; GSU3 temp; diff --git a/src/gauge/constructs/derivative3link.h b/src/gauge/constructs/derivative3link.h index 13cccdf4..0290e4b2 100644 --- a/src/gauge/constructs/derivative3link.h +++ b/src/gauge/constructs/derivative3link.h @@ -7,7 +7,7 @@ #include "../../base/math/gaugeAccessor.h" template -HOST_DEVICE GSU3 linkDerivative3(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu) { +__host__ __device__ GSU3 linkDerivative3(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu) { typedef GIndexer GInd; GSU3 temp; gSite origin = site; diff --git a/src/gauge/constructs/derivative5link.h b/src/gauge/constructs/derivative5link.h index 38525557..da578b7e 100644 --- a/src/gauge/constructs/derivative5link.h +++ b/src/gauge/constructs/derivative5link.h @@ -7,7 +7,7 @@ #include "../../base/math/gaugeAccessor.h" template -HOST_DEVICE GSU3 linkDerivative5(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { +__host__ __device__ GSU3 linkDerivative5(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; GSU3 temp; @@ -161,7 +161,7 @@ HOST_DEVICE GSU3 linkDerivative5(gaugeAccessor gAcc, gaug }; template - HOST_DEVICE GSU3 linkDerivative5_1(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + __host__ __device__ GSU3 linkDerivative5_1(gaugeAccessor gAcc, gaugeAccessor finAccessor, 
gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return gAcc.getLink(GInd::getSiteMu(GInd::site_up(site,mu), rho)) *gAcc.getLink(GInd::getSiteMu(GInd::site_up_up(site,mu,rho), nu)) @@ -171,7 +171,7 @@ template - HOST_DEVICE GSU3 linkDerivative5_3(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + __host__ __device__ GSU3 linkDerivative5_3(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return gAcc.getLinkDagger(GInd::getSiteMu( GInd::site_up_dn(site,mu,rho), rho)) *gAcc.getLinkDagger(GInd::getSiteMu( GInd::site_up_dn_dn(site,mu,nu,rho), nu)) @@ -181,7 +181,7 @@ template - HOST_DEVICE GSU3 linkDerivative5_5(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + __host__ __device__ GSU3 linkDerivative5_5(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return gAcc.getLink(GInd::getSiteMu(GInd::site_up(site,mu), rho)) *gAcc.getLinkDagger(GInd::getSiteMu(GInd::site_up_up_dn(site,mu,rho,nu), nu)) @@ -191,7 +191,7 @@ template - HOST_DEVICE GSU3 linkDerivative5_7(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + __host__ __device__ GSU3 linkDerivative5_7(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return gAcc.getLinkDagger(GInd::getSiteMu( GInd::site_up_dn(site,mu,rho), rho)) *gAcc.getLink(GInd::getSiteMu( GInd::site_up_dn(site,mu,rho), nu)) @@ -201,7 +201,7 @@ template - HOST_DEVICE GSU3 linkDerivative5_9(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + __host__ __device__ GSU3 linkDerivative5_9(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return gAcc.getLink(GInd::getSiteMu(GInd::site_up(site,mu), nu)) 
*finAccessor.getLink(GInd::getSiteMu(GInd::site_up_up_dn(site,mu,nu,rho), rho)) @@ -211,7 +211,7 @@ template - HOST_DEVICE GSU3 linkDerivative5_11(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + __host__ __device__ GSU3 linkDerivative5_11(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return gAcc.getLinkDagger(GInd::getSiteMu(GInd::site_up_dn(site,mu,nu), nu)) *finAccessor.getLinkDagger(GInd::getSiteMu(GInd::site_up_dn(site,mu,nu), rho)) @@ -221,7 +221,7 @@ template - HOST_DEVICE GSU3 linkDerivative5_13(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + __host__ __device__ GSU3 linkDerivative5_13(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return gAcc.getLinkDagger(GInd::getSiteMu(GInd::site_up_dn(site,mu,nu), nu)) *finAccessor.getLink(GInd::getSiteMu( GInd::site_up_dn_dn(site,mu,nu,rho),rho)) @@ -231,7 +231,7 @@ template - HOST_DEVICE GSU3 linkDerivative5_15(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + __host__ __device__ GSU3 linkDerivative5_15(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return gAcc.getLink(GInd::getSiteMu(GInd::site_up(site,mu), nu)) *finAccessor.getLinkDagger(GInd::getSiteMu(GInd::site_up_up(site,mu,nu), rho)) @@ -241,7 +241,7 @@ template - HOST_DEVICE GSU3 linkDerivative5_17(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + __host__ __device__ GSU3 linkDerivative5_17(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return finAccessor.getLink(GInd::getSiteMu(GInd::site_up_dn(site,mu,nu), nu)) *gAcc.getLinkDagger(GInd::getSiteMu(GInd::site_dn(site,nu), mu)) @@ -251,7 +251,7 @@ template - HOST_DEVICE GSU3 linkDerivative5_19(gaugeAccessor gAcc, 
gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + __host__ __device__ GSU3 linkDerivative5_19(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return finAccessor.getLinkDagger(GInd::getSiteMu(GInd::site_up(site,mu), nu)) *gAcc.getLinkDagger(GInd::getSiteMu(GInd::site_up(site,nu), mu)) @@ -262,7 +262,7 @@ template - HOST_DEVICE GSU3 linkDerivative5_21(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + __host__ __device__ GSU3 linkDerivative5_21(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return finAccessor.getLink(GInd::getSiteMu(GInd::site_up_dn(site,mu,nu), nu)) *gAcc.getLinkDagger(GInd::getSiteMu(GInd::site_dn(site,nu), mu)) @@ -272,7 +272,7 @@ template - HOST_DEVICE GSU3 linkDerivative5_23(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + __host__ __device__ GSU3 linkDerivative5_23(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return finAccessor.getLinkDagger(GInd::getSiteMu(GInd::site_up(site,mu), nu)) *gAcc.getLinkDagger(GInd::getSiteMu(GInd::site_up(site,nu), mu)) @@ -282,7 +282,7 @@ template - HOST_DEVICE GSU3 linkDerivative5_25(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + __host__ __device__ GSU3 linkDerivative5_25(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return gAcc.getLinkDagger(GInd::getSiteMu( GInd::site_up_dn(site,mu,rho), rho)) *gAcc.getLinkDagger(GInd::getSiteMu(GInd::site_dn(site,rho), mu)) @@ -292,7 +292,7 @@ template - HOST_DEVICE GSU3 linkDerivative5_27(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + __host__ __device__ GSU3 linkDerivative5_27(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int 
rho) { typedef GIndexer GInd; return gAcc.getLink(GInd::getSiteMu(GInd::site_up(site,mu), rho)) *gAcc.getLinkDagger(GInd::getSiteMu(GInd::site_up(site,rho), mu)) @@ -302,7 +302,7 @@ template - HOST_DEVICE GSU3 linkDerivative5_29(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + __host__ __device__ GSU3 linkDerivative5_29(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return gAcc.getLinkDagger(GInd::getSiteMu( GInd::site_up_dn(site,mu,rho), rho)) *gAcc.getLinkDagger(GInd::getSiteMu(GInd::site_dn(site,rho), mu)) @@ -312,7 +312,7 @@ template - HOST_DEVICE GSU3 linkDerivative5_31(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + __host__ __device__ GSU3 linkDerivative5_31(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return gAcc.getLink(GInd::getSiteMu(GInd::site_up(site,mu), rho)) *gAcc.getLinkDagger(GInd::getSiteMu(GInd::site_up(site,rho), mu)) @@ -322,7 +322,7 @@ template - HOST_DEVICE GSU3 linkDerivative5_33(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + __host__ __device__ GSU3 linkDerivative5_33(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return gAcc.getLink(GInd::getSiteMu(GInd::site_up(site,mu), rho)) *gAcc.getLinkDagger(GInd::getSiteMu(GInd::site_up_up_dn(site,mu,rho,nu), nu)) @@ -332,7 +332,7 @@ template - HOST_DEVICE GSU3 linkDerivative5_35(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + __host__ __device__ GSU3 linkDerivative5_35(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return gAcc.getLinkDagger(GInd::getSiteMu( GInd::site_up_dn(site,mu,rho), rho)) *gAcc.getLink(GInd::getSiteMu( GInd::site_up_dn(site,mu,rho), nu)) @@ -342,7 +342,7 @@ template - 
HOST_DEVICE GSU3 linkDerivative5_37(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + __host__ __device__ GSU3 linkDerivative5_37(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return gAcc.getLinkDagger(GInd::getSiteMu( GInd::site_up_dn(site,mu,rho), rho)) *gAcc.getLinkDagger(GInd::getSiteMu(GInd::site_up_dn_dn(site,mu,rho,nu), nu)) @@ -352,7 +352,7 @@ template - HOST_DEVICE GSU3 linkDerivative5_39(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { + __host__ __device__ GSU3 linkDerivative5_39(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho) { typedef GIndexer GInd; return gAcc.getLink(GInd::getSiteMu(GInd::site_up(site,mu), rho)) *gAcc.getLink(GInd::getSiteMu(GInd::site_up_up(site,mu,rho), nu)) diff --git a/src/gauge/constructs/derivative7link.h b/src/gauge/constructs/derivative7link.h index c2e1d60c..06f9e318 100644 --- a/src/gauge/constructs/derivative7link.h +++ b/src/gauge/constructs/derivative7link.h @@ -5,7 +5,7 @@ #include "../../base/math/gaugeAccessor.h" template -HOST_DEVICE GSU3 linkDerivative7(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, +__host__ __device__ GSU3 linkDerivative7(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu, int rho, int sigma, int TermCheck = -1, int SubTermCheck = -1) { typedef GIndexer GInd; GSU3 temp = gsu3_zero(); diff --git a/src/gauge/constructs/derivativeLepagelink.h b/src/gauge/constructs/derivativeLepagelink.h index 02198bee..bbda330e 100644 --- a/src/gauge/constructs/derivativeLepagelink.h +++ b/src/gauge/constructs/derivativeLepagelink.h @@ -7,7 +7,7 @@ #include "../../base/math/gaugeAccessor.h" template -HOST_DEVICE GSU3 linkDerivativeLepage(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, int nu) { +__host__ __device__ GSU3 linkDerivativeLepage(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite 
site, int mu, int nu) { typedef GIndexer GInd; GSU3 temp; gSite origin = site; diff --git a/src/gauge/constructs/derivativeProjectU3Constructs.h b/src/gauge/constructs/derivativeProjectU3Constructs.h index dae1544d..ce457863 100644 --- a/src/gauge/constructs/derivativeProjectU3Constructs.h +++ b/src/gauge/constructs/derivativeProjectU3Constructs.h @@ -11,7 +11,7 @@ #include "gsvd.h" template -HOST_DEVICE GSU3 derivativeProjectU3(gaugeAccessor gAcc, gaugeAccessor fAcc, gSite site, int mu) { +__host__ __device__ GSU3 derivativeProjectU3(gaugeAccessor gAcc, gaugeAccessor fAcc, gSite site, int mu) { typedef GIndexer GInd; diff --git a/src/gauge/constructs/fat7LinkConstructs.h b/src/gauge/constructs/fat7LinkConstructs.h index f1df761e..f63d6958 100644 --- a/src/gauge/constructs/fat7LinkConstructs.h +++ b/src/gauge/constructs/fat7LinkConstructs.h @@ -11,7 +11,7 @@ template - HOST_DEVICE GSU3 inline naikLinkStaple(gaugeAccessor gAcc, gSiteMu siteMu) { + __host__ __device__ GSU3 inline naikLinkStaple(gaugeAccessor gAcc, gSiteMu siteMu) { typedef GIndexer GInd; @@ -24,7 +24,7 @@ template } template - HOST_DEVICE GSU3 inline threeLinkStaple(gaugeAccessor gAcc, gSiteMu siteMu) { + __host__ __device__ GSU3 inline threeLinkStaple(gaugeAccessor gAcc, gSiteMu siteMu) { typedef GIndexer GInd; GSU3 temp = gsu3_zero(); gSite site = GInd::getSite(siteMu.isite); @@ -42,7 +42,7 @@ template } template - HOST_DEVICE GSU3 inline lepageLinkStaple(gaugeAccessor gAcc, gSiteMu siteMu) { + __host__ __device__ GSU3 inline lepageLinkStaple(gaugeAccessor gAcc, gSiteMu siteMu) { typedef GIndexer GInd; GSU3 temp = gsu3_zero(); gSite site = GInd::getSite(siteMu.isite); @@ -61,7 +61,7 @@ template template - HOST_DEVICE GSU3 inline fiveLinkStaple(gaugeAccessor gAcc, gSiteMu siteMu) { + __host__ __device__ GSU3 inline fiveLinkStaple(gaugeAccessor gAcc, gSiteMu siteMu) { typedef GIndexer GInd; GSU3 temp = gsu3_zero(); gSite site = GInd::getSite(siteMu.isite); @@ -122,7 +122,7 @@ template } template - 
HOST_DEVICE GSU3 inline sevenLinkStaple(gaugeAccessor gAcc, gSiteMu siteMu) { + __host__ __device__ GSU3 inline sevenLinkStaple(gaugeAccessor gAcc, gSiteMu siteMu) { typedef GIndexer GInd; GSU3 temp = gsu3_zero(); gSite site = GInd::getSite(siteMu.isite); diff --git a/src/gauge/constructs/gsvd.h b/src/gauge/constructs/gsvd.h index 028783a4..f1070dbf 100644 --- a/src/gauge/constructs/gsvd.h +++ b/src/gauge/constructs/gsvd.h @@ -89,7 +89,7 @@ * This routine eliminates off-diagonal element, handling special cases * ************************************************************************/ template -HOST_DEVICE inline int svd2x2bidiag(svdfloatT *a00, svdfloatT *a01, svdfloatT *a11, svdfloatT U2[2][2], svdfloatT V2[2][2]) +__host__ __device__ inline int svd2x2bidiag(svdfloatT *a00, svdfloatT *a01, svdfloatT *a11, svdfloatT U2[2][2], svdfloatT V2[2][2]) { register svdfloatT sinphi, cosphi, tanphi, cotphi; register svdfloatT a, b, min, max, abs00, abs01, abs11; @@ -289,7 +289,7 @@ HOST_DEVICE inline int svd2x2bidiag(svdfloatT *a00, svdfloatT *a01, svdfloatT *a template -HOST_DEVICE GSU3 svd3x3core(const GSU3& AA, floatT* sv){ +__host__ __device__ GSU3 svd3x3core(const GSU3& AA, floatT* sv){ /****************************************** * sigma[3] -- singular values, * diff --git a/src/gauge/constructs/hisqForceConstructs.h b/src/gauge/constructs/hisqForceConstructs.h index 772b5ebe..6ab14198 100644 --- a/src/gauge/constructs/hisqForceConstructs.h +++ b/src/gauge/constructs/hisqForceConstructs.h @@ -8,7 +8,7 @@ #include "derivativeLepagelink.h" template - HOST_DEVICE GSU3 smearingForce(gaugeAccessor gAcc, gaugeAccessor finAccessor, + __host__ __device__ GSU3 smearingForce(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, SmearingParameters _smearparam, int TermCheck = -1, int SubTermCheck = -1, bool doL1 = true, bool doL3 = true, bool doL5 = true, bool doL7 = true, bool doLLp = true) { @@ -66,7 +66,7 @@ template - HOST_DEVICE GSU3 
threeLinkContribution(gaugeAccessor gAcc, gaugeAccessor finAccessor, + __host__ __device__ GSU3 threeLinkContribution(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, SmearingParameters _smearparam) { typedef GIndexer GInd; floatT c1 =_smearparam._c_1; @@ -81,7 +81,7 @@ template HOST_DEVICE GSU3 lepagelinkContribution(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c_lp) { +template __host__ __device__ GSU3 lepagelinkContribution(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c_lp) { GSU3 derivative_lp = gsu3_zero(); for (int nu_h = 1; nu_h < 4; nu_h++) { int nu = (mu+nu_h)%4; @@ -90,7 +90,7 @@ template HOST_DEVICE GSU3 sevenLinkContribution(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c7, int Term, int SubTerm) { +template __host__ __device__ GSU3 sevenLinkContribution(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c7, int Term, int SubTerm) { GSU3 sevenlinkCont = gsu3_zero(); for (int nu_h = 1; nu_h < 4; nu_h++) { int nu = (mu + nu_h)%4; @@ -103,7 +103,7 @@ template HOST_DEVICE GSU3< return -c7*sevenlinkCont; }; -template HOST_DEVICE GSU3 fiveLinkContribution(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c5, int part) { +template __host__ __device__ GSU3 fiveLinkContribution(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c5, int part) { typedef GIndexer GInd; GSU3 fivelinkCont = gsu3_zero(); for (int nu_h = 1; nu_h < 4; nu_h++) { @@ -124,7 +124,7 @@ template HOST_DEVICE GSU3< return finAccessor.getLink(GInd::getSiteMu(site,mu))+c5*fivelinkCont; }; -template HOST_DEVICE GSU3 fiveLinkContribution_11(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c5) { +template __host__ __device__ GSU3 fiveLinkContribution_11(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c5) { typedef GIndexer GInd; GSU3 fivelinkCont = gsu3_zero(); for (int nu_h = 
1; nu_h < 4; nu_h++) { @@ -147,7 +147,7 @@ template HOST_DEVICE GSU3< return c5*fivelinkCont; }; -template HOST_DEVICE GSU3 fiveLinkContribution_12(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c5) { +template __host__ __device__ GSU3 fiveLinkContribution_12(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c5) { typedef GIndexer GInd; GSU3 fivelinkCont = gsu3_zero(); for (int nu_h = 1; nu_h < 4; nu_h++) { @@ -170,7 +170,7 @@ template HOST_DEVICE GSU3< return c5*fivelinkCont; }; -template HOST_DEVICE GSU3 fiveLinkContribution_13(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c5) { +template __host__ __device__ GSU3 fiveLinkContribution_13(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c5) { typedef GIndexer GInd; GSU3 fivelinkCont = gsu3_zero(); for (int nu_h = 1; nu_h < 4; nu_h++) { @@ -193,7 +193,7 @@ template HOST_DEVICE GSU3< return c5*fivelinkCont; }; -template HOST_DEVICE GSU3 fiveLinkContribution_14(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c5) { +template __host__ __device__ GSU3 fiveLinkContribution_14(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c5) { typedef GIndexer GInd; GSU3 fivelinkCont = gsu3_zero(); for (int nu_h = 1; nu_h < 4; nu_h++) { @@ -217,7 +217,7 @@ template HOST_DEVICE GSU3< }; -template HOST_DEVICE GSU3 fiveLinkContribution_20(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c5) { +template __host__ __device__ GSU3 fiveLinkContribution_20(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c5) { typedef GIndexer GInd; GSU3 fivelinkCont = gsu3_zero(); for (int nu_h = 1; nu_h < 4; nu_h++) { @@ -275,7 +275,7 @@ template HOST_DEVICE GSU3< return c5*fivelinkCont; }; -template HOST_DEVICE GSU3 fiveLinkContribution_30(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c5) { +template __host__ __device__ GSU3 
fiveLinkContribution_30(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c5) { typedef GIndexer GInd; GSU3 fivelinkCont = gsu3_zero(); for (int nu_h = 1; nu_h < 4; nu_h++) { @@ -326,7 +326,7 @@ template HOST_DEVICE GSU3< return c5*fivelinkCont; }; -template HOST_DEVICE GSU3 sevenLinkContribution_1(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c7) { +template __host__ __device__ GSU3 sevenLinkContribution_1(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c7) { typedef GIndexer GInd; GSU3 sevenlinkCont = gsu3_zero(); for (int nu_h = 1; nu_h < 4; nu_h++) { @@ -403,7 +403,7 @@ template HOST_DEVICE GSU3< return -c7*sevenlinkCont; }; -template HOST_DEVICE GSU3 sevenLinkContribution_2(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c7) { +template __host__ __device__ GSU3 sevenLinkContribution_2(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c7) { typedef GIndexer GInd; GSU3 sevenlinkCont = gsu3_zero(); for (int nu_h = 1; nu_h < 4; nu_h++) { @@ -476,7 +476,7 @@ template HOST_DEVICE GSU3< return -c7*sevenlinkCont; }; -template HOST_DEVICE GSU3 sevenLinkContribution_3(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c7) { +template __host__ __device__ GSU3 sevenLinkContribution_3(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c7) { typedef GIndexer GInd; GSU3 sevenlinkCont = gsu3_zero(); for (int nu_h = 1; nu_h < 4; nu_h++) { @@ -551,7 +551,7 @@ template HOST_DEVICE GSU3< -template HOST_DEVICE GSU3 sevenLinkContribution_4(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c7) { +template __host__ __device__ GSU3 sevenLinkContribution_4(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c7) { typedef GIndexer GInd; GSU3 sevenlinkCont = gsu3_zero(); for (int nu_h = 1; nu_h < 4; nu_h++) { @@ -625,7 +625,7 @@ template HOST_DEVICE GSU3< return 
-c7*sevenlinkCont; }; -template HOST_DEVICE GSU3 sevenLinkContribution_5(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c7) { +template __host__ __device__ GSU3 sevenLinkContribution_5(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c7) { typedef GIndexer GInd; GSU3 sevenlinkCont = gsu3_zero(); for (int nu_h = 1; nu_h < 4; nu_h++) { @@ -695,7 +695,7 @@ template HOST_DEVICE GSU3< return -c7*sevenlinkCont; }; -template HOST_DEVICE GSU3 sevenLinkContribution_6(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c7) { +template __host__ __device__ GSU3 sevenLinkContribution_6(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c7) { typedef GIndexer GInd; GSU3 sevenlinkCont = gsu3_zero(); for (int nu_h = 1; nu_h < 4; nu_h++) { @@ -764,7 +764,7 @@ template HOST_DEVICE GSU3< return -c7*sevenlinkCont; }; -template HOST_DEVICE GSU3 sevenLinkContribution_7(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c7) { +template __host__ __device__ GSU3 sevenLinkContribution_7(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu, floatT c7) { typedef GIndexer GInd; GSU3 sevenlinkCont = gsu3_zero(); for (int nu_h = 1; nu_h < 4; nu_h++) { diff --git a/src/gauge/constructs/linkLepageConstructs.h b/src/gauge/constructs/linkLepageConstructs.h index 7bfd7863..bfee8028 100644 --- a/src/gauge/constructs/linkLepageConstructs.h +++ b/src/gauge/constructs/linkLepageConstructs.h @@ -14,7 +14,7 @@ template - DEVICE GSU3 inline linkLpUp(gaugeAccessor gAcc, gSite site, int mu, int nu) { + __device__ GSU3 inline linkLpUp(gaugeAccessor gAcc, gSite site, int mu, int nu) { typedef GIndexer GInd; gSite origin = site; @@ -54,7 +54,7 @@ template } template - DEVICE GSU3 inline linkLpDn(gaugeAccessor gAcc, gSite site, int mu, int nu) { + __device__ GSU3 inline linkLpDn(gaugeAccessor gAcc, gSite site, int mu, int nu) { typedef GIndexer GInd; gSite dn = GInd::site_dn(site, 
nu); gSite twoDn = GInd::site_dn(dn, nu); diff --git a/src/gauge/constructs/linkStaple3Constructs.h b/src/gauge/constructs/linkStaple3Constructs.h index 47b42694..8f6b29b3 100644 --- a/src/gauge/constructs/linkStaple3Constructs.h +++ b/src/gauge/constructs/linkStaple3Constructs.h @@ -16,7 +16,7 @@ template - DEVICE GSU3 inline linkStaple3Up(gaugeAccessor gAcc, gSite site, int mu, int nu) { + __device__ GSU3 inline linkStaple3Up(gaugeAccessor gAcc, gSite site, int mu, int nu) { typedef GIndexer GInd; GSU3 temp; @@ -48,7 +48,7 @@ template } template - DEVICE GSU3 inline linkStaple3Dn(gaugeAccessor gAcc, gSite site, int mu, int nu) { + __device__ GSU3 inline linkStaple3Dn(gaugeAccessor gAcc, gSite site, int mu, int nu) { typedef GIndexer GInd; GSU3 temp; diff --git a/src/gauge/constructs/linkStaple5Constructs.h b/src/gauge/constructs/linkStaple5Constructs.h index a0a6af34..ebce6460 100644 --- a/src/gauge/constructs/linkStaple5Constructs.h +++ b/src/gauge/constructs/linkStaple5Constructs.h @@ -14,7 +14,7 @@ #include "linkStaple3Constructs.h" template - DEVICE GSU3 inline linkStaple5Up(gaugeAccessor gAcc, gSite site, int mu, int nu, int rho, + __device__ GSU3 inline linkStaple5Up(gaugeAccessor gAcc, gSite site, int mu, int nu, int rho, int gamma) { typedef GIndexer GInd; diff --git a/src/gauge/constructs/linkStaple7Constructs.h b/src/gauge/constructs/linkStaple7Constructs.h index d63be220..c5674d4a 100644 --- a/src/gauge/constructs/linkStaple7Constructs.h +++ b/src/gauge/constructs/linkStaple7Constructs.h @@ -15,7 +15,7 @@ #include "linkStaple5Constructs.h" template - DEVICE GSU3 inline linkStaple7Up(gaugeAccessor gAcc,gSite site, int mu, int nu, int rho, int gamma){ + __device__ GSU3 inline linkStaple7Up(gaugeAccessor gAcc,gSite site, int mu, int nu, int rho, int gamma){ typedef GIndexer GInd; GSU3 staple5=gsu3_zero(); diff --git a/src/gauge/constructs/naikConstructs.h b/src/gauge/constructs/naikConstructs.h index 0a8351e5..7dc9c197 100644 --- 
a/src/gauge/constructs/naikConstructs.h +++ b/src/gauge/constructs/naikConstructs.h @@ -13,7 +13,7 @@ #include "../gaugefield.h" template -DEVICE GSU3 inline naik3LinkUp(gaugeAccessor gAcc, gSite site, int mu) { +__device__ GSU3 inline naik3LinkUp(gaugeAccessor gAcc, gSite site, int mu) { typedef GIndexer GInd; GSU3 temp; diff --git a/src/gauge/constructs/naikDerivativeConstructs.h b/src/gauge/constructs/naikDerivativeConstructs.h index 905c0269..e7e10bd6 100644 --- a/src/gauge/constructs/naikDerivativeConstructs.h +++ b/src/gauge/constructs/naikDerivativeConstructs.h @@ -14,7 +14,7 @@ __device__ inline floatT sgn_naik(gSiteMu siteMu) { }*/ template -HOST_DEVICE GSU3 inline naikLinkDerivative(gaugeAccessor gAcc, +__host__ __device__ GSU3 inline naikLinkDerivative(gaugeAccessor gAcc, gaugeAccessor finAccessor, gSite site, int mu) { typedef GIndexer GInd; diff --git a/src/gauge/constructs/projectU3Constructs.h b/src/gauge/constructs/projectU3Constructs.h index ab3c65f1..237a7422 100644 --- a/src/gauge/constructs/projectU3Constructs.h +++ b/src/gauge/constructs/projectU3Constructs.h @@ -12,7 +12,7 @@ #include "../gaugefield.h" #include "gsvd.h" template -HOST_DEVICE GSU3 inline projectU3(gaugeAccessor gAcc, gSite site, int mu) { +__host__ __device__ GSU3 inline projectU3(gaugeAccessor gAcc, gSite site, int mu) { typedef GIndexer GInd; GSU3 V; diff --git a/src/gauge/gaugeActionDeriv.h b/src/gauge/gaugeActionDeriv.h index 29d474eb..c447eadc 100644 --- a/src/gauge/gaugeActionDeriv.h +++ b/src/gauge/gaugeActionDeriv.h @@ -16,7 +16,7 @@ template -HOST_DEVICE GSU3 inline gaugeActionDerivPlaq(gaugeAccessor gAcc, gSite site, int mu) { +__host__ __device__ GSU3 inline gaugeActionDerivPlaq(gaugeAccessor gAcc, gSite site, int mu) { GSU3 result = gsu3_zero(); GSU3 tmp = gsu3_zero(); @@ -32,7 +32,7 @@ HOST_DEVICE GSU3 inline gaugeActionDerivPlaq(gaugeAccessor gAcc, } template -HOST_DEVICE GSU3 inline gaugeActionDerivRect(gaugeAccessor gAcc, gSite site, int mu) { +__host__ 
__device__ GSU3 inline gaugeActionDerivRect(gaugeAccessor gAcc, gSite site, int mu) { typedef GIndexer GInd; GSU3 result = gsu3_zero(); GSU3 tmp = gsu3_zero(); @@ -79,7 +79,7 @@ HOST_DEVICE GSU3 inline gaugeActionDerivRect(gaugeAccessor gAcc, } template -HOST_DEVICE GSU3 inline symanzikGaugeActionDeriv(gaugeAccessor latacc, gSite s, int mu) { +__host__ __device__ GSU3 inline symanzikGaugeActionDeriv(gaugeAccessor latacc, gSite s, int mu) { typedef GIndexer GInd; // GSU3 tmp = (5. / 3.) * gaugeActionDerivPlaq(gAcc, site, mu) - // (1. / 12.) * gaugeActionDerivRect(gAcc, site, mu); @@ -167,7 +167,7 @@ HOST_DEVICE GSU3 inline symanzikGaugeActionDeriv(gaugeAccessor l //up to an additional factor of -beta/3 identical to symanikGaugeActionDeriv but faster template -HOST_DEVICE GSU3 inline gauge_force(gaugeAccessor latacc, gSiteMu site, floatT beta){ +__host__ __device__ GSU3 inline gauge_force(gaugeAccessor latacc, gSiteMu site, floatT beta){ typedef GIndexer GInd; diff --git a/src/gauge/gauge_kernels.cpp b/src/gauge/gauge_kernels.cpp index 037aa007..2e652ce6 100644 --- a/src/gauge/gauge_kernels.cpp +++ b/src/gauge/gauge_kernels.cpp @@ -5,7 +5,7 @@ struct plaquetteKernel{ plaquetteKernel(Gaugefield &gauge) : gAcc(gauge.getAccessor()){ } - HOST_DEVICE floatT operator()(gSite site) { + __host__ __device__ floatT operator()(gSite site) { typedef GIndexer GInd; GSU3 temp; @@ -29,7 +29,7 @@ struct plaquetteKernelSS{ plaquetteKernelSS(Gaugefield &gauge) : gAcc(gauge.getAccessor()){ } - HOST_DEVICE floatT operator()(gSite site) { + __host__ __device__ floatT operator()(gSite site) { typedef GIndexer GInd; GSU3 temp; @@ -53,7 +53,7 @@ struct plaquetteKernel_double{ plaquetteKernel_double(Gaugefield &gauge) : gAcc(gauge.getAccessor()){ } - HOST_DEVICE double operator()(gSite site) { + __host__ __device__ double operator()(gSite site) { typedef GIndexer GInd; double result = 0; @@ -77,7 +77,7 @@ struct UtauMinusUsigmaKernel{ UtauMinusUsigmaKernel(Gaugefield &gauge) : 
gAcc(gauge.getAccessor()){ } - HOST_DEVICE floatT operator()(gSite site) { + __host__ __device__ floatT operator()(gSite site) { typedef GIndexer GInd; GSU3 temp; @@ -106,7 +106,7 @@ struct cloverKernel{ cloverKernel(Gaugefield &gauge) : gAcc(gauge.getAccessor()), FT(gAcc){ } - HOST_DEVICE floatT operator()(gSite site) { + __host__ __device__ floatT operator()(gSite site) { GSU3 Fmunu; @@ -130,7 +130,7 @@ struct rectangleKernel{ rectangleKernel(Gaugefield &gauge) : gAcc(gauge.getAccessor()){ } - HOST_DEVICE floatT operator()(gSite site) { + __host__ __device__ floatT operator()(gSite site) { typedef GIndexer GInd; GSU3 temp; @@ -164,7 +164,7 @@ struct rectangleKernel_double{ rectangleKernel_double(Gaugefield &gauge) : gAcc(gauge.getAccessor()){ } - HOST_DEVICE double operator()(gSite site) { + __host__ __device__ double operator()(gSite site) { typedef GIndexer GInd; GSU3 temp; @@ -204,7 +204,7 @@ struct gaugeActKernel_double{ gaugeActKernel_double(Gaugefield &gauge) : gAcc(gauge.getAccessor()){ } - HOST_DEVICE double operator()(gSite site) { + __host__ __device__ double operator()(gSite site) { typedef GIndexer GInd; GSU3 m_0; @@ -279,7 +279,7 @@ struct count_faulty_links { floatT tol; count_faulty_links(Gaugefield &GaugeL, Gaugefield &GaugeR, floatT tolerance=1e-6) : gL(GaugeL.getAccessor()), gR(GaugeR.getAccessor()), tol(tolerance) {} - HOST_DEVICE int operator() (gSite site) { + __host__ __device__ int operator() (gSite site) { int sum = 0; for (int mu = 0; mu < 4; mu++) { gSiteMu siteMu = GIndexer::getSiteMu(site,mu); diff --git a/src/gauge/gaugefield.h b/src/gauge/gaugefield.h index d210f28a..09ae11e8 100644 --- a/src/gauge/gaugefield.h +++ b/src/gauge/gaugefield.h @@ -139,7 +139,7 @@ struct convert_prec { convert_prec(Gaugefield &gaugeIn) : gAcc_source(gaugeIn.getAccessor()) {} - HOST_DEVICE GSU3 operator()(gSiteMu site) { + __host__ __device__ GSU3 operator()(gSiteMu site) { return gAcc_source.template getLink(site); } }; diff --git 
a/src/gauge/gaugefield_device.cpp b/src/gauge/gaugefield_device.cpp index de8e3864..0a98eee5 100644 --- a/src/gauge/gaugefield_device.cpp +++ b/src/gauge/gaugefield_device.cpp @@ -16,10 +16,10 @@ struct fill_with_rand GSU3 my_mat; - HOST_DEVICE void initialize(__attribute__((unused)) gSite site){ + __host__ __device__ void initialize(__attribute__((unused)) gSite site){ } - HOST_DEVICE GSU3 operator()(gSite site, __attribute__((unused)) size_t mu){ + __host__ __device__ GSU3 operator()(gSite site, __attribute__((unused)) size_t mu){ my_mat.random(&_rand_state[site.isite]); return my_mat; } @@ -32,10 +32,10 @@ struct fill_with_gauss { GSU3 my_mat; - HOST_DEVICE void initialize(__attribute__((unused)) gSite site) { + __host__ __device__ void initialize(__attribute__((unused)) gSite site) { } - HOST_DEVICE GSU3 operator()(gSite site, __attribute__((unused)) size_t mu) { + __host__ __device__ GSU3 operator()(gSite site, __attribute__((unused)) size_t mu) { my_mat.gauss(&_rand_state[site.isite]); return my_mat; } @@ -47,7 +47,7 @@ struct UnitKernel{ gaugeAccessor gaugeAcc; explicit UnitKernel(Gaugefield& gauge) : gaugeAcc(gauge.getAccessor()){} - HOST_DEVICE GSU3 operator()(gSiteMu siteMu){ + __host__ __device__ GSU3 operator()(gSiteMu siteMu){ typedef GIndexer GInd; GSU3 temp; temp=gaugeAcc.template getLink(siteMu); diff --git a/src/modules/HISQ/staggeredPhases.h b/src/modules/HISQ/staggeredPhases.h index adef8f79..26e78809 100644 --- a/src/modules/HISQ/staggeredPhases.h +++ b/src/modules/HISQ/staggeredPhases.h @@ -8,7 +8,7 @@ struct calcStaggeredPhase { - inline HOST_DEVICE int operator()(const gSiteMu &siteMu) const { + inline __host__ __device__ int operator()(const gSiteMu &siteMu) const { typedef GIndexer GInd; @@ -36,7 +36,7 @@ struct calcStaggeredPhase { * */ struct calcStaggeredBoundary { - inline HOST_DEVICE int operator()(const gSiteMu &siteMu) const { + inline __host__ __device__ int operator()(const gSiteMu &siteMu) const { typedef GIndexer GInd; @@ -51,7 
+51,7 @@ struct calcStaggeredBoundary { template struct imagMuphase { - inline HOST_DEVICE GPUcomplex operator()(const gSiteMu &siteMu, double chmp) const { + inline __host__ __device__ GPUcomplex operator()(const gSiteMu &siteMu, double chmp) const { GPUcomplex img_chmp; diff --git a/src/modules/observables/FieldStrengthTensor.h b/src/modules/observables/FieldStrengthTensor.h index 18acbcb4..9ae59f84 100644 --- a/src/modules/observables/FieldStrengthTensor.h +++ b/src/modules/observables/FieldStrengthTensor.h @@ -26,7 +26,7 @@ struct plaqClover { plaqClover(gaugeAccessor acc) : acc(acc) {} - HOST_DEVICE inline GSU3 operator()(gSite site, int mu, int nu) { + __host__ __device__ inline GSU3 operator()(gSite site, int mu, int nu) { return Plaq_P(acc, site, mu, nu) + Plaq_Q(acc, site, mu, nu) @@ -43,7 +43,7 @@ struct rectClover { rectClover(gaugeAccessor acc) : acc(acc) {} - HOST_DEVICE inline GSU3 operator()(gSite site, int mu, int nu) { + __host__ __device__ inline GSU3 operator()(gSite site, int mu, int nu) { gSite origin = site; gSite up = GInd::site_up(site, nu); gSite twoUp = GInd::site_up(up, nu); @@ -154,7 +154,7 @@ struct FieldStrengthTensor { FieldStrengthTensor(gaugeAccessor acc) : acc(acc), plClov(acc) {} - HOST_DEVICE inline GSU3 operator()(gSite site, int mu, int nu) { + __host__ __device__ inline GSU3 operator()(gSite site, int mu, int nu) { //define a unitary matrix for the addition in the end GSU3 unityGSU3 = gsu3_one(); @@ -186,7 +186,7 @@ struct FieldStrengthTensor_imp { FieldStrengthTensor_imp(gaugeAccessor acc) : acc(acc), plClov(acc), rcClov(acc) {} - HOST_DEVICE inline GSU3 operator()(gSite site, int mu, int nu) { + __host__ __device__ inline GSU3 operator()(gSite site, int mu, int nu) { //define a unitary matrix for the addition in the end GSU3 unityGSU3 = gsu3_one(); diff --git a/src/spinor/spinorfield.h b/src/spinor/spinorfield.h index 65b71e61..4d32ea90 100644 --- a/src/spinor/spinorfield.h +++ b/src/spinor/spinorfield.h @@ -17,18 +17,18 @@ 
#include template -HOST_DEVICE constexpr inline Layout LayoutSwitcher(); +__host__ __device__ constexpr inline Layout LayoutSwitcher(); template <> -HOST_DEVICE constexpr inline Layout LayoutSwitcher() { +__host__ __device__ constexpr inline Layout LayoutSwitcher() { return All; } template <> -HOST_DEVICE constexpr inline Layout LayoutSwitcher() { +__host__ __device__ constexpr inline Layout LayoutSwitcher() { return Even; } template <> -HOST_DEVICE constexpr inline Layout LayoutSwitcher() { +__host__ __device__ constexpr inline Layout LayoutSwitcher() { return Odd; } @@ -443,14 +443,14 @@ auto operator / (Spinorfield struct convert_spinor_precision { - HOST_DEVICE void initialize(__attribute__((unused)) gSite& site){ + __host__ __device__ void initialize(__attribute__((unused)) gSite& site){ //We do not initialize anything } gVect3arrayAcc spinor_source; convert_spinor_precision(Spinorfield &spinorIn) : spinor_source(spinorIn.getAccessor()) {} - HOST_DEVICE auto operator()(gSiteStack site) { + __host__ __device__ auto operator()(gSiteStack site) { return spinor_source.template getElement(site); } From f7cd01abbb44ebcbdef9abbfaa2d359d674c18a6 Mon Sep 17 00:00:00 2001 From: Luis Altenkort Date: Thu, 1 Sep 2022 14:56:43 +0200 Subject: [PATCH 12/14] move CMAKE_CXX_FLAGS outside the conditional --- CMakeLists.txt | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8a80cdd8..7653a1a9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -141,21 +141,16 @@ endif() # Additional compiler flags +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -Wall -Wextra -Wno-comment -fPIC") if (USE_CUDA) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -Wall -Wextra -Wno-comment -fPIC") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3 -Wno-deprecated-gpu-targets --std=c++17 -arch=sm_${ARCHITECTURE} -Xcudafe --display_error_number -prec-div=true -prec-sqrt=true") - elseif (USE_HIP_AMD) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -Wall 
-Wextra -Wno-comment -fPIC -fgpu-rdc") - + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fgpu-rdc") add_definitions(-D__HIP_PLATFORM_AMD__) set(HIP_HIPCC_FLAGS ${HIP_HIPCC_FLAGS} "-O3 -std=c++17 -D__HIP_PLATFORM_AMD__ --amdgpu-target=${ARCHITECTURE} -fgpu-rdc") set(CMAKE_EXE_LINKER_FLAGS "-O3 -fgpu-rdc --hip-link") - elseif (USE_HIP_NVIDIA) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -Wall -Wextra -Wno-comment -fPIC") - add_definitions(-D__HIP_PLATFORM_NVIDIA__ -D__HIP_PLATFORM_NVCC__) set(HIP_NVCC_FLAGS "${HIP_NVCC_FLAGS} -O3 -std=c++17 -D__HIP_PLATFORM_NVCC__ -D__HIP_PLATFORM_NVIDIA__ --generate-code arch=compute_${ARCHITECTURE},code=sm_${ARCHITECTURE} --generate-code arch=compute_${ARCHITECTURE},code=compute_${ARCHITECTURE} -rdc=true") set(NVCC_LINK_FLAGS "${NVCC_LINK_FLAGS} -rdc=true") From 6332cabb7e487e7ea0cde30f0ba57d307a8c3fac Mon Sep 17 00:00:00 2001 From: Luis Altenkort Date: Thu, 1 Sep 2022 15:02:05 +0200 Subject: [PATCH 13/14] fix two find-replace mistakes --- src/explicit_instantiation_macros.h | 2 +- src/gauge/constructs/naikConstructs.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/explicit_instantiation_macros.h b/src/explicit_instantiation_macros.h index 20f379d0..d47f12c7 100644 --- a/src/explicit_instantiation_macros.h +++ b/src/explicit_instantiation_macros.h @@ -228,7 +228,7 @@ enum CompressionType { IF(BOOL(NO_GPU)) (FLOAT_LOOP_ALL(INIT_TEMPLATES, false)) #define INIT_ALL(INIT_TEMPLATES) \ - __device___LOOP_ALL(INIT_TEMPLATES) + DEVICE_LOOP_ALL(INIT_TEMPLATES) /// =================== Initialize Precision and Halos ========================================== diff --git a/src/gauge/constructs/naikConstructs.h b/src/gauge/constructs/naikConstructs.h index 7dc9c197..c41915c6 100644 --- a/src/gauge/constructs/naikConstructs.h +++ b/src/gauge/constructs/naikConstructs.h @@ -33,7 +33,7 @@ __device__ GSU3 inline naik3LinkUp(gaugeAccessor gAcc, gSite sit } /* template -DEVICE GSU3 inline naik3LinkDn(gaugeAccessor gAcc, gSite 
site, int mu) { +__device__ GSU3 inline naik3LinkDn(gaugeAccessor gAcc, gSite site, int mu) { typedef GIndexer GInd; GSU3 temp; From 774615969cc5de92006ce5e66f72b7bef2a2d693 Mon Sep 17 00:00:00 2001 From: lukas-mazur <59335056+lukas-mazur@users.noreply.github.com> Date: Mon, 12 Sep 2022 14:36:03 +0200 Subject: [PATCH 14/14] reverted __host__ <-> __device__ swap --- src/base/LatticeContainer.h | 12 +- src/base/gutils.h | 4 +- src/base/indexer/BulkIndexer.h | 238 +++++++++--------- src/base/indexer/HaloIndexer.h | 153 ++++++----- src/base/math/correlators.h | 12 +- src/base/math/gcomplex.h | 3 +- src/base/math/grnd.h | 9 +- src/base/math/gsu2.h | 62 ++--- src/base/math/gsu3.h | 209 ++++++++------- src/base/math/gvect3.h | 154 ++++++------ src/base/math/matrix4x4.h | 22 +- src/base/math/operators.h | 1 - src/base/math/simpleArray.h | 14 +- src/base/math/su3Exp.h | 4 +- src/base/memoryManagement.h | 4 +- src/base/runFunctors.h | 1 + src/gauge/constructs/gsvd.h | 4 +- src/gauge/gauge_kernels.cpp | 112 ++++----- src/gauge/gaugefield.h | 2 +- src/gauge/gaugefield_device.cpp | 6 +- src/modules/observables/FieldStrengthTensor.h | 8 +- src/spinor/spinorfield.h | 2 +- 22 files changed, 513 insertions(+), 523 deletions(-) diff --git a/src/base/LatticeContainer.h b/src/base/LatticeContainer.h index 0c953eed..058e88f3 100644 --- a/src/base/LatticeContainer.h +++ b/src/base/LatticeContainer.h @@ -47,30 +47,30 @@ class LatticeContainerAccessor : public MemoryAccessor { /// Set values. 
template - __host__ __device__ inline void setElement(const size_t isite, const floatT value) { + __device__ __host__ inline void setElement(const size_t isite, const floatT value) { auto *arr = reinterpret_cast(Array); arr[isite] = value; } template - __host__ __device__ inline void setElement(const gSite& site, const floatT value) { + __device__ __host__ inline void setElement(const gSite& site, const floatT value) { setValue(site.isite, value); } template - __host__ __device__ inline void setElement(const gSiteStack& site, const floatT value) { + __device__ __host__ inline void setElement(const gSiteStack& site, const floatT value) { setValue(site.isiteStack, value); } /// Get values. template - __host__ __device__ floatT getElement(const gSite& site) { + __device__ __host__ floatT getElement(const gSite& site) { return getElement(site.isite); } template - __host__ __device__ floatT getElement(const gSiteStack& site) { + __device__ __host__ floatT getElement(const gSiteStack& site) { return getElement(site.isiteStack); } template - __host__ __device__ inline floatT getElement(const size_t isite) { + __device__ __host__ inline floatT getElement(const size_t isite) { auto *arr = reinterpret_cast(Array); return arr[isite]; } diff --git a/src/base/gutils.h b/src/base/gutils.h index 28385acf..9e176fc2 100644 --- a/src/base/gutils.h +++ b/src/base/gutils.h @@ -54,7 +54,9 @@ class GpuError { /** * Utility method for speedy testing of whether a number is odd */ -__host__ __device__ inline bool isOdd(int cand) { return (cand & 0x1); } +__device__ __host__ inline bool isOdd(int cand) { return (cand & 0x1); } + + #endif /* UTIL_H */ diff --git a/src/base/indexer/BulkIndexer.h b/src/base/indexer/BulkIndexer.h index 85c7c52a..872aec61 100644 --- a/src/base/indexer/BulkIndexer.h +++ b/src/base/indexer/BulkIndexer.h @@ -30,8 +30,8 @@ struct sitexyzt { int y; int z; int t; - __host__ __device__ sitexyzt(int x, int y, int z, int t) : x(x), y(y), z(z), t(t) {}; - __host__ 
__device__ inline int& operator[](const int i) { + __device__ __host__ sitexyzt(int x, int y, int z, int t) : x(x), y(y), z(z), t(t) {}; + __device__ __host__ inline int& operator[](const int i) { if(i == 0) return x; if(i == 1) return y; if(i == 2) return z; @@ -58,9 +58,9 @@ struct gSite { sitexyzt coord, coordFull; // These constructors should only be called from GIndexer. - __host__ __device__ inline gSite() : isite(0), isiteFull(0), coord(0, 0, 0, 0), coordFull(0, 0, 0, 0) {} + __device__ __host__ inline gSite() : isite(0), isiteFull(0), coord(0, 0, 0, 0), coordFull(0, 0, 0, 0) {} - __host__ __device__ inline gSite(size_t isite, size_t isiteFull, sitexyzt coord, sitexyzt coordFull) : + __device__ __host__ inline gSite(size_t isite, size_t isiteFull, sitexyzt coord, sitexyzt coordFull) : isite(isite), isiteFull(isiteFull), coord(coord), coordFull(coordFull) {}; @@ -85,13 +85,13 @@ struct gSiteStack : public gSite { size_t isiteStackFull; size_t stack; - __host__ __device__ gSiteStack() : gSite(), isiteStack(0), isiteStackFull(0), stack(0){} + __device__ __host__ gSiteStack() : gSite(), isiteStack(0), isiteStackFull(0), stack(0){} - __host__ __device__ gSiteStack(size_t isite, size_t isiteFull, sitexyzt coord, + __device__ __host__ gSiteStack(size_t isite, size_t isiteFull, sitexyzt coord, sitexyzt coordFull, size_t isiteStack, size_t isiteStackFull, size_t stack) : gSite(isite, isiteFull, coord, coordFull), isiteStack(isiteStack), isiteStackFull(isiteStackFull), stack(stack){} - __host__ __device__ gSiteStack(gSite site, size_t isiteStack, size_t isiteStackFull, size_t stack) : + __device__ __host__ gSiteStack(gSite site, size_t isiteStack, size_t isiteStackFull, size_t stack) : gSite(site), isiteStack(isiteStack), isiteStackFull(isiteStackFull), stack(stack){} gSiteStack(const gSite) = delete; @@ -110,13 +110,13 @@ struct gSiteMu : public gSite { // Link direction. 
uint8_t mu; - __host__ __device__ gSiteMu() : gSite(), indexMuFull(0), mu(0){} + __device__ __host__ gSiteMu() : gSite(), indexMuFull(0), mu(0){} - __host__ __device__ gSiteMu(size_t isite, size_t isiteFull, sitexyzt coord, + __device__ __host__ gSiteMu(size_t isite, size_t isiteFull, sitexyzt coord, sitexyzt coordFull, size_t indexMuFull, uint8_t mu) : gSite(isite, isiteFull, coord, coordFull), indexMuFull(indexMuFull), mu(mu){} - __host__ __device__ gSiteMu(gSite site, size_t indexMuFull, uint8_t mu) + __device__ __host__ gSiteMu(gSite site, size_t indexMuFull, uint8_t mu) : gSite(site), indexMuFull(indexMuFull), mu(mu) {} gSiteMu(const gSite) = delete; @@ -225,7 +225,7 @@ struct LatticeData { gPosZ(_gPosZ), gPosT(_gPosT) {} - __host__ __device__ sitexyzt globalPos(sitexyzt n) { + __device__ __host__ sitexyzt globalPos(sitexyzt n) { sitexyzt coord = sitexyzt(gPosX + n.x,gPosY + n.y,gPosZ + n.z,gPosT + n.t); @@ -237,7 +237,7 @@ struct LatticeData { return coord; } - __host__ __device__ bool isLocal(sitexyzt globalsite){ + __device__ __host__ bool isLocal(sitexyzt globalsite){ //! make sure globalsite is valid, i.e. not negative or greater than lattice extents! 
// consider lattice 20 20 20 20 with split 2 2 1 1 @@ -285,7 +285,7 @@ struct LatticeData { return LatticeDimensions(lx,ly,lz,lt); } - __host__ __device__ sitexyzt globalLatticeXYZT() { + __device__ __host__ sitexyzt globalLatticeXYZT() { return sitexyzt(globLX,globLY,globLZ,globLT); } @@ -313,8 +313,8 @@ void initIndexer(const size_t HaloDepth, const LatticeParameters ¶m, Communi template class GIndexer { public: - __host__ __device__ GIndexer() = default; - __host__ __device__ inline static LatticeData getLatData() { + __device__ __host__ GIndexer() = default; + __device__ __host__ inline static LatticeData getLatData() { #ifdef __GPU_ARCH__ return globLatDataGPU[HaloDepth]; @@ -325,7 +325,7 @@ class GIndexer { /// ---------------------------------------------------------------------------------------------------- getSite* /// BULK (NO HALOS) - __host__ __device__ inline static gSite getSite(size_t isite) { + __device__ __host__ inline static gSite getSite(size_t isite) { sitexyzt coord(0, 0, 0, 0); sitexyzt coordFull(0, 0, 0, 0); size_t isiteFull = 0; @@ -344,10 +344,10 @@ class GIndexer { } return gSite(isite, isiteFull, coord, coordFull); } - __host__ __device__ inline static gSite getSite(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx) { + __device__ __host__ inline static gSite getSite(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx) { return getSite(_blockDim.x * _blockIdx.x + _threadIdx.x); } - __host__ __device__ inline static gSite getSite(int x, int y, int z, int t) { + __device__ __host__ inline static gSite getSite(int x, int y, int z, int t) { sitexyzt coord = sitexyzt(x, y, z, t); sitexyzt coordFull = coordToFullCoord(coord); size_t isite = 0; @@ -362,7 +362,7 @@ class GIndexer { } return gSite(isite, isiteFull, coord, coordFull); } - __host__ __device__ inline static gSite getSite(sitexyzt coord) { + __device__ __host__ inline static gSite getSite(sitexyzt coord) { return 
getSite(coord.x,coord.y,coord.z,coord.t); } @@ -370,7 +370,7 @@ class GIndexer { happen whenever you call a kernel running over spacelike indices only. All coordinates will be of the form (x, y, z, 0). The indices isite and isiteFull will by bounded by their respective 3-volumes. The indexing needs to change, because there are fewer sites than with the full bulk.*/ - __host__ __device__ inline static gSite getSiteSpatial(size_t isite) { + __device__ __host__ inline static gSite getSiteSpatial(size_t isite) { sitexyzt coord(0, 0, 0, 0); sitexyzt coordFull(0, 0, 0, 0); size_t isiteFull = 0; @@ -389,11 +389,10 @@ class GIndexer { } return gSite(isite, isiteFull, coord, coordFull); } - - __host__ __device__ inline static gSite getSiteSpatial(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx) { + __device__ __host__ inline static gSite getSiteSpatial(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx) { return getSiteSpatial(_blockDim.x * _blockIdx.x + _threadIdx.x); } - __host__ __device__ inline static gSite getSiteSpatial(int x, int y, int z, int t) { + __device__ __host__ inline static gSite getSiteSpatial(int x, int y, int z, int t) { // There is probably a way to allow t>0. My worry right now is that there is that if you allow // t>0, there is no longer a one-to-one correspondence between isite and coord. 
sitexyzt coord = sitexyzt(x, y, z, t); @@ -411,7 +410,7 @@ class GIndexer { } /// FULL (WITH HALOS) - __host__ __device__ inline static gSite getSiteFull(size_t isiteFull) { + __device__ __host__ inline static gSite getSiteFull(size_t isiteFull) { sitexyzt coord(0, 0, 0, 0); sitexyzt coordFull(0, 0, 0, 0); size_t isite = 0; @@ -430,12 +429,10 @@ class GIndexer { } return gSite(isite, isiteFull, coord, coordFull); } - - __host__ __device__ inline static gSite getSiteFull(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx) { + __device__ __host__ inline static gSite getSiteFull(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx) { return getSiteFull(_blockDim.x * _blockIdx.x + _threadIdx.x); } - - __host__ __device__ inline static gSite getSiteFull(int x, int y, int z, int t) { + __device__ __host__ inline static gSite getSiteFull(int x, int y, int z, int t) { sitexyzt coordFull = sitexyzt(x, y, z, t); sitexyzt coord = fullCoordToCoord(coordFull); size_t isite = 0; @@ -449,11 +446,11 @@ class GIndexer { } return gSite(isite, isiteFull, coord, coordFull); } - __host__ __device__ inline static gSite getSiteFull(sitexyzt coordfull) { + __device__ __host__ inline static gSite getSiteFull(sitexyzt coordfull) { return getSiteFull(coordfull.x,coordfull.y,coordfull.z,coordfull.t); } - __host__ __device__ inline static gSite getSiteSpatialFull(size_t isiteFull) { + __device__ __host__ inline static gSite getSiteSpatialFull(size_t isiteFull) { sitexyzt coord(0, 0, 0, 0); sitexyzt coordFull(0, 0, 0, 0); size_t isite = 0; @@ -472,11 +469,10 @@ class GIndexer { } return gSite(isite, isiteFull, coord, coordFull); } - - __host__ __device__ inline static gSite getSiteSpatialFull(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx) { + __device__ __host__ inline static gSite getSiteSpatialFull(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx) { return getSiteSpatialFull(_blockDim.x * _blockIdx.x + 
_threadIdx.x); } - __host__ __device__ inline static gSite getSiteSpatialFull(int x, int y, int z, int t) { + __device__ __host__ inline static gSite getSiteSpatialFull(int x, int y, int z, int t) { sitexyzt coordFull = sitexyzt(x, y, z, t); sitexyzt coord = fullCoordToCoord(coordFull); size_t isite = 0; @@ -495,62 +491,62 @@ class GIndexer { /// BULK (NO HALOS) //! two helper functions for getSiteMu* - __host__ __device__ inline static size_t coordMuToIndexMu_Full(const int x, const int y, const int z, const int t, const int mu) { + __device__ __host__ inline static size_t coordMuToIndexMu_Full(const int x, const int y, const int z, const int t, const int mu) { return (((x + y*getLatData().vol1Full + z*getLatData().vol2Full + t*getLatData().vol3Full) >> 0x1) // integer division by two +getLatData().sizehFull*((x + y + z + t) & 0x1) // 0 if x+y+z+t is even, 1 if it is odd + mu*getLatData().vol4Full); } - __host__ __device__ inline static size_t indexMu_Full(const gSite site, const int mu) { + __device__ __host__ inline static size_t indexMu_Full(const gSite site, const int mu) { return coordMuToIndexMu_Full(site.coordFull.x, site.coordFull.y, site.coordFull.z, site.coordFull.t, mu); } - __host__ __device__ inline static gSiteMu getSiteMu(size_t isite, size_t mu) { + __device__ __host__ inline static gSiteMu getSiteMu(size_t isite, size_t mu) { gSite site(getSite(isite)); size_t indexmufull = indexMu_Full(site, mu); return gSiteMu(site, indexmufull, mu); } - __host__ __device__ inline static gSiteMu getSiteMu(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx, size_t mu){ + __device__ __host__ inline static gSiteMu getSiteMu(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx, size_t mu){ return getSiteMu(_blockDim.x * _blockIdx.x + _threadIdx.x, mu); } - __host__ __device__ inline static gSiteMu getSiteMu(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx){ + __device__ __host__ inline static gSiteMu 
getSiteMu(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx){ //! It gets the mu index from the y direction of the block. return getSiteMu(_blockDim.x * _blockIdx.x + _threadIdx.x, _threadIdx.y); } - __host__ __device__ inline static gSiteMu getSiteMu(gSite site, size_t mu) { + __device__ __host__ inline static gSiteMu getSiteMu(gSite site, size_t mu) { size_t indexmufull = indexMu_Full(site, mu); return gSiteMu(site, indexmufull, mu); } - __host__ __device__ inline static gSiteMu getSiteMu(int x, int y, int z, int t, size_t mu){ + __device__ __host__ inline static gSiteMu getSiteMu(int x, int y, int z, int t, size_t mu){ return getSiteMu(getSite(x, y, z, t), mu); } - __host__ __device__ inline static gSiteMu getSiteSpatialMu(size_t isite, size_t mu) { + __device__ __host__ inline static gSiteMu getSiteSpatialMu(size_t isite, size_t mu) { gSite site(getSiteSpatial(isite)); size_t indexmufull = indexMu_Full(site, mu); return gSiteMu(site, indexmufull, mu); } /// FULL (WITH HALOS) - __host__ __device__ inline static gSiteMu getSiteMuFull(size_t isiteFull, size_t mu) { + __device__ __host__ inline static gSiteMu getSiteMuFull(size_t isiteFull, size_t mu) { gSite site(getSiteFull(isiteFull)); size_t indexmufull = indexMu_Full(site, mu); return gSiteMu(site, indexmufull, mu); } - __host__ __device__ inline static gSiteMu getSiteMuFull(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx, size_t mu){ + __device__ __host__ inline static gSiteMu getSiteMuFull(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx, size_t mu){ return getSiteMuFull(_blockDim.x * _blockIdx.x + _threadIdx.x, mu); } - __host__ __device__ inline static gSiteMu getSiteMuFull(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx){ + __device__ __host__ inline static gSiteMu getSiteMuFull(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx){ //!get the mu index from the y direction of the block. 
return getSiteMuFull(_blockDim.x * _blockIdx.x + _threadIdx.x, _threadIdx.y); } - __host__ __device__ inline static gSiteMu getSiteMuFull(int x, int y, int z, int t, size_t mu){ + __device__ __host__ inline static gSiteMu getSiteMuFull(int x, int y, int z, int t, size_t mu){ return getSiteMu(getSiteFull(x, y, z, t), mu); } /// --------------------------------------------------------------------------------------------------- getSiteStack /// BULK (NO HALOS) - __host__ __device__ inline static gSiteStack getSiteStack(const gSite& site, const size_t stack){ + __device__ __host__ inline static gSiteStack getSiteStack(const gSite& site, const size_t stack){ size_t isiteStack; size_t isiteStackFull; if (LatLayout == All) { @@ -563,24 +559,23 @@ class GIndexer { gSiteStack ret(site, isiteStack, isiteStackFull, stack); return ret; } - __host__ __device__ inline static gSiteStack getSiteStack(const size_t isite, const size_t stack){ + __device__ __host__ inline static gSiteStack getSiteStack(const size_t isite, const size_t stack){ return getSiteStack(getSite(isite), stack); } - - __host__ __device__ inline static gSiteStack getSiteStack(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx){ + __device__ __host__ inline static gSiteStack getSiteStack(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx){ return getSiteStack(_blockDim.x * _blockIdx.x + _threadIdx.x, _threadIdx.y); } - __host__ __device__ inline static gSiteStack getSiteStack(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx, const size_t stack){ + __device__ __host__ inline static gSiteStack getSiteStack(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx, const size_t stack){ return getSiteStack(_blockDim.x * _blockIdx.x + _threadIdx.x, stack); } - __host__ __device__ inline static gSiteStack getSiteStack(int x, int y, int z, int t, int stack) { + __device__ __host__ inline static gSiteStack getSiteStack(int x, int y, int z, int t, 
int stack) { return getSiteStack(getSite(x, y, z, t), stack); } - __host__ __device__ inline static gSiteStack getSiteStack(sitexyzt coord, int stack) { + __device__ __host__ inline static gSiteStack getSiteStack(sitexyzt coord, int stack) { return getSiteStack(getSite(coord.x, coord.y, coord.z, coord.t), stack); } - __host__ __device__ inline static gSiteStack getSiteStackOdd(const gSite& site, const size_t stack){ + __device__ __host__ inline static gSiteStack getSiteStackOdd(const gSite& site, const size_t stack){ size_t isiteStack; size_t isiteStackFull; if (LatLayout == All) { @@ -593,60 +588,59 @@ class GIndexer { gSiteStack ret(site, isiteStack, isiteStackFull, stack); return ret; } - __host__ __device__ inline static gSiteStack getSiteStackOdd(const size_t isite, const size_t stack){ + __device__ __host__ inline static gSiteStack getSiteStackOdd(const size_t isite, const size_t stack){ return getSiteStackOdd(getSite(isite), stack); } - - __host__ __device__ inline static gSiteStack getSiteStackOdd(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx){ + __device__ __host__ inline static gSiteStack getSiteStackOdd(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx){ return getSiteStackOdd(_blockDim.x * _blockIdx.x + _threadIdx.x, _threadIdx.y); } - __host__ __device__ inline static gSiteStack getSiteStackOdd(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx, const size_t stack){ + __device__ __host__ inline static gSiteStack getSiteStackOdd(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx, const size_t stack){ return getSiteStackOdd(_blockDim.x * _blockIdx.x + _threadIdx.x, stack); } /// FULL (WITH HALOS) - __host__ __device__ inline static gSiteStack getSiteStackFull(const size_t isiteFull, const size_t stack){ + __device__ __host__ inline static gSiteStack getSiteStackFull(const size_t isiteFull, const size_t stack){ return getSiteStack(getSiteFull(isiteFull), stack); } - 
__host__ __device__ inline static gSiteStack getSiteStackFull(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx){ + __device__ __host__ inline static gSiteStack getSiteStackFull(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx){ gSiteStack ret = getSiteStackFull(_blockDim.x * _blockIdx.x + _threadIdx.x, _threadIdx.y); return ret; } - __host__ __device__ inline static gSiteStack getSiteStackFull(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx, const size_t stack){ + __device__ __host__ inline static gSiteStack getSiteStackFull(const dim3& _blockDim, const uint3& _blockIdx, const uint3& _threadIdx, const size_t stack){ gSiteStack ret = getSiteStackFull(_blockDim.x * _blockIdx.x + _threadIdx.x, stack); return ret; } - __host__ __device__ inline static gSiteStack getSiteStackFull(int x, int y, int z, int t, int stack) { + __device__ __host__ inline static gSiteStack getSiteStackFull(int x, int y, int z, int t, int stack) { return getSiteStack(getSiteFull(x, y, z, t), stack); } /// ----------------------------------------------------------------------------------- CONVERT BETWEEN EVEN AND ODD - template __host__ __device__ inline static gSite convertSite(const gSite& site){ + template __device__ __host__ inline static gSite convertSite(const gSite& site){ return GIndexer::getSite(site.coord.x, site.coord.y, site.coord.z, site.coord.t); } - template __host__ __device__ inline static gSiteMu convertSite(const gSiteMu& site){ + template __device__ __host__ inline static gSiteMu convertSite(const gSiteMu& site){ return GIndexer::getSiteMu(site.coord.x, site.coord.y, site.coord.z, site.coord.t, site.mu); } - template __host__ __device__ inline static gSiteStack convertSite(const gSiteStack& site){ + template __device__ __host__ inline static gSiteStack convertSite(const gSiteStack& site){ return GIndexer::getSiteStack(site.coord.x, site.coord.y, site.coord.z, site.coord.t, site.stack); } //! 
Given an Even/Odd gSite object, this returns an All gSite object. - __host__ __device__ inline static gSite convertToAll(gSite& site) { + __device__ __host__ inline static gSite convertToAll(gSite& site) { size_t isite = site.isite + (LatLayout == Odd)*getLatData().sizeh; size_t isiteFull = site.isiteFull + (LatLayout == Odd)*getLatData().sizehFull; return gSite(isite, isiteFull, site.coord, site.coordFull); } /// ------------------------------------------------ CONVERT BETWEEN BULK SPACETIME COORDINATES AND FULL COORDINATES - __host__ __device__ inline static sitexyzt coordToFullCoord(sitexyzt coord) { + __device__ __host__ inline static sitexyzt coordToFullCoord(sitexyzt coord) { coord.x += getLatData().HaloDepth[0]; coord.y += getLatData().HaloDepth[1]; coord.z += getLatData().HaloDepth[2]; coord.t += getLatData().HaloDepth[3]; return coord; } - __host__ __device__ inline static sitexyzt fullCoordToCoord(sitexyzt fullCoord) { + __device__ __host__ inline static sitexyzt fullCoordToCoord(sitexyzt fullCoord) { fullCoord.x -= getLatData().HaloDepth[0]; fullCoord.y -= getLatData().HaloDepth[1]; fullCoord.z -= getLatData().HaloDepth[2]; @@ -654,7 +648,7 @@ class GIndexer { return fullCoord; } - __host__ __device__ inline static sitexyzt globalCoordToLocalCoord(sitexyzt coord) { + __device__ __host__ inline static sitexyzt globalCoordToLocalCoord(sitexyzt coord) { coord.x -= getLatData().gPosX; coord.y -= getLatData().gPosY; coord.z -= getLatData().gPosZ; @@ -664,43 +658,43 @@ class GIndexer { /// -------------------------------------------------------------------- CONVERT SPACETIME COORDINATES TO DATA INDEX /// BULK (NO HALOS) - __host__ __device__ inline static size_t coordToIndex_Bulk(const sitexyzt coord) { + __device__ __host__ inline static size_t coordToIndex_Bulk(const sitexyzt coord) { return (((coord.x + coord.y*getLatData().vol1 + coord.z*getLatData().vol2 + coord.t*getLatData().vol3) >> 0x1) // integer division by two +getLatData().sizeh * ((coord.x + 
coord.y + coord.z + coord.t) & 0x1)); // 0 if x+y+z+t is even, 1 if it is odd } - __host__ __device__ inline static size_t coordToIndex_Bulk_eo(const sitexyzt coord) { + __device__ __host__ inline static size_t coordToIndex_Bulk_eo(const sitexyzt coord) { return ((coord.x + coord.y*getLatData().vol1 + coord.z*getLatData().vol2 + coord.t*getLatData().vol3) >> 0x1); } - __host__ __device__ inline static size_t coordToIndex_SpatialBulk(const sitexyzt coord) { + __device__ __host__ inline static size_t coordToIndex_SpatialBulk(const sitexyzt coord) { return (((coord.x + coord.y*getLatData().vol1 + coord.z*getLatData().vol2) >> 0x1) + getLatData().vol3h*((coord.x + coord.y + coord.z) & 0x1)); } - __host__ __device__ inline static size_t coordToIndex_SpatialBulk_eo(const sitexyzt coord) { + __device__ __host__ inline static size_t coordToIndex_SpatialBulk_eo(const sitexyzt coord) { return ((coord.x + coord.y*getLatData().vol1 + coord.z*getLatData().vol2) >> 0x1); } /// FULL (WITH HALOS) - __host__ __device__ inline static size_t coordToIndex_Full(const sitexyzt coordFull) { + __device__ __host__ inline static size_t coordToIndex_Full(const sitexyzt coordFull) { return (((coordFull.x + coordFull.y*getLatData().vol1Full + coordFull.z*getLatData().vol2Full + coordFull.t*getLatData().vol3Full) >> 0x1) + getLatData().sizehFull*((coordFull.x + coordFull.y + coordFull.z + coordFull.t) & 0x1)); } - __host__ __device__ inline static size_t coordToIndex_Full_eo(const sitexyzt coordFull) { + __device__ __host__ inline static size_t coordToIndex_Full_eo(const sitexyzt coordFull) { return ((coordFull.x + coordFull.y * getLatData().vol1Full + coordFull.z * getLatData().vol2Full + coordFull.t * getLatData().vol3Full) >> 0x1); } - __host__ __device__ inline static size_t coordToIndex_SpatialFull(const sitexyzt coordFull) { + __device__ __host__ inline static size_t coordToIndex_SpatialFull(const sitexyzt coordFull) { return (((coordFull.x + coordFull.y*getLatData().vol1Full + 
coordFull.z*getLatData().vol2Full) >> 0x1) + getLatData().vol3hFull*((coordFull.x + coordFull.y + coordFull.z) & 0x1)); } - __host__ __device__ inline static size_t coordToIndex_SpatialFull_eo(const sitexyzt coordFull) { + __device__ __host__ inline static size_t coordToIndex_SpatialFull_eo(const sitexyzt coordFull) { return ((coordFull.x + coordFull.y*getLatData().vol1Full + coordFull.z*getLatData().vol2Full) >> 0x1); } @@ -713,7 +707,7 @@ class GIndexer { /// -------------------------------------------------------------------- CONVERT DATA INDEX TO SPACETIME COORDINATES /// BULK (NO HALOS) - __host__ __device__ inline static sitexyzt indexToCoord(const size_t site) { + __device__ __host__ inline static sitexyzt indexToCoord(const size_t site) { int x, y, z, t; int par, normInd, tmp; @@ -751,7 +745,7 @@ class GIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt indexToCoord_eo(const size_t site, int par) { + __device__ __host__ inline static sitexyzt indexToCoord_eo(const size_t site, int par) { int x, y, z, t; int tmp; // double site @@ -769,7 +763,7 @@ class GIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt indexToCoord_Spatial(const size_t site) { + __device__ __host__ inline static sitexyzt indexToCoord_Spatial(const size_t site) { int x, y, z, t; int par, normInd, tmp; @@ -788,7 +782,7 @@ class GIndexer { return sitexyzt(x,y,z,t); } - __host__ __device__ inline static sitexyzt indexToCoord_Spatial_eo(const size_t site, int par) { + __device__ __host__ inline static sitexyzt indexToCoord_Spatial_eo(const size_t site, int par) { int x, y, z, t; int tmp; size_t sited = site << 0x1; @@ -806,7 +800,7 @@ class GIndexer { return sitexyzt(x, y, z, t); } /// FULL (WITH HALOS) - __host__ __device__ inline static sitexyzt indexToCoord_Full(const size_t siteFull) { + __device__ __host__ inline static sitexyzt indexToCoord_Full(const size_t siteFull) { int x, y, z, t; int par, normInd, tmp; @@ -826,7 
+820,7 @@ class GIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt indexToCoord_SpatialFull(const size_t siteFull) { + __device__ __host__ inline static sitexyzt indexToCoord_SpatialFull(const size_t siteFull) { int x, y, z, t; int par, normInd, tmp; @@ -846,7 +840,7 @@ class GIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt indexToCoord_Full_eo(const size_t siteFull, int par) { + __device__ __host__ inline static sitexyzt indexToCoord_Full_eo(const size_t siteFull, int par) { int x, y, z, t; int tmp; @@ -863,7 +857,7 @@ class GIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt indexToCoord_SpatialFull_eo(const size_t siteFull, int par) { + __device__ __host__ inline static sitexyzt indexToCoord_SpatialFull_eo(const size_t siteFull, int par) { int x, y, z, t; int tmp; @@ -884,7 +878,7 @@ class GIndexer { //! This function is needed when one wants to have the sites time ordered. For example if one wants to reduce only //! values on each timeslice. - __host__ __device__ inline static size_t siteTimeOrdered(const gSite &site) { + __device__ __host__ inline static size_t siteTimeOrdered(const gSite &site) { sitexyzt c = site.coord; return c.x + c.y*getLatData().vol1 + c.z*getLatData().vol2 + c.t*getLatData().vol3; } @@ -896,19 +890,19 @@ class GIndexer { //! time, this means you cannot pass these functions a dynamic argument. 
/// --------------------------------------------------------------------------------------- site_move: ONE DIRECTION - template __host__ __device__ inline static gSite site_move(const gSite &s, const int mu) { + template __device__ __host__ inline static gSite site_move(const gSite &s, const int mu) { sitexyzt tmp = site_move(s.coordFull, mu); return getSiteFull(tmp.x, tmp.y, tmp.z, tmp.t); } - template __host__ __device__ inline static gSiteMu site_move(const gSiteMu &s, const int mu) { + template __device__ __host__ inline static gSiteMu site_move(const gSiteMu &s, const int mu) { sitexyzt tmp = site_move(s.coordFull, mu); return getSiteMuFull(tmp.x, tmp.y, tmp.z, tmp.t, s.mu); } - template __host__ __device__ inline static gSiteStack site_move(const gSiteStack &s, const int mu) { + template __device__ __host__ inline static gSiteStack site_move(const gSiteStack &s, const int mu) { sitexyzt tmp = site_move(s.coordFull, mu); return getSiteStackFull(tmp.x, tmp.y, tmp.z, tmp.t, s.stack); } - template __host__ __device__ inline static sitexyzt site_move(sitexyzt s, const int mu) { + template __device__ __host__ inline static sitexyzt site_move(sitexyzt s, const int mu) { int x = s.x; int y = s.y; @@ -975,19 +969,19 @@ class GIndexer { } /// -------------------------------------------------------------------------------------- site_move: TWO DIRECTIONS - template __host__ __device__ inline static gSite site_move(const gSite &s, const int mu, const int nu) { + template __device__ __host__ inline static gSite site_move(const gSite &s, const int mu, const int nu) { sitexyzt tmp = site_move(s.coordFull, mu, nu); return getSiteFull(tmp.x, tmp.y, tmp.z, tmp.t); } - template __host__ __device__ inline static gSiteMu site_move(const gSiteMu &s, const int mu, const int nu) { + template __device__ __host__ inline static gSiteMu site_move(const gSiteMu &s, const int mu, const int nu) { sitexyzt tmp = site_move(s.coordFull, mu, nu); return getSiteMuFull(tmp.x, tmp.y, tmp.z, 
tmp.t, s.mu); } - template __host__ __device__ inline static gSiteStack site_move(const gSiteStack &s, const int mu, const int nu) { + template __device__ __host__ inline static gSiteStack site_move(const gSiteStack &s, const int mu, const int nu) { sitexyzt tmp = site_move(s.coordFull, mu, nu); return getSiteStackFull(tmp.x, tmp.y, tmp.z, tmp.t, s.stack); } - template __host__ __device__ inline static sitexyzt site_move(const sitexyzt &s, const int mu, const int nu) { + template __device__ __host__ inline static sitexyzt site_move(const sitexyzt &s, const int mu, const int nu) { int x = s.x; int y = s.y; int z = s.z; @@ -1110,22 +1104,22 @@ class GIndexer { /// ------------------------------------------------------------------------------------ site_move: THREE DIRECTIONS template - __host__ __device__ inline static gSite site_move(const gSite &s, const int mu, const int nu, const int rho) { + __device__ __host__ inline static gSite site_move(const gSite &s, const int mu, const int nu, const int rho) { sitexyzt tmp = site_move(s.coordFull, mu, nu, rho); return getSiteFull(tmp.x, tmp.y, tmp.z, tmp.t); } template - __host__ __device__ inline static gSiteMu site_move(const gSiteMu &s, const int mu, const int nu, const int rho) { + __device__ __host__ inline static gSiteMu site_move(const gSiteMu &s, const int mu, const int nu, const int rho) { sitexyzt tmp = site_move(s.coordFull, mu, nu, rho); return getSiteMuFull(tmp.x, tmp.y, tmp.z, tmp.t, s.mu); } template - __host__ __device__ inline static gSiteStack site_move(const gSiteStack &s, const int mu, const int nu, const int rho) { + __device__ __host__ inline static gSiteStack site_move(const gSiteStack &s, const int mu, const int nu, const int rho) { sitexyzt tmp = site_move(s.coordFull, mu, nu, rho); return getSiteStackFull(tmp.x, tmp.y, tmp.z, tmp.t, s.stack); } template - __host__ __device__ inline static sitexyzt site_move(const sitexyzt &s, const int mu, const int nu, const int rho) { + __device__ __host__ 
inline static sitexyzt site_move(const sitexyzt &s, const int mu, const int nu, const int rho) { int x = s.x; int y = s.y; int z = s.z; @@ -1304,22 +1298,22 @@ class GIndexer { /// ------------------------------------------------------------------------------------- site_move: FOUR DIRECTIONS template - __host__ __device__ inline static gSite site_move(const gSite &s, const int mu, const int nu, const int rho, const int sig) { + __device__ __host__ inline static gSite site_move(const gSite &s, const int mu, const int nu, const int rho, const int sig) { sitexyzt tmp = site_move(s.coordFull, mu, nu, rho, sig); return getSiteFull(tmp.x, tmp.y, tmp.z, tmp.t); } template - __host__ __device__ inline static gSiteMu site_move(const gSiteMu &s, const int mu, const int nu, const int rho, const int sig) { + __device__ __host__ inline static gSiteMu site_move(const gSiteMu &s, const int mu, const int nu, const int rho, const int sig) { sitexyzt tmp = site_move(s.coordFull, mu, nu, rho, sig); return getSiteMuFull(tmp.x, tmp.y, tmp.z, tmp.t, s.mu); } template - __host__ __device__ inline static gSiteStack site_move(const gSiteStack &s, const int mu, const int nu, const int rho, const int sig) { + __device__ __host__ inline static gSiteStack site_move(const gSiteStack &s, const int mu, const int nu, const int rho, const int sig) { sitexyzt tmp = site_move(s.coordFull, mu, nu, rho, sig); return getSiteStackFull(tmp.x, tmp.y, tmp.z, tmp.t, s.stack); } template - __host__ __device__ inline static sitexyzt site_move(const sitexyzt &s, const int mu, const int nu, const int rho, const int sig) { + __device__ __host__ inline static sitexyzt site_move(const sitexyzt &s, const int mu, const int nu, const int rho, const int sig) { int x = s.x; int y = s.y; int z = s.z; @@ -1553,55 +1547,55 @@ class GIndexer { } /// ------------------------------------------------------------------------------------------------ site_up and site_dn - template __host__ __device__ inline static T 
site_up(const T &s, const int mu) { + template __device__ __host__ inline static T site_up(const T &s, const int mu) { return site_move<1>(s, mu); } - template __host__ __device__ inline static T site_dn(const T &s, const int mu) { + template __device__ __host__ inline static T site_dn(const T &s, const int mu) { return site_move<-1>(s, mu); } - template __host__ __device__ inline static T site_up_up(const T &s, const int mu, const int nu) { + template __device__ __host__ inline static T site_up_up(const T &s, const int mu, const int nu) { #ifdef __GPU_ARCH__ return site_move<1, 1>(s, mu, nu); #else return site_up(site_up(s, mu), nu); #endif } - template __host__ __device__ inline static T site_up_dn(const T &s, const int mu, const int nu) { + template __device__ __host__ inline static T site_up_dn(const T &s, const int mu, const int nu) { #ifdef __GPU_ARCH__ return site_move<1, -1>(s, mu, nu); #else return site_dn(site_up(s, mu), nu); #endif } - template __host__ __device__ inline static T site_dn_dn(const T &s, const int mu, const int nu) { + template __device__ __host__ inline static T site_dn_dn(const T &s, const int mu, const int nu) { #ifdef __GPU_ARCH__ return site_move<-1, -1>(s, mu, nu); #else return site_dn(site_dn(s, mu), nu); #endif } - template __host__ __device__ inline static T site_up_up_up(const T &s, const int mu, const int nu, const int rho) { + template __device__ __host__ inline static T site_up_up_up(const T &s, const int mu, const int nu, const int rho) { #ifdef __GPU_ARCH__ return site_move<1, 1, 1>(s, mu, nu, rho); #else return site_up(site_up_up(s, mu, nu), rho); #endif } - template __host__ __device__ inline static T site_up_up_dn(const T &s, const int mu, const int nu, const int rho) { + template __device__ __host__ inline static T site_up_up_dn(const T &s, const int mu, const int nu, const int rho) { #ifdef __GPU_ARCH__ return site_move<1, 1, -1>(s, mu, nu, rho); #else return site_dn(site_up_up(s, mu, nu), rho); #endif } - template 
__host__ __device__ inline static T site_up_dn_dn(const T &s, const int mu, const int nu, const int rho) { + template __device__ __host__ inline static T site_up_dn_dn(const T &s, const int mu, const int nu, const int rho) { #ifdef __GPU_ARCH__ return site_move<1, -1, -1>(s, mu, nu, rho); #else return site_dn(site_up_dn(s, mu, nu), rho); #endif } - template __host__ __device__ inline static T site_dn_dn_dn(const T &s, const int mu, const int nu, const int rho) { + template __device__ __host__ inline static T site_dn_dn_dn(const T &s, const int mu, const int nu, const int rho) { #ifdef __GPU_ARCH__ return site_move<-1, -1, -1>(s, mu, nu, rho); #else @@ -1609,70 +1603,70 @@ class GIndexer { #endif } //! The following are currently unused but can be commented in if needed: - template __host__ __device__ inline static T site_up_up_up_up(const T &s, const int mu, const int nu, const int rho, const int sig) { + template __device__ __host__ inline static T site_up_up_up_up(const T &s, const int mu, const int nu, const int rho, const int sig) { #ifdef __GPU_ARCH__ return site_move<1, 1, 1, 1>(s, mu, nu, rho, sig); #else return site_up(site_up_up_up(s, mu, nu, rho), sig); #endif } - template __host__ __device__ inline static T site_up_up_up_dn(const T &s, const int mu, const int nu, const int rho, const int sig) { + template __device__ __host__ inline static T site_up_up_up_dn(const T &s, const int mu, const int nu, const int rho, const int sig) { #ifdef __GPU_ARCH__ return site_move<1, 1, 1, -1>(s, mu, nu, rho, sig); #else return site_dn(site_up_up_up(s, mu, nu, rho), sig); #endif } - template __host__ __device__ inline static T site_up_up_dn_dn(const T &s, const int mu, const int nu, const int rho, const int sig) { + template __device__ __host__ inline static T site_up_up_dn_dn(const T &s, const int mu, const int nu, const int rho, const int sig) { #ifdef __GPU_ARCH__ return site_move<1, 1, -1, -1>(s, mu, nu, rho, sig); #else return site_dn(site_up_up_dn(s, mu, nu, rho), 
sig); #endif } - template __host__ __device__ inline static T site_up_dn_dn_dn(const T &s, const int mu, const int nu, const int rho, const int sig) { + template __device__ __host__ inline static T site_up_dn_dn_dn(const T &s, const int mu, const int nu, const int rho, const int sig) { #ifdef __GPU_ARCH__ return site_move<1, -1, -1, -1>(s, mu, nu, rho, sig); #else return site_dn(site_up_dn_dn(s, mu, nu, rho), sig); #endif } - template __host__ __device__ inline static T site_dn_dn_dn_dn(const T &s, const int mu, const int nu, const int rho, const int sig) { + template __device__ __host__ inline static T site_dn_dn_dn_dn(const T &s, const int mu, const int nu, const int rho, const int sig) { #ifdef __GPU_ARCH__ return site_move<-1, -1, -1, -1>(s, mu, nu, rho, sig); #else return site_dn(site_dn_dn_dn(s, mu, nu, rho), sig); #endif } - template __host__ __device__ inline static T site_2up_up(const T &s, const int mu, const int nu) { + template __device__ __host__ inline static T site_2up_up(const T &s, const int mu, const int nu) { #ifdef __GPU_ARCH__ return site_move<2, 1>(s, mu, nu); #else return site_up_up_up(s, mu, mu, nu); #endif } - template __host__ __device__ inline static T site_2up_dn(const T &s, const int mu, const int nu) { + template __device__ __host__ inline static T site_2up_dn(const T &s, const int mu, const int nu) { #ifdef __GPU_ARCH__ return site_move<2, -1>(s, mu, nu); #else return site_up_up_dn(s, mu, mu, nu); #endif } - template __host__ __device__ inline static T site_up_2dn(const T &s, const int mu, const int nu) { + template __device__ __host__ inline static T site_up_2dn(const T &s, const int mu, const int nu) { #ifdef __GPU_ARCH__ return site_move<1, -2>(s, mu, nu); #else return site_up_dn_dn(s, mu, mu, nu); #endif } - template __host__ __device__ inline static T site_2up(const T &s, const int mu) { + template __device__ __host__ inline static T site_2up(const T &s, const int mu) { #ifdef __GPU_ARCH__ return site_move<2>(s, mu); #else return 
site_up_up(s, mu, mu); #endif } - template __host__ __device__ inline static T site_2dn(const T &s, const int mu) { + template __device__ __host__ inline static T site_2dn(const T &s, const int mu) { #ifdef __GPU_ARCH__ return site_move<-2>(s, mu); #else @@ -1684,7 +1678,7 @@ class GIndexer { //! Unlike the above implementation of site_move, this can be used in a for loop. Presumably it is slower? //! Currently unused but can be commented in if needed: - __host__ __device__ inline static sitexyzt dynamic_move(sitexyzt s, const int mu, int mu_steps) { + __device__ __host__ inline static sitexyzt dynamic_move(sitexyzt s, const int mu, int mu_steps) { int x = s.x; int y = s.y; int z = s.z; @@ -1747,7 +1741,7 @@ class GIndexer { } return sitexyzt(x, y, z, t); } - __attribute__((unused)) __host__ __device__ inline static gSite dynamic_move(const gSite &s, const int mu, int mu_steps) { + __attribute__((unused)) __device__ __host__ inline static gSite dynamic_move(const gSite &s, const int mu, int mu_steps) { sitexyzt tmp = dynamic_move(s.coordFull, mu, mu_steps); return getSiteFull(tmp.x, tmp.y, tmp.z, tmp.t); } diff --git a/src/base/indexer/HaloIndexer.h b/src/base/indexer/HaloIndexer.h index d746e7a0..47b1057c 100644 --- a/src/base/indexer/HaloIndexer.h +++ b/src/base/indexer/HaloIndexer.h @@ -8,7 +8,6 @@ #ifndef HALOINDEXER_H #define HALOINDEXER_H - #include "BulkIndexer.h" #include #include @@ -99,9 +98,9 @@ struct HaloData { size_t h_offsetsHalf[80]; - __host__ __device__ HaloData() {} + __device__ __host__ HaloData() {} - __host__ __device__ HaloData(size_t lx, size_t ly, size_t lz, size_t lt, size_t halo_depth, unsigned int Nodes[4]) { + __device__ __host__ HaloData(size_t lx, size_t ly, size_t lz, size_t lt, size_t halo_depth, unsigned int Nodes[4]) { h_HaloDepth[0] = Nodes[0] != 1 ? 
halo_depth : 0; @@ -208,7 +207,7 @@ struct HaloData { } - __host__ __device__ size_t getBufferSize(Layout LatLayout) { + __device__ __host__ size_t getBufferSize(Layout LatLayout) { if (LatLayout == All)return h_summed_buffer[15]; else return h_summed_bufferHalf[15]; } @@ -218,7 +217,7 @@ struct HaloData { /// This function returns the size of these sub_Halos. /// The argument is the number of the Sub-Halo! - __host__ __device__ inline size_t get_SubHaloSize(const short number, Layout LatLayout) const { + __device__ __host__ inline size_t get_SubHaloSize(const short number, Layout LatLayout) const { size_t EvenFactor = 1; if (LatLayout != All) EvenFactor = 2; @@ -245,7 +244,7 @@ struct HaloData { private: /// The argument is the number of the Halo Type! It returns the size of an All Halo Type! - __host__ __device__ inline size_t get_SubHaloSizeFromType(const short number) const { + __device__ __host__ inline size_t get_SubHaloSizeFromType(const short number) const { if (number == 0) return h_YZTH; if (number == 1) return h_XZTH; if (number == 2) return h_XYTH; @@ -286,7 +285,7 @@ class HaloIndexer { private: - __host__ __device__ inline static size_t _getHaloNumber(size_t index, size_t *LocHalIndex) { + __device__ __host__ inline static size_t _getHaloNumber(size_t index, size_t *LocHalIndex) { if (LatLayout == All) { for (int i = 1; i < 80; i++) { if (getHalData().h_offsets[i] > index) { @@ -314,7 +313,7 @@ class HaloIndexer { return 0; }; - __host__ __device__ inline static size_t _getHaloNumberReduced(size_t index, size_t *LocHalIndex) { + __device__ __host__ inline static size_t _getHaloNumberReduced(size_t index, size_t *LocHalIndex) { if (LatLayout == All) { for (int i = 1; i < 80; i++) { if (getHalDataReduced().h_offsets[i] > index) { @@ -343,11 +342,11 @@ class HaloIndexer { }; public: - __host__ __device__ HaloIndexer(); + __device__ __host__ HaloIndexer(); - __host__ __device__ ~HaloIndexer() {}; + __device__ __host__ ~HaloIndexer() {}; - __host__ 
__device__ inline static HaloData getHalData() { + __device__ __host__ inline static HaloData getHalData() { #if defined(__GPU_ARCH__) return globHalDataGPU[HaloDepth]; #else @@ -355,7 +354,7 @@ class HaloIndexer { #endif } - __host__ __device__ inline static HaloData getHalDataReduced() { + __device__ __host__ inline static HaloData getHalDataReduced() { #if defined(__GPU_ARCH__) return globHalDataGPUReduced[HaloDepth]; #else @@ -363,31 +362,31 @@ class HaloIndexer { #endif } - __host__ __device__ inline static size_t getBufferSize() { + __device__ __host__ inline static size_t getBufferSize() { if (LatLayout == All)return getHalData().h_summed_buffer[15]; else return getHalData().h_summed_bufferHalf[15]; } - __host__ __device__ inline static size_t get_SubHaloOffset(const short number) { + __device__ __host__ inline static size_t get_SubHaloOffset(const short number) { if (LatLayout == All)return getHalData().h_offsets[number]; else return getHalData().h_offsetsHalf[number]; } - __host__ __device__ inline static size_t get_SubHaloSize(const short number) { + __device__ __host__ inline static size_t get_SubHaloSize(const short number) { return getHalData().get_SubHaloSize(number, LatLayout); } - __host__ __device__ inline static size_t get_ReducedSubHaloSize(const short number) { + __device__ __host__ inline static size_t get_ReducedSubHaloSize(const short number) { return getHalDataReduced().get_SubHaloSize(number, LatLayout); } - __host__ __device__ inline static void getCoord_eo(size_t &x, size_t &y, size_t &z, size_t &t, + __device__ __host__ inline static void getCoord_eo(size_t &x, size_t &y, size_t &z, size_t &t, const size_t index, const size_t vol1, const size_t vol2, const size_t vol3, const bool par) { @@ -407,7 +406,7 @@ class HaloIndexer { ++x; } - __host__ __device__ inline static void getCoord(size_t &x, size_t &y, size_t &z, size_t &t, + __device__ __host__ inline static void getCoord(size_t &x, size_t &y, size_t &z, size_t &t, const size_t index, 
const size_t vol1, const size_t vol2, const size_t vol3) { @@ -426,20 +425,20 @@ class HaloIndexer { } - __host__ __device__ inline static void + __device__ __host__ inline static void getHypPlanePos(size_t number, size_t &pos_a, size_t &pos_b) { pos_a = number * 2; pos_b = number * 2 + 1; } - __host__ __device__ inline static void + __device__ __host__ inline static void getPlanePos(size_t number, size_t dir, size_t &pos_a, size_t &pos_b) { number -= 4; pos_a = 8 + number * 4 + dir; pos_b = 8 + number * 4 + dir + (3 - 2 * dir); } - __host__ __device__ inline static void + __device__ __host__ inline static void getStripePos(size_t number, size_t dir, size_t &pos_a, size_t &pos_b) { number -= 10; @@ -447,7 +446,7 @@ class HaloIndexer { pos_b = 32 + number * 8 + dir + (7 - 2 * dir); } - __host__ __device__ inline static void + __device__ __host__ inline static void getCornerPos(size_t number, size_t dir, size_t &pos_a, size_t &pos_b) { number -= 14; @@ -456,7 +455,7 @@ class HaloIndexer { } - __host__ __device__ inline static HaloSegment mapIntToHSeg(int bits) { + __device__ __host__ inline static HaloSegment mapIntToHSeg(int bits) { if (bits == 1) return X; if (bits == 2) return Y; if (bits == 4) return Z; @@ -480,7 +479,7 @@ class HaloIndexer { return X; } - __host__ __device__ inline static HaloSegment getHSeg(sitexyzt coord) { + __device__ __host__ inline static HaloSegment getHSeg(sitexyzt coord) { int bits = 0; @@ -499,7 +498,7 @@ class HaloIndexer { return mapIntToHSeg(bits); } - __host__ __device__ inline static short getlr(sitexyzt coord) { + __device__ __host__ inline static short getlr(sitexyzt coord) { short lr = 0; HaloSegment hseg = getHSeg(coord); @@ -560,15 +559,15 @@ class HaloIndexer { } - __host__ __device__ inline static size_t getOuterHaloSize() { + __device__ __host__ inline static size_t getOuterHaloSize() { return getHalData().getBufferSize(LatLayout); } - __host__ __device__ inline static size_t getInnerHaloSize() { + __device__ __host__ 
inline static size_t getInnerHaloSize() { return getHalDataReduced().getBufferSize(LatLayout); } - __host__ __device__ inline static size_t getCenterSize() { + __device__ __host__ inline static size_t getCenterSize() { return GIndexer::getLatData().vol4 - getInnerHaloSize(); } @@ -590,7 +589,7 @@ class HaloIndexer { /// |______________| /// - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getInnerHaloCoord(size_t HalIndex, size_t &HalNumber, size_t &LocHalIndex) { @@ -685,7 +684,7 @@ class HaloIndexer { /// |______________| /// - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getOuterHaloCoord(size_t HalIndex, size_t &HalNumber, size_t &LocHalIndex) { HalNumber = _getHaloNumber(HalIndex, &LocHalIndex); @@ -782,7 +781,7 @@ class HaloIndexer { /// However if one does that by templating it, the compiler is not smart enough to optimize it away, /// so that this indexer become slower... - __host__ __device__ inline static sitexyzt getInnerCoord(size_t HalIndex) { + __device__ __host__ inline static sitexyzt getInnerCoord(size_t HalIndex) { size_t HalNumber = 0, LocHalIndex = 0; HalNumber = _getHaloNumberReduced(HalIndex, &LocHalIndex); @@ -878,7 +877,7 @@ class HaloIndexer { /// +++++++++++++++++++++++++++ 8 HYPERPLANES +++++++++++++++++++++++++++++++ /// lr = 0,1 - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getInnerHaloCoord_Hyperplane_X(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -892,7 +891,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getInnerHaloCoord_Hyperplane_Y(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -906,7 +905,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getInnerHaloCoord_Hyperplane_Z(size_t LocHalIndex, short lr) 
{ size_t x, y, z, t; @@ -920,7 +919,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getInnerHaloCoord_Hyperplane_T(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -936,7 +935,7 @@ class HaloIndexer { /// +++++++++++++++++++++++++++++ 24 PLANES +++++++++++++++++++++++++++++++++ /// lr = 0-3 - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getInnerHaloCoord_Plane_XY(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(z, t, x, y, LocHalIndex, getHalData().h_LZi, getHalData().h_ZT, getHalData().h_ZTH); @@ -951,7 +950,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getInnerHaloCoord_Plane_XZ(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(y, t, x, z, LocHalIndex, getHalData().h_LYi, getHalData().h_YT, getHalData().h_YTH); @@ -965,7 +964,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getInnerHaloCoord_Plane_XT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(y, z, x, t, LocHalIndex, getHalData().h_LYi, getHalData().h_YZ, getHalData().h_YZH); @@ -979,7 +978,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getInnerHaloCoord_Plane_YZ(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(x, t, y, z, LocHalIndex, getHalData().h_LXi, getHalData().h_XT, getHalData().h_XTH); @@ -993,7 +992,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getInnerHaloCoord_Plane_YT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(x, z, y, t, LocHalIndex, getHalData().h_LXi, getHalData().h_XZ, getHalData().h_XZH); @@ -1007,7 
+1006,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getInnerHaloCoord_Plane_ZT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(x, y, z, t, LocHalIndex, getHalData().h_LXi, getHalData().h_XY, getHalData().h_XYH); @@ -1023,7 +1022,7 @@ class HaloIndexer { /// ++++++++++++++++++++++++++++ 32 STRIPES +++++++++++++++++++++++++++++++++ /// lr = 0-7 - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getInnerHaloCoord_Stripe_XYZ(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(t, x, y, z, LocHalIndex, getHalData().h_LTi, getHalData().h_TH, getHalData().h_THH); @@ -1038,7 +1037,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getInnerHaloCoord_Stripe_XYT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(z, x, y, t, LocHalIndex, getHalData().h_LZi, getHalData().h_ZH, getHalData().h_ZHH); @@ -1053,7 +1052,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getInnerHaloCoord_Stripe_XZT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(y, x, z, t, LocHalIndex, getHalData().h_LYi, getHalData().h_YH, getHalData().h_YHH); @@ -1068,7 +1067,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getInnerHaloCoord_Stripe_YZT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(x, y, z, t, LocHalIndex, getHalData().h_LXi, getHalData().h_XH, getHalData().h_XHH); @@ -1085,7 +1084,7 @@ class HaloIndexer { /// ++++++++++++++++++++++++++++ 16 CORNERS +++++++++++++++++++++++++++++++++ /// lr = 0-15 - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getInnerHaloCoord_Corner(size_t 
LocHalIndex, short lr) { size_t x, y, z, t; @@ -1124,7 +1123,7 @@ class HaloIndexer { /// +++++++++++++++++++++++++++ 8 HYPERPLANES +++++++++++++++++++++++++++++++ /// lr = 0,1 - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getOuterHaloCoord_Hyperplane_X(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -1137,7 +1136,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getOuterHaloCoord_Hyperplane_Y(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -1150,7 +1149,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getOuterHaloCoord_Hyperplane_Z(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -1163,7 +1162,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getOuterHaloCoord_Hyperplane_T(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -1178,7 +1177,7 @@ class HaloIndexer { /// +++++++++++++++++++++++++++++ 24 PLANES +++++++++++++++++++++++++++++++++ /// lr = 0-3 - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getOuterHaloCoord_Plane_XY(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(z, t, x, y, LocHalIndex, getHalData().h_LZi, getHalData().h_ZT, getHalData().h_ZTH); @@ -1191,7 +1190,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getOuterHaloCoord_Plane_XZ(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(y, t, x, z, LocHalIndex, getHalData().h_LYi, getHalData().h_YT, getHalData().h_YTH); @@ -1203,7 +1202,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt 
getOuterHaloCoord_Plane_XT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(y, z, x, t, LocHalIndex, getHalData().h_LYi, getHalData().h_YZ, getHalData().h_YZH); @@ -1215,7 +1214,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getOuterHaloCoord_Plane_YZ(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(x, t, y, z, LocHalIndex, getHalData().h_LXi, getHalData().h_XT, getHalData().h_XTH); @@ -1227,7 +1226,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getOuterHaloCoord_Plane_YT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(x, z, y, t, LocHalIndex, getHalData().h_LXi, getHalData().h_XZ, getHalData().h_XZH); @@ -1239,7 +1238,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getOuterHaloCoord_Plane_ZT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(x, y, z, t, LocHalIndex, getHalData().h_LXi, getHalData().h_XY, getHalData().h_XYH); @@ -1253,7 +1252,7 @@ class HaloIndexer { /// ++++++++++++++++++++++++++++ 32 STRIPES +++++++++++++++++++++++++++++++++ /// lr = 0-7 - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getOuterHaloCoord_Stripe_XYZ(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(t, x, y, z, LocHalIndex, getHalData().h_LTi, getHalData().h_TH, getHalData().h_THH); @@ -1265,7 +1264,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getOuterHaloCoord_Stripe_XYT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(z, x, y, t, LocHalIndex, getHalData().h_LZi, getHalData().h_ZH, getHalData().h_ZHH); @@ -1277,7 +1276,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __host__ 
__device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getOuterHaloCoord_Stripe_XZT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(y, x, z, t, LocHalIndex, getHalData().h_LYi, getHalData().h_YH, getHalData().h_YHH); @@ -1289,7 +1288,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getOuterHaloCoord_Stripe_YZT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(x, y, z, t, LocHalIndex, getHalData().h_LXi, getHalData().h_XH, getHalData().h_XHH); @@ -1303,7 +1302,7 @@ class HaloIndexer { /// ++++++++++++++++++++++++++++ 16 CORNERS +++++++++++++++++++++++++++++++++ /// lr = 0-15 - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getOuterHaloCoord_Corner(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -1340,7 +1339,7 @@ class HaloIndexer { /// +++++++++++++++++++++++++++ 8 HYPERPLANES +++++++++++++++++++++++++++++++ /// lr = 0,1 - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getInnerCoord_Hyperplane_X(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -1354,7 +1353,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getInnerCoord_Hyperplane_Y(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -1368,7 +1367,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getInnerCoord_Hyperplane_Z(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -1382,7 +1381,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getInnerCoord_Hyperplane_T(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -1398,7 +1397,7 @@ class HaloIndexer { /// +++++++++++++++++++++++++++++ 24 PLANES 
+++++++++++++++++++++++++++++++++ /// lr = 0-3 - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getInnerCoord_Plane_XY(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(z, t, x, y, LocHalIndex, getHalDataReduced().h_LZi, getHalDataReduced().h_ZT, @@ -1411,7 +1410,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getInnerCoord_Plane_XZ(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(y, t, x, z, LocHalIndex, getHalDataReduced().h_LYi, getHalDataReduced().h_YT, @@ -1424,7 +1423,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getInnerCoord_Plane_XT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(y, z, x, t, LocHalIndex, getHalDataReduced().h_LYi, getHalDataReduced().h_YZ, @@ -1437,7 +1436,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getInnerCoord_Plane_YZ(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(x, t, y, z, LocHalIndex, getHalDataReduced().h_LXi, getHalDataReduced().h_XT, @@ -1450,7 +1449,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getInnerCoord_Plane_YT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(x, z, y, t, LocHalIndex, getHalDataReduced().h_LXi, getHalDataReduced().h_XZ, @@ -1463,7 +1462,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getInnerCoord_Plane_ZT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(x, y, z, t, LocHalIndex, getHalDataReduced().h_LXi, getHalDataReduced().h_XY, @@ -1478,7 +1477,7 @@ class HaloIndexer { /// ++++++++++++++++++++++++++++ 32 
STRIPES +++++++++++++++++++++++++++++++++ /// lr = 0-7 - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getInnerCoord_Stripe_XYZ(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(t, x, y, z, LocHalIndex, getHalDataReduced().h_LTi, getHalDataReduced().h_TH, @@ -1491,7 +1490,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getInnerCoord_Stripe_XYT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(z, x, y, t, LocHalIndex, getHalDataReduced().h_LZi, getHalDataReduced().h_ZH, @@ -1504,7 +1503,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getInnerCoord_Stripe_XZT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(y, x, z, t, LocHalIndex, getHalDataReduced().h_LYi, getHalDataReduced().h_YH, @@ -1517,7 +1516,7 @@ class HaloIndexer { return sitexyzt(x, y, z, t); } - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getInnerCoord_Stripe_YZT(size_t LocHalIndex, short lr) { size_t x, y, z, t; getCoord(x, y, z, t, LocHalIndex, getHalDataReduced().h_LXi, getHalDataReduced().h_XH, @@ -1532,7 +1531,7 @@ class HaloIndexer { /// ++++++++++++++++++++++++++++ 16 CORNERS +++++++++++++++++++++++++++++++++ /// lr = 0-15 - __host__ __device__ inline static sitexyzt + __device__ __host__ inline static sitexyzt getInnerCoord_Corner(size_t LocHalIndex, short lr) { size_t x, y, z, t; @@ -1566,7 +1565,7 @@ class HaloIndexer { /// - __host__ __device__ inline static sitexyzt getCenterCoord(size_t CenterIndex) { + __device__ __host__ inline static sitexyzt getCenterCoord(size_t CenterIndex) { size_t x = 0, y = 0, z = 0, t = 0; diff --git a/src/base/math/correlators.h b/src/base/math/correlators.h index 08574434..3d854056 100644 --- a/src/base/math/correlators.h +++ b/src/base/math/correlators.h 
@@ -499,7 +499,7 @@ struct SpacetimePairKernelSymm : CorrelatorTools { SpacetimePairKernelSymm(LatticeContainerAccessor field1, LatticeContainerAccessor field2, LatticeContainerAccessor field1Xfield2, size_t dindex) : _field1(field1), _field2(field2), _field1Xfield2(field1Xfield2), _dindex(dindex), CorrelatorTools() {} - __host__ __device__ void operator()(gSite site) { + __device__ __host__ void operator()(gSite site) { typedef GIndexer GInd; size_t m, n; fieldType field1m, field2n; @@ -568,7 +568,7 @@ struct SpacetimePairKernel : CorrelatorTools { SpacetimePairKernel(LatticeContainerAccessor field1, LatticeContainerAccessor field2, LatticeContainerAccessor field1Xfield2, size_t dindex) : _field1(field1), _field2(field2), _field1Xfield2(field1Xfield2), _dindex(dindex), CorrelatorTools() {} - __host__ __device__ void operator()(gSite site) { + __device__ __host__ void operator()(gSite site) { typedef GIndexer GInd; size_t m, n; fieldType field1m, field2n; @@ -670,7 +670,7 @@ struct SpatialPairKernelSymm : CorrelatorTools { SpatialPairKernelSymm(LatticeContainerAccessor field1, LatticeContainerAccessor field2, LatticeContainerAccessor field1Xfield2, size_t dindex) : _field1(field1), _field2(field2), _field1Xfield2(field1Xfield2), _dindex(dindex), CorrelatorTools() {} - __host__ __device__ void operator()(gSite site) { + __device__ __host__ void operator()(gSite site) { typedef GIndexer GInd; size_t m, n; fieldType field1m, field2n; @@ -722,7 +722,7 @@ struct SpatialPairKernel : CorrelatorTools { SpatialPairKernel(LatticeContainerAccessor field1, LatticeContainerAccessor field2, LatticeContainerAccessor field1Xfield2, size_t dindex) : _field1(field1), _field2(field2), _field1Xfield2(field1Xfield2), _dindex(dindex), CorrelatorTools() {} - __host__ __device__ void operator()(gSite site) { + __device__ __host__ void operator()(gSite site) { typedef GIndexer GInd; size_t m, n; fieldType field1m, field2n; @@ -797,7 +797,7 @@ struct RestrictedOffAxisKernel : 
CorrelatorTools { /// direction. (Backward correlations will be counted from the forward correlation of some other m.) A possible /// displacement is (1,0,0); therefore some on-axis correlations are computed already in the off-axis kernel. This /// is taken into account in the on-axis kernel. - __host__ __device__ void operator()(size_t dindex) { /// dindex indexes displacement vector + __device__ __host__ void operator()(size_t dindex) { /// dindex indexes displacement vector typedef GIndexer GInd; size_t m,n1,n2,n3,n4; @@ -855,7 +855,7 @@ struct RestrictedOnAxisKernel : CorrelatorTools { : _field1(field1), _field2(field2), _field1Xfield2off(field1Xfield2off), _field1Xfield2on(field1Xfield2on), CorrelatorTools() {} - __host__ __device__ void operator()(size_t dx){ /// Now dx corresponds to a separation, rather than a displacement + __device__ __host__ void operator()(size_t dx){ /// Now dx corresponds to a separation, rather than a displacement typedef GIndexer GInd; size_t m,n1,n2,n3; diff --git a/src/base/math/gcomplex.h b/src/base/math/gcomplex.h index 53571d34..f6173dac 100644 --- a/src/base/math/gcomplex.h +++ b/src/base/math/gcomplex.h @@ -52,7 +52,6 @@ class GPUcomplex { floatT2 c; #define cREAL c.x #define cIMAG c.y - /** * Default constructor, leave values uninitialized. */ @@ -539,7 +538,7 @@ GPUcomplex::invalid() { } template -__host__ __device__ inline bool +__device__ __host__ inline bool compareGCOMPLEX(GPUcomplex a, GPUcomplex b, floatT tol) { floatT diffRe = abs(real(a) - real(b)); floatT diffIm = abs(imag(a) - imag(b)); diff --git a/src/base/math/grnd.h b/src/base/math/grnd.h index 79d0b69e..adf19c88 100644 --- a/src/base/math/grnd.h +++ b/src/base/math/grnd.h @@ -9,7 +9,6 @@ #include #endif - #include "../../define.h" #include "../gutils.h" #include "../IO/misc.h" @@ -39,14 +38,14 @@ template<> __host__ __device__ inline double minVal(){ return DBL_MIN; } /** * internal functions, should only be called from get_rand! 
*/ -__host__ __device__ inline unsigned taus_step( unsigned &z, int S1, int S2, int S3, unsigned M) +__device__ __host__ inline unsigned taus_step( unsigned &z, int S1, int S2, int S3, unsigned M) { unsigned b=((((z<>S2); return z=((((z &M)< -__host__ __device__ inline floatT get_rand(uint4* state) +__device__ __host__ inline floatT get_rand(uint4* state) { return 2.3283064365386963e-10*( taus_step( state->x, 13, 19, 12, 4294967294ul)^ taus_step( state->y, 2, 25, 4, 4294967288ul)^ @@ -64,7 +63,7 @@ __host__ __device__ inline floatT get_rand(uint4* state) /// A random variable in (0,1]. template -__host__ __device__ inline floatT get_rand_excl0(uint4* state) +__device__ __host__ inline floatT get_rand_excl0(uint4* state) { floatT xR = get_rand(state); return xR + (1.0-xR)*minVal(); diff --git a/src/base/math/gsu2.h b/src/base/math/gsu2.h index 33ff017b..9494fa59 100644 --- a/src/base/math/gsu2.h +++ b/src/base/math/gsu2.h @@ -16,40 +16,40 @@ template class GSU2 { public: - __host__ __device__ GSU2() { }; + __device__ __host__ GSU2() { }; GCOMPLEX(floatT) _e11,_e12; - __host__ __device__ GSU2(GCOMPLEX(floatT) e11, GCOMPLEX(floatT) e12) : _e11(e11), _e12(e12) {} + __device__ __host__ GSU2(GCOMPLEX(floatT) e11, GCOMPLEX(floatT) e12) : _e11(e11), _e12(e12) {} - __host__ __device__ friend GSU2 operator+(const GSU2 &x,const GSU2 &y) { + __device__ __host__ friend GSU2 operator+(const GSU2 &x,const GSU2 &y) { return GSU2 (x._e11+y._e11,x._e12+y._e12); } - __host__ __device__ friend GSU2 operator-(const GSU2 &x,const GSU2 &y) { + __device__ __host__ friend GSU2 operator-(const GSU2 &x,const GSU2 &y) { return GSU2 (x._e11-y._e11,x._e12-y._e12); } - __host__ __device__ friend GSU2 operator*(const GSU2 &x,const GCOMPLEX(floatT) &y) { + __device__ __host__ friend GSU2 operator*(const GSU2 &x,const GCOMPLEX(floatT) &y) { return GSU2 (x._e11*y,x._e12*y); } - __host__ __device__ friend GSU2 operator*(const GCOMPLEX(floatT) &x,const GSU2 &y) { + __device__ __host__ friend GSU2 
operator*(const GCOMPLEX(floatT) &x,const GSU2 &y) { return GSU2 (x*y._e11,x*y._e12); } - __host__ __device__ friend GSU2 operator*(const GSU2 &x,const floatT &y) { + __device__ __host__ friend GSU2 operator*(const GSU2 &x,const floatT &y) { return GSU2 (x._e11*y,x._e12*y); } - __host__ __device__ friend GSU2 operator*(const floatT &x,const GSU2 &y) { + __device__ __host__ friend GSU2 operator*(const floatT &x,const GSU2 &y) { return GSU2 (x*y._e11,x*y._e12); } - __host__ __device__ friend GSU2 operator/(const GSU2 &x,const floatT &y) { + __device__ __host__ friend GSU2 operator/(const GSU2 &x,const floatT &y) { return GSU2 (x._e11/y,x._e12/y); } - __host__ __device__ friend GSU2 operator*(const GSU2 &x,const GSU2 &y) { + __device__ __host__ friend GSU2 operator*(const GSU2 &x,const GSU2 &y) { GCOMPLEX(floatT) tmp1,tmp2; tmp1=y._e12; tmp2=y._e11; @@ -58,48 +58,48 @@ class GSU2 { return GSU2 (tmp1,tmp2); } - __host__ __device__ GSU2 &operator =(const GSU2 &y) { + __device__ __host__ GSU2 &operator =(const GSU2 &y) { _e11=y._e11; _e12=y._e12; return *this; } - __host__ __device__ GSU2 &operator+=(const GSU2 &y) { + __device__ __host__ GSU2 &operator+=(const GSU2 &y) { _e11+=y._e11; _e12+=y._e12; return *this; } - __host__ __device__ GSU2 &operator-=(const GSU2 &y) { + __device__ __host__ GSU2 &operator-=(const GSU2 &y) { _e11-=y._e11; _e12-=y._e12; return *this; } - __host__ __device__ GSU2 &operator*=(const GSU2 &y) { + __device__ __host__ GSU2 &operator*=(const GSU2 &y) { *this=*this*y; return *this; } - __host__ __device__ GSU2 &operator*=(const GCOMPLEX(floatT) &y) { + __device__ __host__ GSU2 &operator*=(const GCOMPLEX(floatT) &y) { _e11*=y; _e12*=y; return *this; } - __host__ __device__ GSU2 &operator*=(const floatT &y) { + __device__ __host__ GSU2 &operator*=(const floatT &y) { *this=*this*y; return *this; } - __host__ __device__ GSU2 &operator/=(const floatT &y) { + __device__ __host__ GSU2 &operator/=(const floatT &y) { *this=*this/y; return *this; } - 
__host__ __device__ floatT tr2() { + __device__ __host__ floatT tr2() { return( real(_e11) ); } - __host__ __device__ GCOMPLEX(floatT) det() { + __device__ __host__ GCOMPLEX(floatT) det() { return( real(_e11) ); } - __host__ __device__ void unitarize() { + __device__ __host__ void unitarize() { floatT res; res = real(_e11)*real(_e11) + imag(_e11)*imag(_e11) + @@ -110,7 +110,7 @@ class GSU2 { _e12=_e12*res; } - __host__ __device__ GSU2 dagger() const { + __device__ __host__ GSU2 dagger() const { GSU2 tmp; tmp._e11 = conj(_e11); @@ -119,7 +119,7 @@ class GSU2 { return tmp; } - __host__ __device__ floatT norm2() const { + __device__ __host__ floatT norm2() const { return (real(_e11)*real(_e11) + real(_e12)*real(_e12) + imag(_e11)*imag(_e11) + imag(_e12)*imag(_e12)); } @@ -133,7 +133,7 @@ class GSU2 { }; template -__host__ __device__ inline GSU2 dagger(const GSU2 &x) { +__device__ __host__ inline GSU2 dagger(const GSU2 &x) { GSU2 tmp; tmp._e11 = conj(x._e11); tmp._e12 = - x._e12; @@ -141,13 +141,13 @@ __host__ __device__ inline GSU2 dagger(const GSU2 &x) { } template -__host__ __device__ inline floatT norm2(const GSU2 &x) { +__device__ __host__ inline floatT norm2(const GSU2 &x) { return ( real(x._e11)*real(x._e11) + real(x._e12)*real(x._e12) + imag(x._e11)*imag(x._e11) + imag(x._e12)*imag(x._e12) ); } template -__host__ __device__ inline GSU2 sub12 (const GSU3 &u, const GSU3 &v) { +__device__ __host__ inline GSU2 sub12 (const GSU3 &u, const GSU3 &v) { GCOMPLEX(floatT) r00,r01,r10,r11; r00 = u.getLink00()*v.getLink00() + u.getLink01()*v.getLink10() + u.getLink02()*v.getLink20(); @@ -159,7 +159,7 @@ __host__ __device__ inline GSU2 sub12 (const GSU3 &u, const GSU } template -__host__ __device__ inline GSU2 sub13(const GSU3 &u, const GSU3 &v) { +__device__ __host__ inline GSU2 sub13(const GSU3 &u, const GSU3 &v) { GCOMPLEX(floatT) r00,r01,r10,r11; r00 = u.getLink00()*v.getLink00() + u.getLink01()*v.getLink10() + u.getLink02()*v.getLink20(); @@ -171,7 +171,7 @@ __host__ 
__device__ inline GSU2 sub13(const GSU3 &u, const GSU3< } template -__host__ __device__ inline GSU2 sub23(const GSU3 &u, const GSU3 &v) { +__device__ __host__ inline GSU2 sub23(const GSU3 &u, const GSU3 &v) { GCOMPLEX(floatT) r00,r01,r10,r11; r00 = u.getLink10()*v.getLink01() + u.getLink11()*v.getLink11() + u.getLink12()*v.getLink21(); @@ -183,7 +183,7 @@ __host__ __device__ inline GSU2 sub23(const GSU3 &u, const GSU3< } template -__host__ __device__ inline GSU3 sub12(const GSU2 &u, +__device__ __host__ inline GSU3 sub12(const GSU2 &u, const GSU3 &v) { return GSU3 (u._e11 *v.getLink00() + u._e12 *v.getLink10(), u._e11 *v.getLink01() + u._e12 *v.getLink11(), @@ -197,7 +197,7 @@ __host__ __device__ inline GSU3 sub12(const GSU2 &u, } template -__host__ __device__ inline GSU3 sub13(const GSU2 &u, const GSU3 &v) { +__device__ __host__ inline GSU3 sub13(const GSU2 &u, const GSU3 &v) { return GSU3 (u._e11 *v.getLink00() + u._e12 *v.getLink20(), u._e11 *v.getLink01() + u._e12 *v.getLink21(), u._e11 *v.getLink02() + u._e12 *v.getLink22(), @@ -210,7 +210,7 @@ __host__ __device__ inline GSU3 sub13(const GSU2 &u, const GSU3< } template -__host__ __device__ inline GSU3 sub23(const GSU2 &u, const GSU3 &v) { +__device__ __host__ inline GSU3 sub23(const GSU2 &u, const GSU3 &v) { return GSU3 (v.getLink00(), v.getLink01(), v.getLink02(), @@ -223,7 +223,7 @@ __host__ __device__ inline GSU3 sub23(const GSU2 &u, const GSU3< } template -__host__ __device__ inline floatT realtrace(const GSU3 &x) { +__device__ __host__ inline floatT realtrace(const GSU3 &x) { return ( real(x.getLink00() + x.getLink11() + x.getLink22()) ); } diff --git a/src/base/math/gsu3.h b/src/base/math/gsu3.h index 7be8fa71..7cc7f071 100644 --- a/src/base/math/gsu3.h +++ b/src/base/math/gsu3.h @@ -8,7 +8,6 @@ #ifndef _gsu3_h_ #define _gsu3_h_ - #include "../../define.h" #include "gcomplex.h" #include "gvect3.h" @@ -30,67 +29,67 @@ template __host__ std::istream &operator>>(std::istream &, GSU3 &); template -__host__ 
__device__ inline GSU3 operator+(const GSU3 &, const GSU3 &); +__device__ __host__ inline GSU3 operator+(const GSU3 &, const GSU3 &); template -__host__ __device__ inline GSU3 operator-(const GSU3 &, const GSU3 &); +__device__ __host__ inline GSU3 operator-(const GSU3 &, const GSU3 &); template -__host__ __device__ inline GSU3 operator*(const GCOMPLEX(floatT) &, const GSU3 &); +__device__ __host__ inline GSU3 operator*(const GCOMPLEX(floatT) &, const GSU3 &); template -__host__ __device__ inline GSU3 operator*(const GSU3 &, const GCOMPLEX(floatT) &); +__device__ __host__ inline GSU3 operator*(const GSU3 &, const GCOMPLEX(floatT) &); template -__host__ __device__ inline GSU3 operator*(const floatT &, const GSU3 &); +__device__ __host__ inline GSU3 operator*(const floatT &, const GSU3 &); template -__host__ __device__ inline GSU3 operator*(const GSU3 &, const floatT &); +__device__ __host__ inline GSU3 operator*(const GSU3 &, const floatT &); template -__host__ __device__ inline GSU3 operator*(const GSU3 &, const GSU3 &); +__device__ __host__ inline GSU3 operator*(const GSU3 &, const GSU3 &); template -__host__ __device__ inline GSU3 operator/(const GSU3 &, const floatT &); +__device__ __host__ inline GSU3 operator/(const GSU3 &, const floatT &); template -__host__ __device__ floatT tr_d(const GSU3 &); +__device__ __host__ floatT tr_d(const GSU3 &); template -__host__ __device__ floatT tr_i(const GSU3 &); +__device__ __host__ floatT tr_i(const GSU3 &); template -__host__ __device__ floatT tr_d(const GSU3 &, const GSU3 &); +__device__ __host__ floatT tr_d(const GSU3 &, const GSU3 &); template -__host__ __device__ GCOMPLEX(floatT) tr_c(const GSU3 &); +__device__ __host__ GCOMPLEX(floatT) tr_c(const GSU3 &); template -__host__ __device__ GCOMPLEX(floatT) tr_c(const GSU3 &, const GSU3 &); +__device__ __host__ GCOMPLEX(floatT) tr_c(const GSU3 &, const GSU3 &); template -__host__ __device__ GSU3 dagger(const GSU3 &); +__device__ __host__ GSU3 dagger(const GSU3 &); template 
-__host__ __device__ GCOMPLEX(floatT) det(const GSU3 &X); +__device__ __host__ GCOMPLEX(floatT) det(const GSU3 &X); template -__host__ __device__ floatT realdet(const GSU3 &X); +__device__ __host__ floatT realdet(const GSU3 &X); template -__host__ __device__ floatT infnorm(const GSU3 &X); +__device__ __host__ floatT infnorm(const GSU3 &X); template -__host__ __device__ GSU3 su3_exp(GSU3); +__device__ __host__ GSU3 su3_exp(GSU3); template -__host__ __device__ gVect3 operator*(const GSU3 &, const gVect3 &); +__device__ __host__ gVect3 operator*(const GSU3 &, const gVect3 &); template -__host__ __device__ GSU3 tensor_prod(const gVect3 &, const gVect3 &); +__device__ __host__ GSU3 tensor_prod(const gVect3 &, const gVect3 &); template -__host__ __device__ inline bool compareGSU3(GSU3 a, GSU3 b, floatT tol=1e-13); +__device__ __host__ inline bool compareGSU3(GSU3 a, GSU3 b, floatT tol=1e-13); template class GSU3 { @@ -133,41 +132,41 @@ class GSU3 { // matrix operations - __host__ __device__ friend GSU3 operator+<>(const GSU3 &, const GSU3 &); + __device__ __host__ friend GSU3 operator+<>(const GSU3 &, const GSU3 &); - __host__ __device__ friend GSU3 operator-<>(const GSU3 &, const GSU3 &); + __device__ __host__ friend GSU3 operator-<>(const GSU3 &, const GSU3 &); - __host__ __device__ friend GSU3 operator*<>(const GCOMPLEX(floatT) &x, const GSU3 &y); + __device__ __host__ friend GSU3 operator*<>(const GCOMPLEX(floatT) &x, const GSU3 &y); - __host__ __device__ friend GSU3 operator*<>(const GSU3 &x, const GCOMPLEX(floatT) &y); + __device__ __host__ friend GSU3 operator*<>(const GSU3 &x, const GCOMPLEX(floatT) &y); - __host__ __device__ friend GSU3 operator*<>(const floatT &x, const GSU3 &y); + __device__ __host__ friend GSU3 operator*<>(const floatT &x, const GSU3 &y); - __host__ __device__ friend GSU3 operator*<>(const GSU3 &x, const floatT &y); + __device__ __host__ friend GSU3 operator*<>(const GSU3 &x, const floatT &y); - __host__ __device__ friend GSU3 
operator*<>(const GSU3 &, const GSU3 &); + __device__ __host__ friend GSU3 operator*<>(const GSU3 &, const GSU3 &); - __host__ __device__ friend GSU3 operator/<>(const GSU3 &x, const floatT &y); + __device__ __host__ friend GSU3 operator/<>(const GSU3 &x, const floatT &y); - __host__ __device__ bool operator==(const GSU3 &); + __device__ __host__ bool operator==(const GSU3 &); - __host__ __device__ GSU3 &operator=(const GSU3 &); + __device__ __host__ GSU3 &operator=(const GSU3 &); - __host__ __device__ GSU3 &operator+=(const GSU3 &); + __device__ __host__ GSU3 &operator+=(const GSU3 &); - __host__ __device__ GSU3 &operator-=(const GSU3 &); + __device__ __host__ GSU3 &operator-=(const GSU3 &); - __host__ __device__ GSU3 &operator*=(const floatT &); + __device__ __host__ GSU3 &operator*=(const floatT &); - __host__ __device__ GSU3 &operator*=(const GCOMPLEX(floatT) &); + __device__ __host__ GSU3 &operator*=(const GCOMPLEX(floatT) &); - __host__ __device__ GSU3 &operator*=(const GSU3 &); + __device__ __host__ GSU3 &operator*=(const GSU3 &); - __host__ __device__ GSU3 &operator/=(const floatT &); + __device__ __host__ GSU3 &operator/=(const floatT &); // cast operations single <-> double precision template - __host__ __device__ inline operator GSU3() const { + __device__ __host__ inline operator GSU3() const { return GSU3(GCOMPLEX(T)(_e00.cREAL, _e00.cIMAG), GCOMPLEX(T)(_e01.cREAL, _e01.cIMAG), GCOMPLEX(T)(_e02.cREAL, _e02.cIMAG), GCOMPLEX(T)(_e10.cREAL, _e10.cIMAG), GCOMPLEX(T)(_e11.cREAL, _e11.cIMAG), @@ -177,18 +176,18 @@ class GSU3 { } - __host__ __device__ friend gVect3 + __device__ __host__ friend gVect3 operator*<>(const GSU3 &, const gVect3 &); // GSU3 * cvect3 multiplication - __host__ __device__ friend GSU3 + __device__ __host__ friend GSU3 tensor_prod<>(const gVect3 &, const gVect3 &); // tensor product of two cvect3 - __host__ __device__ friend bool + __device__ __host__ friend bool compareGSU3<>(GSU3 a, GSU3 b, floatT tol); - __host__ __device__ void 
random(uint4 *state); // set links randomly - __host__ __device__ void gauss(uint4 *state); // set links gauss - __host__ __device__ void su3unitarize(); // project to su3 using first two rows of link - __host__ __device__ void su3reconstruct12() // project to su3 using first two rows of link + __device__ __host__ void random(uint4 *state); // set links randomly + __device__ __host__ void gauss(uint4 *state); // set links gauss + __device__ __host__ void su3unitarize(); // project to su3 using first two rows of link + __device__ __host__ void su3reconstruct12() // project to su3 using first two rows of link { _e20 = GCOMPLEX(floatT)((_e01.cREAL * _e12.cREAL - _e01.cIMAG * _e12.cIMAG - (_e02.cREAL * _e11.cREAL - _e02.cIMAG * _e11.cIMAG)), @@ -206,7 +205,7 @@ class GSU3 { + (_e01.cIMAG * _e10.cREAL + _e01.cREAL * _e10.cIMAG))); } - __host__ __device__ void su3reconstruct12Dagger() // project to su3 using first two rows of link + __device__ __host__ void su3reconstruct12Dagger() // project to su3 using first two rows of link { _e02 = GCOMPLEX(floatT)((_e10.cREAL * _e21.cREAL - _e10.cIMAG * _e21.cIMAG - (_e20.cREAL * _e11.cREAL - _e20.cIMAG * _e11.cIMAG)), @@ -224,7 +223,7 @@ class GSU3 { + (_e10.cIMAG * _e01.cREAL + _e10.cREAL * _e01.cIMAG))); } - __host__ __device__ void u3reconstruct(const GCOMPLEX(floatT) phase) // project to u3 using first two rows of link + __device__ __host__ void u3reconstruct(const GCOMPLEX(floatT) phase) // project to u3 using first two rows of link { _e20 = GCOMPLEX(floatT)((_e01.cREAL * _e12.cREAL - _e01.cIMAG * _e12.cIMAG @@ -254,7 +253,7 @@ class GSU3 { _e22 *= phase; } - __host__ __device__ void u3reconstructDagger(const GCOMPLEX(floatT) phase) // project to u3 using first two rows of link + __device__ __host__ void u3reconstructDagger(const GCOMPLEX(floatT) phase) // project to u3 using first two rows of link { _e02 = GCOMPLEX(floatT)((_e10.cREAL * _e21.cREAL - _e10.cIMAG * _e21.cIMAG @@ -282,7 +281,7 @@ class GSU3 { _e22 *= phase; } - 
__host__ __device__ void reconstruct14(const GCOMPLEX(floatT) det) + __device__ __host__ void reconstruct14(const GCOMPLEX(floatT) det) { floatT amp = pow(abs(det), 1.0/3.0); GCOMPLEX(floatT) phase = det / abs(det); @@ -307,7 +306,7 @@ class GSU3 { _e22 *= phase/amp; } - __host__ __device__ void reconstruct14Dagger(const GCOMPLEX(floatT) det) + __device__ __host__ void reconstruct14Dagger(const GCOMPLEX(floatT) det) { floatT amp = pow(abs(det), 1.0/3.0); @@ -331,19 +330,19 @@ class GSU3 { _e12 *= phase/amp; _e22 *= phase/amp; } - __host__ __device__ void TA(); // traceless anti-hermitian of link - __host__ __device__ friend floatT tr_d<>(const GSU3 &); // real part of trace of link - __host__ __device__ friend floatT tr_i<>(const GSU3 &); // imaginary part of trace of link - __host__ __device__ friend floatT + __device__ __host__ void TA(); // traceless anti-hermitian of link + __device__ __host__ friend floatT tr_d<>(const GSU3 &); // real part of trace of link + __device__ __host__ friend floatT tr_i<>(const GSU3 &); // imaginary part of trace of link + __device__ __host__ friend floatT tr_d<>(const GSU3 &, const GSU3 &); // real part of trace of link*link - __host__ __device__ friend GCOMPLEX(floatT) tr_c<>(const GSU3 &); // trace of link - __host__ __device__ friend GCOMPLEX(floatT) tr_c<>(const GSU3 &, + __device__ __host__ friend GCOMPLEX(floatT) tr_c<>(const GSU3 &); // trace of link + __device__ __host__ friend GCOMPLEX(floatT) tr_c<>(const GSU3 &, const GSU3 &); // trace of link*link - __host__ __device__ friend GSU3 + __device__ __host__ friend GSU3 dagger<>(const GSU3 &); // hermitian conjugate - __host__ __device__ friend GSU3 su3_exp<>(GSU3); // exp( link ) - __host__ __device__ friend GCOMPLEX(floatT) det<>(const GSU3 &); - __host__ __device__ friend floatT infnorm<>(const GSU3 &); + __device__ __host__ friend GSU3 su3_exp<>(GSU3); // exp( link ) + __device__ __host__ friend GCOMPLEX(floatT) det<>(const GSU3 &); + __device__ __host__ friend floatT 
infnorm<>(const GSU3 &); // accessors __host__ __device__ inline GCOMPLEX(floatT) getLink00() const; @@ -521,7 +520,7 @@ __host__ __device__ inline void GSU3::setLink22(GCOMPLEX(floatT) x) { // some constant su3 matrices template -__host__ __device__ inline GSU3 gsu3_one() { +__device__ __host__ inline GSU3 gsu3_one() { return GSU3(1, 0, 0, 0, 1, 0, 0, 0, 1); @@ -529,7 +528,7 @@ __host__ __device__ inline GSU3 gsu3_one() { #if ! defined(USE_HIP_AMD) && ! defined(USE_CPU_ONLY) template <> -__host__ __device__ inline GSU3<__half> gsu3_one() { +__device__ __host__ inline GSU3<__half> gsu3_one() { GPUcomplex<__half> g_one(__float2half(1.0)); GPUcomplex<__half> g_zero(__float2half(0.0)); @@ -540,63 +539,63 @@ __host__ __device__ inline GSU3<__half> gsu3_one() { #endif template -__host__ __device__ inline GSU3 gsu3_zero() { +__device__ __host__ inline GSU3 gsu3_zero() { return GSU3(0, 0, 0, 0, 0, 0, 0, 0, 0); } template -__host__ __device__ inline GSU3 glambda_1() { +__device__ __host__ inline GSU3 glambda_1() { return GSU3(0, 1, 0, 1, 0, 0, 0, 0, 0); } template -__host__ __device__ inline GSU3 glambda_2() { +__device__ __host__ inline GSU3 glambda_2() { return GSU3(0 , -GCOMPLEX(floatT)(0, 1), 0, GCOMPLEX(floatT)(0, 1), 0 , 0, 0 , 0 , 0); } template -__host__ __device__ inline GSU3 glambda_3() { +__device__ __host__ inline GSU3 glambda_3() { return GSU3(1, 0 , 0, 0, -1, 0, 0, 0 , 0); } template -__host__ __device__ inline GSU3 glambda_4() { +__device__ __host__ inline GSU3 glambda_4() { return GSU3(0, 0, 1, 0, 0, 0, 1, 0, 0); } template -__host__ __device__ inline GSU3 glambda_5() { +__device__ __host__ inline GSU3 glambda_5() { return GSU3(0 , 0, -GCOMPLEX(floatT)(0, 1), 0 , 0, 0, GCOMPLEX(floatT)(0, 1), 0, 0); } template -__host__ __device__ inline GSU3 glambda_6() { +__device__ __host__ inline GSU3 glambda_6() { return GSU3(0, 0, 0, 0, 0, 1, 0, 1, 0); } template -__host__ __device__ inline GSU3 glambda_7() { +__device__ __host__ inline GSU3 glambda_7() { return 
GSU3(0, 0 , 0, 0, 0 , -GCOMPLEX(floatT)(0, 1), 0, GCOMPLEX(floatT)(0, 1), 0); } template -__host__ __device__ inline GSU3 glambda_8() { +__device__ __host__ inline GSU3 glambda_8() { return GSU3(1 / sqrt(3), 0 , 0, 0 , 1 / sqrt(3), 0, 0 , 0 , -2 / sqrt(3)); @@ -607,7 +606,7 @@ __host__ __device__ inline GSU3 glambda_8() { // matrix operations template -__host__ __device__ GSU3 operator+(const GSU3 &x, const GSU3 &y) { +__device__ __host__ GSU3 operator+(const GSU3 &x, const GSU3 &y) { return GSU3( x._e00 + y._e00, x._e01 + y._e01, x._e02 + y._e02, x._e10 + y._e10, x._e11 + y._e11, x._e12 + y._e12, @@ -615,7 +614,7 @@ __host__ __device__ GSU3 operator+(const GSU3 &x, const GSU3 -__host__ __device__ GSU3 operator-(const GSU3 &x, const GSU3 &y) { +__device__ __host__ GSU3 operator-(const GSU3 &x, const GSU3 &y) { return GSU3( x._e00 - y._e00, x._e01 - y._e01, x._e02 - y._e02, x._e10 - y._e10, x._e11 - y._e11, x._e12 - y._e12, @@ -624,7 +623,7 @@ __host__ __device__ GSU3 operator-(const GSU3 &x, const GSU3 -__host__ __device__ GSU3 operator*(const GCOMPLEX(floatT) &x, const GSU3 &y) { +__device__ __host__ GSU3 operator*(const GCOMPLEX(floatT) &x, const GSU3 &y) { return GSU3( x * y._e00, x * y._e01, x * y._e02, x * y._e10, x * y._e11, x * y._e12, @@ -632,7 +631,7 @@ __host__ __device__ GSU3 operator*(const GCOMPLEX(floatT) &x, const GSU3 } template -__host__ __device__ GSU3 operator*(const GSU3 &x, const GCOMPLEX(floatT) &y) { +__device__ __host__ GSU3 operator*(const GSU3 &x, const GCOMPLEX(floatT) &y) { return GSU3( x._e00 * y, x._e01 * y, x._e02 * y, x._e10 * y, x._e11 * y, x._e12 * y, @@ -640,7 +639,7 @@ __host__ __device__ GSU3 operator*(const GSU3 &x, const GCOMPLEX } template -__host__ __device__ GSU3 operator*(const floatT &x, const GSU3 &y) { +__device__ __host__ GSU3 operator*(const floatT &x, const GSU3 &y) { return GSU3( x * y._e00, x * y._e01, x * y._e02, x * y._e10, x * y._e11, x * y._e12, @@ -648,7 +647,7 @@ __host__ __device__ GSU3 operator*(const 
floatT &x, const GSU3 & } template -__host__ __device__ GSU3 operator*(const GSU3 &x, const floatT &y) { +__device__ __host__ GSU3 operator*(const GSU3 &x, const floatT &y) { return GSU3( x._e00 * y, x._e01 * y, x._e02 * y, x._e10 * y, x._e11 * y, x._e12 * y, @@ -656,7 +655,7 @@ __host__ __device__ GSU3 operator*(const GSU3 &x, const floatT & } template -__host__ __device__ GSU3 operator/(const GSU3 &x, const floatT &y) { +__device__ __host__ GSU3 operator/(const GSU3 &x, const floatT &y) { return GSU3( x._e00 / y, x._e01 / y, x._e02 / y, x._e10 / y, x._e11 / y, x._e12 / y, @@ -665,7 +664,7 @@ __host__ __device__ GSU3 operator/(const GSU3 &x, const floatT & template -__host__ __device__ GSU3 operator*(const GSU3 &x, const GSU3 &y) { +__device__ __host__ GSU3 operator*(const GSU3 &x, const GSU3 &y) { GCOMPLEX(floatT) tmp00, tmp01, tmp02, tmp10, tmp11, tmp12, tmp20, tmp21, tmp22; @@ -688,7 +687,7 @@ __host__ __device__ GSU3 operator*(const GSU3 &x, const GSU3 -__host__ __device__ gVect3 operator*(const GSU3 &x, const gVect3 &y) { +__device__ __host__ gVect3 operator*(const GSU3 &x, const gVect3 &y) { GCOMPLEX(floatT) tmp0, tmp1, tmp2; tmp0 = x._e00 * y._v0 + x._e01 * y._v1 + x._e02 * y._v2; @@ -700,7 +699,7 @@ __host__ __device__ gVect3 operator*(const GSU3 &x, const gVect3 template -__host__ __device__ inline GSU3 &GSU3::operator=(const GSU3 &y) { +__device__ __host__ inline GSU3 &GSU3::operator=(const GSU3 &y) { _e00 = y._e00; _e01 = y._e01; _e02 = y._e02; @@ -714,7 +713,7 @@ __host__ __device__ inline GSU3 &GSU3::operator=(const GSU3 -__host__ __device__ GSU3 &GSU3::operator+=(const GSU3 &y) { +__device__ __host__ GSU3 &GSU3::operator+=(const GSU3 &y) { _e00 += y._e00; _e01 += y._e01; _e02 += y._e02; @@ -728,7 +727,7 @@ __host__ __device__ GSU3 &GSU3::operator+=(const GSU3 &y } template -__host__ __device__ GSU3 &GSU3::operator-=(const GSU3 &y) { +__device__ __host__ GSU3 &GSU3::operator-=(const GSU3 &y) { _e00 -= y._e00; _e01 -= y._e01; _e02 -= y._e02; @@ -742,13 
+741,13 @@ __host__ __device__ GSU3 &GSU3::operator-=(const GSU3 &y } template -__host__ __device__ GSU3 &GSU3::operator*=(const floatT &y) { +__device__ __host__ GSU3 &GSU3::operator*=(const floatT &y) { *this = *this * y; return *this; } template -__host__ __device__ GSU3 &GSU3::operator*=(const GCOMPLEX(floatT) &y) { +__device__ __host__ GSU3 &GSU3::operator*=(const GCOMPLEX(floatT) &y) { _e00 *= y; _e01 *= y; _e02 *= y; @@ -762,13 +761,13 @@ __host__ __device__ GSU3 &GSU3::operator*=(const GCOMPLEX(floatT } template -__host__ __device__ GSU3 &GSU3::operator*=(const GSU3 &y) { +__device__ __host__ GSU3 &GSU3::operator*=(const GSU3 &y) { *this = *this * y; return *this; } template -__host__ __device__ GSU3 &GSU3::operator/=(const floatT &y) { +__device__ __host__ GSU3 &GSU3::operator/=(const floatT &y) { *this = *this / y; return *this; } @@ -777,7 +776,7 @@ __host__ __device__ GSU3 &GSU3::operator/=(const floatT &y) { /// tolerance for comparison. In that case please look to the compareGSU3 method. In case you are comparing with the /// zero matrix, you should use compareGSU3, as the present method seems not to work for that case. 
template -__host__ __device__ bool GSU3::operator==(const GSU3 &y) { +__device__ __host__ bool GSU3::operator==(const GSU3 &y) { if (_e00 == y._e00 && _e01 == y._e01 && _e02 == y._e02 && @@ -808,7 +807,7 @@ __host__ inline std::istream &operator>>(std::istream &s, GSU3 &x) { template -__host__ __device__ void GSU3::random(uint4 *state) { +__device__ __host__ void GSU3::random(uint4 *state) { GCOMPLEX(floatT) rnd; @@ -837,7 +836,7 @@ __host__ __device__ void GSU3::random(uint4 *state) { template -__host__ __device__ void GSU3::gauss(uint4 *state) { +__device__ __host__ void GSU3::gauss(uint4 *state) { #ifndef USE_CPU_ONLY if constexpr (!std::is_same::value) { #endif @@ -903,7 +902,7 @@ __host__ __device__ void GSU3::gauss(uint4 *state) { // project to su3 using first two rows of link template -__host__ __device__ void GSU3::su3unitarize() { +__device__ __host__ void GSU3::su3unitarize() { #ifndef USE_CPU_ONLY if constexpr (!std::is_same::value) { #endif @@ -1034,7 +1033,7 @@ __host__ __device__ void GSU3::su3unitarize() { } template -__host__ __device__ GCOMPLEX(floatT) det(const GSU3 &x) { +__device__ __host__ GCOMPLEX(floatT) det(const GSU3 &x) { GCOMPLEX(floatT) res; @@ -1046,12 +1045,12 @@ __host__ __device__ GCOMPLEX(floatT) det(const GSU3 &x) { } template -__host__ __device__ floatT realdet(const GSU3 &x) { +__device__ __host__ floatT realdet(const GSU3 &x) { return det(x).cREAL; } template -__host__ __device__ floatT infnorm(const GSU3 &x) { +__device__ __host__ floatT infnorm(const GSU3 &x) { floatT res = x._e00.cREAL * x._e00.cREAL; res = x._e00.cIMAG * x._e00.cIMAG + res; res = x._e01.cREAL * x._e01.cREAL + res; @@ -1084,7 +1083,7 @@ __host__ __device__ floatT infnorm(const GSU3 &x) { // traceless anti-hermitian of link template -__host__ __device__ void GSU3::TA() { +__device__ __host__ void GSU3::TA() { GSU3 tmp; tmp._e00 = GCOMPLEX(floatT)(0, 0.6666666666666666 * _e00.cIMAG - 0.3333333333333333 * (_e11.cIMAG + _e22.cIMAG)); @@ -1102,19 +1101,19 @@ 
__host__ __device__ void GSU3::TA() { // real part of trace of link template -__host__ __device__ floatT tr_d(const GSU3 &x) { +__device__ __host__ floatT tr_d(const GSU3 &x) { return floatT(x._e00.cREAL + x._e11.cREAL + x._e22.cREAL); } // imaginary part of trace of link template -__host__ __device__ floatT tr_i(const GSU3 &x) { +__device__ __host__ floatT tr_i(const GSU3 &x) { return floatT(x._e00.cIMAG + x._e11.cIMAG + x._e22.cIMAG); } // real part of trace of link*link template -__host__ __device__ floatT tr_d(const GSU3 &x, const GSU3 &y) { +__device__ __host__ floatT tr_d(const GSU3 &x, const GSU3 &y) { floatT res; res = (x._e00 * y._e00).cREAL + (x._e01 * y._e10).cREAL + (x._e02 * y._e20).cREAL + (x._e10 * y._e01).cREAL + (x._e11 * y._e11).cREAL + (x._e12 * y._e21).cREAL @@ -1125,13 +1124,13 @@ __host__ __device__ floatT tr_d(const GSU3 &x, const GSU3 &y) { // trace of link template -__host__ __device__ GCOMPLEX(floatT) tr_c(const GSU3 &x) { +__device__ __host__ GCOMPLEX(floatT) tr_c(const GSU3 &x) { return GCOMPLEX(floatT)(x._e00 + x._e11 + x._e22); } // trace of link*link template -__host__ __device__ GCOMPLEX(floatT) tr_c(const GSU3 &x, const GSU3 &y) { +__device__ __host__ GCOMPLEX(floatT) tr_c(const GSU3 &x, const GSU3 &y) { GCOMPLEX(floatT) res; @@ -1144,7 +1143,7 @@ __host__ __device__ GCOMPLEX(floatT) tr_c(const GSU3 &x, const GSU3 -__host__ __device__ GSU3 dagger(const GSU3 &x) { +__device__ __host__ GSU3 dagger(const GSU3 &x) { GSU3 tmp; tmp._e00 = conj(x._e00); @@ -1162,7 +1161,7 @@ __host__ __device__ GSU3 dagger(const GSU3 &x) { // exp( link ) template -__host__ __device__ GSU3 su3_exp(GSU3 u) { +__device__ __host__ GSU3 su3_exp(GSU3 u) { GSU3 res; res = gsu3_one() @@ -1177,7 +1176,7 @@ __host__ __device__ GSU3 su3_exp(GSU3 u) { // tensor product of two cvect3 template -__host__ __device__ GSU3 tensor_prod(const gVect3 &x, const gVect3 &y) { +__device__ __host__ GSU3 tensor_prod(const gVect3 &x, const gVect3 &y) { GSU3 res; res._e00 = x._v0 * 
y._v0; @@ -1194,7 +1193,7 @@ __host__ __device__ GSU3 tensor_prod(const gVect3 &x, const gVec } template -__host__ __device__ inline bool compareGSU3(GSU3 a, GSU3 b, floatT tol) { +__device__ __host__ inline bool compareGSU3(GSU3 a, GSU3 b, floatT tol) { for (int i = 0; i < 3; i++) for (int j = 0; j < 3; j++) { diff --git a/src/base/math/gvect3.h b/src/base/math/gvect3.h index f859c613..20839568 100644 --- a/src/base/math/gvect3.h +++ b/src/base/math/gvect3.h @@ -21,27 +21,27 @@ template class cVect3; template class gVect3array; template __host__ std::ostream & operator<<(std::ostream &, const gVect3 &); template __host__ std::istream & operator>>(std::istream &, gVect3 &); -template __host__ __device__ GCOMPLEX(floatT) operator*(const gVect3 &,const gVect3 &); -template __host__ __device__ GCOMPLEX(floatT) complex_product(const gVect3 &,const gVect3 &); -template __host__ __device__ GCOMPLEX(floatT) complex_product_add(const gVect3 &,const gVect3 &, const GCOMPLEX(floatT) &); - - -template __host__ __device__ gVect3 operator+(const gVect3 &,const gVect3 &); -template __host__ __device__ gVect3 operator-(const gVect3 &,const gVect3 &); -template __host__ __device__ gVect3 operator*(const floatT &,const gVect3 &); -template __host__ __device__ gVect3 operator*(const GCOMPLEX(floatT) &,const gVect3 &); -template __host__ __device__ gVect3 operator*(const gVect3 &,const floatT &); -template __host__ __device__ gVect3 operator*(const gVect3 &,const GCOMPLEX(floatT) &); -template __host__ __device__ gVect3 conj(const gVect3 &); -template __host__ __device__ floatT norm2(const gVect3 &); -template __host__ __device__ GCOMPLEX(floatT) dot_prod(const gVect3 &,const gVect3 &); -template __host__ __device__ floatT re_dot_prod(const gVect3 &,const gVect3 &); -template __host__ __device__ gVect3 operator*(const GSU3 &,const gVect3 &); -template __host__ __device__ GSU3 tensor_prod(const gVect3 &,const gVect3 &); -template __host__ __device__ inline floatT minVal(); +template 
__device__ __host__ GCOMPLEX(floatT) operator*(const gVect3 &,const gVect3 &); +template __device__ __host__ GCOMPLEX(floatT) complex_product(const gVect3 &,const gVect3 &); +template __device__ __host__ GCOMPLEX(floatT) complex_product_add(const gVect3 &,const gVect3 &, const GCOMPLEX(floatT) &); + + +template __device__ __host__ gVect3 operator+(const gVect3 &,const gVect3 &); +template __device__ __host__ gVect3 operator-(const gVect3 &,const gVect3 &); +template __device__ __host__ gVect3 operator*(const floatT &,const gVect3 &); +template __device__ __host__ gVect3 operator*(const GCOMPLEX(floatT) &,const gVect3 &); +template __device__ __host__ gVect3 operator*(const gVect3 &,const floatT &); +template __device__ __host__ gVect3 operator*(const gVect3 &,const GCOMPLEX(floatT) &); +template __device__ __host__ gVect3 conj(const gVect3 &); +template __device__ __host__ floatT norm2(const gVect3 &); +template __device__ __host__ GCOMPLEX(floatT) dot_prod(const gVect3 &,const gVect3 &); +template __device__ __host__ floatT re_dot_prod(const gVect3 &,const gVect3 &); +template __device__ __host__ gVect3 operator*(const GSU3 &,const gVect3 &); +template __device__ __host__ GSU3 tensor_prod(const gVect3 &,const gVect3 &); +template __device__ __host__ inline floatT minVal(); template -__host__ __device__ inline floatT get_rand(uint4* state); +__device__ __host__ inline floatT get_rand(uint4* state); template class gVect3 @@ -67,28 +67,28 @@ class gVect3 // vector operations - __host__ __device__ gVect3 &operator =(const gVect3 &); - __host__ __device__ gVect3 &operator-=(const gVect3 &); - __host__ __device__ gVect3 &operator+=(const gVect3 &); - __host__ __device__ gVect3 &operator*=(const floatT &); - __host__ __device__ gVect3 &operator*=(const GCOMPLEX(floatT) &); - __host__ __device__ friend GCOMPLEX(floatT) operator* <> (const gVect3 &,const gVect3 &); - __host__ __device__ friend GCOMPLEX(floatT) complex_product <> (const gVect3 &,const gVect3 &); - __host__ 
__device__ friend GCOMPLEX(floatT) complex_product_add <> (const gVect3 &,const gVect3 &, const GCOMPLEX(floatT) & ); - __host__ __device__ friend gVect3 operator+ <> (const gVect3 &,const gVect3 &); - __host__ __device__ friend gVect3 operator- <> (const gVect3 &,const gVect3 &); - __host__ __device__ friend gVect3 operator* <> (const floatT &,const gVect3 &); - __host__ __device__ friend gVect3 operator* <> (const GCOMPLEX(floatT) &,const gVect3 &); - __host__ __device__ friend gVect3 operator* <> (const gVect3 &,const floatT &); - __host__ __device__ friend gVect3 operator* <> (const gVect3 &,const GCOMPLEX(floatT) &); - - __host__ __device__ friend gVect3 conj <> (const gVect3 &); // complex conjugate - __host__ __device__ friend floatT norm2 <> (const gVect3 &); // norm2 - __host__ __device__ friend GCOMPLEX(floatT) dot_prod <> (const gVect3&, const gVect3&); // true complex dot product - __host__ __device__ friend floatT re_dot_prod <> (const gVect3 &,const gVect3 &); // real part of dot product + __device__ __host__ gVect3 &operator =(const gVect3 &); + __device__ __host__ gVect3 &operator-=(const gVect3 &); + __device__ __host__ gVect3 &operator+=(const gVect3 &); + __device__ __host__ gVect3 &operator*=(const floatT &); + __device__ __host__ gVect3 &operator*=(const GCOMPLEX(floatT) &); + __device__ __host__ friend GCOMPLEX(floatT) operator* <> (const gVect3 &,const gVect3 &); + __device__ __host__ friend GCOMPLEX(floatT) complex_product <> (const gVect3 &,const gVect3 &); + __device__ __host__ friend GCOMPLEX(floatT) complex_product_add <> (const gVect3 &,const gVect3 &, const GCOMPLEX(floatT) & ); + __device__ __host__ friend gVect3 operator+ <> (const gVect3 &,const gVect3 &); + __device__ __host__ friend gVect3 operator- <> (const gVect3 &,const gVect3 &); + __device__ __host__ friend gVect3 operator* <> (const floatT &,const gVect3 &); + __device__ __host__ friend gVect3 operator* <> (const GCOMPLEX(floatT) &,const gVect3 &); + __device__ __host__ 
friend gVect3 operator* <> (const gVect3 &,const floatT &); + __device__ __host__ friend gVect3 operator* <> (const gVect3 &,const GCOMPLEX(floatT) &); + + __device__ __host__ friend gVect3 conj <> (const gVect3 &); // complex conjugate + __device__ __host__ friend floatT norm2 <> (const gVect3 &); // norm2 + __device__ __host__ friend GCOMPLEX(floatT) dot_prod <> (const gVect3&, const gVect3&); // true complex dot product + __device__ __host__ friend floatT re_dot_prod <> (const gVect3 &,const gVect3 &); // real part of dot product template - __host__ __device__ void random( rndstateT * const); // set gvect3 randomly - __host__ __device__ void gauss( uint4 * state ) + __device__ __host__ void random( rndstateT * const); // set gvect3 randomly + __device__ __host__ void gauss( uint4 * state ) { #if ! defined(USE_HIP_AMD) && ! defined(USE_CPU_ONLY) if constexpr (!std::is_same::value) { @@ -142,58 +142,58 @@ class gVect3 // cast operations single <-> double precision template - __host__ __device__ operator gVect3 () const { + __device__ __host__ operator gVect3 () const { return gVect3( GCOMPLEX(T)(_v0.cREAL, _v0.cIMAG), GCOMPLEX(T)(_v1.cREAL, _v1.cIMAG), GCOMPLEX(T)(_v2.cREAL, _v2.cIMAG) ); } - __host__ __device__ friend gVect3 operator* <> (const GSU3 &,const gVect3 &); // gsu3 * gvect3 multiplication - __host__ __device__ friend GSU3 tensor_prod <> (const gVect3 &,const gVect3 &); // tensor product of two gvect3 + __device__ __host__ friend gVect3 operator* <> (const GSU3 &,const gVect3 &); // gsu3 * gvect3 multiplication + __device__ __host__ friend GSU3 tensor_prod <> (const gVect3 &,const gVect3 &); // tensor product of two gvect3 - __host__ __device__ inline GCOMPLEX(floatT) getElement0() const { + __device__ __host__ inline GCOMPLEX(floatT) getElement0() const { return _v0; }; - __host__ __device__ inline GCOMPLEX(floatT) getElement1()const { + __device__ __host__ inline GCOMPLEX(floatT) getElement1()const { return _v1; }; - __host__ __device__ inline 
GCOMPLEX(floatT) getElement2() const { + __device__ __host__ inline GCOMPLEX(floatT) getElement2() const { return _v2; }; - __host__ __device__ inline void addtoElement0(const GCOMPLEX(floatT) a){ + __device__ __host__ inline void addtoElement0(const GCOMPLEX(floatT) a){ _v0 += a; } - __host__ __device__ inline void addtoElement1(const GCOMPLEX(floatT) a){ + __device__ __host__ inline void addtoElement1(const GCOMPLEX(floatT) a){ _v1 += a; } - __host__ __device__ inline void addtoElement2(const GCOMPLEX(floatT) a){ + __device__ __host__ inline void addtoElement2(const GCOMPLEX(floatT) a){ _v2 += a; } - __host__ __device__ inline void setElement0(const GCOMPLEX(floatT)& a){ + __device__ __host__ inline void setElement0(const GCOMPLEX(floatT)& a){ _v0 = a; } - __host__ __device__ inline void setElement1(const GCOMPLEX(floatT)& a){ + __device__ __host__ inline void setElement1(const GCOMPLEX(floatT)& a){ _v1 = a; } - __host__ __device__ inline void setElement2(const GCOMPLEX(floatT)& a){ + __device__ __host__ inline void setElement2(const GCOMPLEX(floatT)& a){ _v2 = a; } - __host__ __device__ inline void subfromElement0(const GCOMPLEX(floatT) a){ + __device__ __host__ inline void subfromElement0(const GCOMPLEX(floatT) a){ _v0 -= a; } - __host__ __device__ inline void subfromElement1(const GCOMPLEX(floatT) a){ + __device__ __host__ inline void subfromElement1(const GCOMPLEX(floatT) a){ _v1 -= a; } - __host__ __device__ inline void subfromElement2(const GCOMPLEX(floatT) a){ + __device__ __host__ inline void subfromElement2(const GCOMPLEX(floatT) a){ _v2 -= a; } - __host__ __device__ inline GCOMPLEX(floatT)& operator() (int i) { + __device__ __host__ inline GCOMPLEX(floatT)& operator() (int i) { switch (i) { case 0: return _v0; @@ -220,7 +220,7 @@ class gVect3 // gvect3 = (1,0,0) or (0,1,0) or (0,0,1) template -__host__ __device__ inline gVect3 gvect3_unity(const int& i) +__device__ __host__ inline gVect3 gvect3_unity(const int& i) { switch ( i ) { @@ -251,7 +251,7 @@ 
return gVect3<__half> (__float2half(1), __float2half(0), __float2half(0)); #endif // cvect3 = (1,1,1) template -__host__ __device__ inline gVect3 gvect3_one() +__device__ __host__ inline gVect3 gvect3_one() { return gVect3 (1, 1, 1); } @@ -260,7 +260,7 @@ __host__ __device__ inline gVect3 gvect3_one() // cvect3 = (0,0,0) template -__host__ __device__ inline gVect3 gvect3_zero() +__device__ __host__ inline gVect3 gvect3_zero() { return gVect3 (0, 0, 0); } @@ -272,7 +272,7 @@ __device__ inline gVect3<__half> gvect3_zero() } #endif template -__host__ __device__ gVect3 &gVect3::operator=(const gVect3 &y) +__device__ __host__ gVect3 &gVect3::operator=(const gVect3 &y) { _v0 = y._v0; _v1 = y._v1; @@ -281,7 +281,7 @@ __host__ __device__ gVect3 &gVect3::operator=(const gVect3 -__host__ __device__ gVect3 &gVect3::operator-=(const gVect3 &y) +__device__ __host__ gVect3 &gVect3::operator-=(const gVect3 &y) { _v0-= y._v0; _v1-= y._v1; @@ -290,7 +290,7 @@ __host__ __device__ gVect3 &gVect3::operator-=(const gVect3 -__host__ __device__ gVect3 &gVect3::operator+=(const gVect3 &y) +__device__ __host__ gVect3 &gVect3::operator+=(const gVect3 &y) { _v0+= y._v0; _v1+= y._v1; @@ -299,7 +299,7 @@ __host__ __device__ gVect3 &gVect3::operator+=(const gVect3 -__host__ __device__ gVect3 &gVect3::operator*=(const floatT &y) +__device__ __host__ gVect3 &gVect3::operator*=(const floatT &y) { _v0*= y; _v1*= y; @@ -308,7 +308,7 @@ __host__ __device__ gVect3 &gVect3::operator*=(const floatT &y) } template -__host__ __device__ gVect3 &gVect3::operator*=(const GCOMPLEX(floatT) &y) +__device__ __host__ gVect3 &gVect3::operator*=(const GCOMPLEX(floatT) &y) { _v0*= y; _v1*= y; @@ -317,7 +317,7 @@ __host__ __device__ gVect3 &gVect3::operator*=(const GCOMPLEX(fl } template -__host__ __device__ GCOMPLEX(floatT) operator*(const gVect3 &x,const gVect3 &y) +__device__ __host__ GCOMPLEX(floatT) operator*(const gVect3 &x,const gVect3 &y) { GCOMPLEX(floatT) res = conj(x._v0) * y._v0; res += conj(x._v1) * 
y._v1; @@ -326,7 +326,7 @@ __host__ __device__ GCOMPLEX(floatT) operator*(const gVect3 &x,const gVe } template -__host__ __device__ GCOMPLEX(floatT) complex_product(const gVect3 &x,const gVect3 &y) +__device__ __host__ GCOMPLEX(floatT) complex_product(const gVect3 &x,const gVect3 &y) { // GCOMPLEX(floatT) res = x._v0 *(y._v0); // res += x._v1 * (y._v1); @@ -340,7 +340,7 @@ __host__ __device__ GCOMPLEX(floatT) complex_product(const gVect3 &x,con } template -__host__ __device__ GCOMPLEX(floatT) complex_product_add(const gVect3 &x,const gVect3 &y, const GCOMPLEX(floatT) &d) +__device__ __host__ GCOMPLEX(floatT) complex_product_add(const gVect3 &x,const gVect3 &y, const GCOMPLEX(floatT) &d) { //GCOMPLEX(floatT) res = x._v0 *(y._v0); //res += x._v1 * (y._v1); @@ -352,7 +352,7 @@ __host__ __device__ GCOMPLEX(floatT) complex_product_add(const gVect3 &x } template -__host__ __device__ gVect3 operator+(const gVect3 &x,const gVect3 &y) +__device__ __host__ gVect3 operator+(const gVect3 &x,const gVect3 &y) { gVect3 z; z._v0 = x._v0 + y._v0; @@ -362,7 +362,7 @@ __host__ __device__ gVect3 operator+(const gVect3 &x,const gVect } template -__host__ __device__ gVect3 operator-(const gVect3 &x,const gVect3 &y) +__device__ __host__ gVect3 operator-(const gVect3 &x,const gVect3 &y) { gVect3 z; z._v0 = x._v0 - y._v0; @@ -372,7 +372,7 @@ __host__ __device__ gVect3 operator-(const gVect3 &x,const gVect } template -__host__ __device__ gVect3 operator*(const GCOMPLEX(floatT)& x,const gVect3& y) +__device__ __host__ gVect3 operator*(const GCOMPLEX(floatT)& x,const gVect3& y) { gVect3 z; z._v0 = x * y._v0; @@ -382,7 +382,7 @@ __host__ __device__ gVect3 operator*(const GCOMPLEX(floatT)& x,const gVe } template -__host__ __device__ gVect3 operator*(const floatT & x,const gVect3& y) +__device__ __host__ gVect3 operator*(const floatT & x,const gVect3& y) { gVect3 z; z._v0 = x * y._v0; @@ -392,7 +392,7 @@ __host__ __device__ gVect3 operator*(const floatT & x,const gVect3 -__host__ __device__ 
gVect3 operator*(const gVect3& x,const GCOMPLEX(floatT)& y) +__device__ __host__ gVect3 operator*(const gVect3& x,const GCOMPLEX(floatT)& y) { gVect3 z; z._v0 = x._v0 * y; @@ -402,7 +402,7 @@ __host__ __device__ gVect3 operator*(const gVect3& x,const GCOMP } template -__host__ __device__ gVect3 operator*(const gVect3& x,const floatT & y) +__device__ __host__ gVect3 operator*(const gVect3& x,const floatT & y) { gVect3 z; z._v0 = x._v0 * y; @@ -413,7 +413,7 @@ __host__ __device__ gVect3 operator*(const gVect3& x,const float //! complex dot product x*y = sum_i(v_i conj(w_i)) template -__host__ __device__ GCOMPLEX(floatT) dot_prod(const gVect3 &x,const gVect3 &y) +__device__ __host__ GCOMPLEX(floatT) dot_prod(const gVect3 &x,const gVect3 &y) { floatT real = x._v0.cREAL*y._v0.cREAL + x._v0.cIMAG*y._v0.cIMAG; real += x._v1.cREAL*y._v1.cREAL + x._v1.cIMAG*y._v1.cIMAG; @@ -426,7 +426,7 @@ __host__ __device__ GCOMPLEX(floatT) dot_prod(const gVect3 &x,const gVec //! real part of dot product (no conjugation for y) template -__host__ __device__ floatT re_dot_prod(const gVect3 &x,const gVect3 &y) +__device__ __host__ floatT re_dot_prod(const gVect3 &x,const gVect3 &y) { floatT res = x._v0.cREAL*y._v0.cREAL + x._v0.cIMAG*y._v0.cIMAG; res += x._v1.cREAL*y._v1.cREAL + x._v1.cIMAG*y._v1.cIMAG; @@ -436,7 +436,7 @@ __host__ __device__ floatT re_dot_prod(const gVect3 &x,const gVect3 -__host__ __device__ floatT norm2(const gVect3 &x) +__device__ __host__ floatT norm2(const gVect3 &x) { floatT res = x._v0.cREAL*x._v0.cREAL + x._v0.cIMAG*x._v0.cIMAG; res += x._v1.cREAL*x._v1.cREAL + x._v1.cIMAG*x._v1.cIMAG; @@ -446,7 +446,7 @@ __host__ __device__ floatT norm2(const gVect3 &x) // complex conjugate template -__host__ __device__ gVect3 conj(const gVect3 &x) +__device__ __host__ gVect3 conj(const gVect3 &x) { gVect3 z; z._v0 = conj(x._v0); diff --git a/src/base/math/matrix4x4.h b/src/base/math/matrix4x4.h index 0235aa60..0bd54213 100644 --- a/src/base/math/matrix4x4.h +++ 
b/src/base/math/matrix4x4.h @@ -18,14 +18,14 @@ struct Matrix4x4Sym { constexpr Matrix4x4Sym(const Matrix4x4Sym&) = default; - __host__ __device__ Matrix4x4Sym(floatT a) : elems{a, a, a, a, a, a, a, a, a, a} {} - __host__ __device__ Matrix4x4Sym() : elems{0, 0, 0, 0, 0, 0, 0, 0, 0, 0} {} + __device__ __host__ Matrix4x4Sym(floatT a) : elems{a, a, a, a, a, a, a, a, a, a} {} + __device__ __host__ Matrix4x4Sym() : elems{0, 0, 0, 0, 0, 0, 0, 0, 0, 0} {} - __host__ __device__ Matrix4x4Sym(floatT e00, floatT e11, floatT e22, floatT e33, floatT e01, floatT e02, floatT e03, floatT e12, + __device__ __host__ Matrix4x4Sym(floatT e00, floatT e11, floatT e22, floatT e33, floatT e01, floatT e02, floatT e03, floatT e12, floatT e13, floatT e23) : elems{e00, e11, e22, e33, e01, e02, e03, e12, e13, e23} {} - __host__ __device__ inline floatT operator()(int mu, int nu) { + __device__ __host__ inline floatT operator()(int mu, int nu) { if (mu == 0 && nu == 0) return elems[entry::e00]; if (mu == 1 && nu == 1) return elems[entry::e11]; if (mu == 2 && nu == 2) return elems[entry::e22]; @@ -47,7 +47,7 @@ struct Matrix4x4Sym { return 0; } - __host__ __device__ inline void operator()(int mu, int nu, floatT value) { + __device__ __host__ inline void operator()(int mu, int nu, floatT value) { if (mu == 0 && nu == 0) elems[entry::e00] = value; if (mu == 1 && nu == 1) elems[entry::e11] = value; if (mu == 2 && nu == 2) elems[entry::e22] = value; @@ -68,21 +68,21 @@ struct Matrix4x4Sym { if (nu == 2 && mu == 3) elems[entry::e23] = value; } - /* __host__ __device__ inline Matrix4x4Sym& operator=(const floatT &y) + /* __device__ __host__ inline Matrix4x4Sym& operator=(const floatT &y) { for(int i = 0; i<10;i++){ elems[i]=y; } return *this; }*/ - __host__ __device__ inline Matrix4x4Sym& operator=(const Matrix4x4Sym &y) + __device__ __host__ inline Matrix4x4Sym& operator=(const Matrix4x4Sym &y) { for(int i = 0; i<10;i++){ elems[i]=y.elems[i]; } return *this; } - __host__ __device__ inline 
Matrix4x4Sym& operator+=(const Matrix4x4Sym &y) + __device__ __host__ inline Matrix4x4Sym& operator+=(const Matrix4x4Sym &y) { for(int i = 0; i<10;i++){ @@ -91,7 +91,7 @@ struct Matrix4x4Sym { return *this; } - __host__ __device__ inline Matrix4x4Sym& operator/=(floatT y) + __device__ __host__ inline Matrix4x4Sym& operator/=(floatT y) { for(int i = 0; i<10;i++){ elems[i]/=y; @@ -99,7 +99,7 @@ struct Matrix4x4Sym { return *this; } - __host__ __device__ inline Matrix4x4Sym& operator*=(floatT y) + __device__ __host__ inline Matrix4x4Sym& operator*=(floatT y) { for(int i = 0; i<10;i++){ elems[i]*=y; @@ -111,7 +111,7 @@ struct Matrix4x4Sym { template -__host__ __device__ inline Matrix4x4Sym operator+(const Matrix4x4Sym &x, const Matrix4x4Sym &y) { +__device__ __host__ inline Matrix4x4Sym operator+(const Matrix4x4Sym &x, const Matrix4x4Sym &y) { return Matrix4x4Sym(x.elems[0]+ y.elems[0], x.elems[1]+y.elems[1], x.elems[2]+y.elems[2], x.elems[3]+y.elems[3], x.elems[4]+y.elems[4], x.elems[5]+y.elems[5], x.elems[6]+y.elems[6], x.elems[7]+y.elems[7], x.elems[8]+y.elems[8], x.elems[9]+y.elems[9]); diff --git a/src/base/math/operators.h b/src/base/math/operators.h index bc575f53..4933b094 100644 --- a/src/base/math/operators.h +++ b/src/base/math/operators.h @@ -7,7 +7,6 @@ #include "gvect3.h" #include "../indexer/BulkIndexer.h" - /*! 
Using the syntax below stuff like this is possible: * Spinor a, b, c, d * Spinor a = b*c + d; diff --git a/src/base/math/simpleArray.h b/src/base/math/simpleArray.h index a347fa3a..38939763 100644 --- a/src/base/math/simpleArray.h +++ b/src/base/math/simpleArray.h @@ -12,18 +12,18 @@ class SimpleArray{ public: - __host__ __device__ T& operator[](size_t i){ + __device__ __host__ T& operator[](size_t i){ return values[i]; } - __host__ __device__ inline auto operator()(gSiteStack site) const + __device__ __host__ inline auto operator()(gSiteStack site) const { return values[site.stack]; } - __host__ __device__ inline auto operator()(gSiteMu site) const + __device__ __host__ inline auto operator()(gSiteMu site) const { return values[site.mu]; } @@ -31,19 +31,19 @@ class SimpleArray{ SimpleArray() = default; - __host__ __device__ SimpleArray(const T& init){ + __device__ __host__ SimpleArray(const T& init){ for(size_t i = 0; i < N; i++){ values[i] = init; } } template - __host__ __device__ SimpleArray(SimpleArray s_array) { + __device__ __host__ SimpleArray(SimpleArray s_array) { for(size_t i = 0; i < N; i++) { values[i] = s_array[i]; } } - __host__ __device__ void operator=(SimpleArray vec){ + __device__ __host__ void operator=(SimpleArray vec){ for(size_t i = 0; i < N; i++){ values[i] = vec[i]; } @@ -56,7 +56,7 @@ class SimpleArray{ } } - __host__ __device__ SimpleArray getAccessor() const { + __device__ __host__ SimpleArray getAccessor() const { return *this; } diff --git a/src/base/math/su3Exp.h b/src/base/math/su3Exp.h index 6b9874fb..651ea77d 100644 --- a/src/base/math/su3Exp.h +++ b/src/base/math/su3Exp.h @@ -34,7 +34,7 @@ N = 25 by default due to an estimated error of order 10^(-26) */ template -__host__ __device__ constexpr unsigned int countOfApproxInverseFak(){ +__device__ __host__ constexpr unsigned int countOfApproxInverseFak(){ unsigned int N = 1; floatT nominator = 1.0; @@ -48,7 +48,7 @@ __host__ __device__ constexpr unsigned int countOfApproxInverseFak(){ 
// Algorithm from https://luscher.web.cern.ch/luscher/notes/su3fcts.pdf template -__host__ __device__ inline void SU3Exp(const GSU3 inGSU3, GSU3 &outGSU3){ +__device__ __host__ inline void SU3Exp(const GSU3 inGSU3, GSU3 &outGSU3){ constexpr unsigned int N = countOfApproxInverseFak(); floatT c_i[N+1]; diff --git a/src/base/memoryManagement.h b/src/base/memoryManagement.h index 9e4f9293..1c4c564e 100644 --- a/src/base/memoryManagement.h +++ b/src/base/memoryManagement.h @@ -573,7 +573,7 @@ class MemoryAccessor { ~MemoryAccessor() = default; template - __host__ __device__ inline void setValue(const size_t isite, const floatT value) { + __device__ __host__ inline void setValue(const size_t isite, const floatT value) { /// reinterpret_cast is a compile time directive telling the compiler to treat _Array as a floatT*. This is /// needed because _Array is treated as void* right now. auto *arr = reinterpret_cast(Array); @@ -581,7 +581,7 @@ class MemoryAccessor { } template - __host__ __device__ inline void getValue(const size_t isite, floatT &value) { + __device__ __host__ inline void getValue(const size_t isite, floatT &value) { auto *arr = reinterpret_cast(Array); value = arr[isite]; } diff --git a/src/base/runFunctors.h b/src/base/runFunctors.h index 6bdd0395..23edb70b 100644 --- a/src/base/runFunctors.h +++ b/src/base/runFunctors.h @@ -127,6 +127,7 @@ __global__ void performCopyConstObject(Accessor res, Object ob, CalcReadInd calc } #endif + template template void RunFunctors::iterateFunctor(Functor op, CalcReadInd calcReadInd, diff --git a/src/gauge/constructs/gsvd.h b/src/gauge/constructs/gsvd.h index f1070dbf..e5cb2ade 100644 --- a/src/gauge/constructs/gsvd.h +++ b/src/gauge/constructs/gsvd.h @@ -89,7 +89,7 @@ * This routine eliminates off-diagonal element, handling special cases * ************************************************************************/ template -__host__ __device__ inline int svd2x2bidiag(svdfloatT *a00, svdfloatT *a01, svdfloatT *a11, svdfloatT 
U2[2][2], svdfloatT V2[2][2]) +__device__ __host__ inline int svd2x2bidiag(svdfloatT *a00, svdfloatT *a01, svdfloatT *a11, svdfloatT U2[2][2], svdfloatT V2[2][2]) { register svdfloatT sinphi, cosphi, tanphi, cotphi; register svdfloatT a, b, min, max, abs00, abs01, abs11; @@ -289,7 +289,7 @@ __host__ __device__ inline int svd2x2bidiag(svdfloatT *a00, svdfloatT *a01, svdf template -__host__ __device__ GSU3 svd3x3core(const GSU3& AA, floatT* sv){ +__device__ __host__ GSU3 svd3x3core(const GSU3& AA, floatT* sv){ /****************************************** * sigma[3] -- singular values, * diff --git a/src/gauge/gauge_kernels.cpp b/src/gauge/gauge_kernels.cpp index 2e652ce6..17b3cd88 100644 --- a/src/gauge/gauge_kernels.cpp +++ b/src/gauge/gauge_kernels.cpp @@ -5,7 +5,7 @@ struct plaquetteKernel{ plaquetteKernel(Gaugefield &gauge) : gAcc(gauge.getAccessor()){ } - __host__ __device__ floatT operator()(gSite site) { + __device__ __host__ floatT operator()(gSite site) { typedef GIndexer GInd; GSU3 temp; @@ -29,7 +29,7 @@ struct plaquetteKernelSS{ plaquetteKernelSS(Gaugefield &gauge) : gAcc(gauge.getAccessor()){ } - __host__ __device__ floatT operator()(gSite site) { + __device__ __host__ floatT operator()(gSite site) { typedef GIndexer GInd; GSU3 temp; @@ -53,7 +53,7 @@ struct plaquetteKernel_double{ plaquetteKernel_double(Gaugefield &gauge) : gAcc(gauge.getAccessor()){ } - __host__ __device__ double operator()(gSite site) { + __device__ __host__ double operator()(gSite site) { typedef GIndexer GInd; double result = 0; @@ -77,7 +77,7 @@ struct UtauMinusUsigmaKernel{ UtauMinusUsigmaKernel(Gaugefield &gauge) : gAcc(gauge.getAccessor()){ } - __host__ __device__ floatT operator()(gSite site) { + __device__ __host__ floatT operator()(gSite site) { typedef GIndexer GInd; GSU3 temp; @@ -106,7 +106,7 @@ struct cloverKernel{ cloverKernel(Gaugefield &gauge) : gAcc(gauge.getAccessor()), FT(gAcc){ } - __host__ __device__ floatT operator()(gSite site) { + __device__ __host__ floatT 
operator()(gSite site) { GSU3 Fmunu; @@ -130,7 +130,7 @@ struct rectangleKernel{ rectangleKernel(Gaugefield &gauge) : gAcc(gauge.getAccessor()){ } - __host__ __device__ floatT operator()(gSite site) { + __device__ __host__ floatT operator()(gSite site) { typedef GIndexer GInd; GSU3 temp; @@ -164,7 +164,7 @@ struct rectangleKernel_double{ rectangleKernel_double(Gaugefield &gauge) : gAcc(gauge.getAccessor()){ } - __host__ __device__ double operator()(gSite site) { + __device__ __host__ double operator()(gSite site) { typedef GIndexer GInd; GSU3 temp; @@ -204,7 +204,7 @@ struct gaugeActKernel_double{ gaugeActKernel_double(Gaugefield &gauge) : gAcc(gauge.getAccessor()){ } - __host__ __device__ double operator()(gSite site) { + __device__ __host__ double operator()(gSite site) { typedef GIndexer GInd; GSU3 m_0; @@ -216,55 +216,53 @@ struct gaugeActKernel_double{ double result = 0; for (int nu = 1; nu < 4; nu++) { for (int mu = 0; mu < nu; mu++) { - { - m_0 = g1_r * gAcc.template getLink( GInd::getSiteMu ( GInd::site_up(site, mu),nu ) ) * // m1 - dagger ( gAcc.template getLink( GInd::getSiteMu ( GInd::site_up(site , nu),mu ) ) ); // m2 - - // - // m2 - // +----+ - // | | - // m3| | - // V |m1 - // | - // | - // e | - // - - m_0 += g2_r * gAcc.template getLink( GInd::getSiteMu ( GInd::site_up(site , mu),nu ) ) * // m1 - gAcc.template getLink( GInd::getSiteMu ( GInd::site_up_up(site , mu, nu),nu ) ) * // m1 - dagger ( gAcc.template getLink( GInd::getSiteMu ( GInd::site_up(site , nu),nu ) ) * // m3 - gAcc.template getLink( GInd::getSiteMu ( GInd::site_2up(site , nu),mu ) ) // m2 - ); - - // - // m3 - // <---------+ - // | - // |m2 - // e -----+ - // m1 - // - - m_0 += g2_r * gAcc.template getLink( GInd::getSiteMu ( GInd::site_up(site , mu),mu ) ) * // m1 - gAcc.template getLink( GInd::getSiteMu ( GInd::site_2up(site , mu),nu ) ) * // m2 - dagger ( gAcc.template getLink( GInd::getSiteMu ( GInd::site_up(site , nu),mu ) ) * // m3 - gAcc.template getLink( GInd::getSiteMu ( 
GInd::site_up_up(site , mu, nu),mu ) ) // m3 - ); - - // - // | - // m1| - // | - // e----> - // m2 - // - - m_3 = dagger ( gAcc.template getLink( GInd::getSiteMu ( site,nu ) ) ) * // m1 - gAcc.template getLink( GInd::getSiteMu ( site,mu ) ); // m2 - - result += tr_d ( m_3, m_0 ); - } + m_0 = g1_r * gAcc.template getLink( GInd::getSiteMu ( GInd::site_up(site, mu),nu ) ) * // m1 + dagger ( gAcc.template getLink( GInd::getSiteMu ( GInd::site_up(site , nu),mu ) ) ); // m2 + + // + // m2 + // +----+ + // | | + // m3| | + // V |m1 + // | + // | + // e | + // + + m_0 += g2_r * gAcc.template getLink( GInd::getSiteMu ( GInd::site_up(site , mu),nu ) ) * // m1 + gAcc.template getLink( GInd::getSiteMu ( GInd::site_up_up(site , mu, nu),nu ) ) * // m1 + dagger ( gAcc.template getLink( GInd::getSiteMu ( GInd::site_up(site , nu),nu ) ) * // m3 + gAcc.template getLink( GInd::getSiteMu ( GInd::site_2up(site , nu),mu ) ) // m2 + ); + + // + // m3 + // <---------+ + // | + // |m2 + // e -----+ + // m1 + // + + m_0 += g2_r * gAcc.template getLink( GInd::getSiteMu ( GInd::site_up(site , mu),mu ) ) * // m1 + gAcc.template getLink( GInd::getSiteMu ( GInd::site_2up(site , mu),nu ) ) * // m2 + dagger ( gAcc.template getLink( GInd::getSiteMu ( GInd::site_up(site , nu),mu ) ) * // m3 + gAcc.template getLink( GInd::getSiteMu ( GInd::site_up_up(site , mu, nu),mu ) ) // m3 + ); + + // + // | + // m1| + // | + // e----> + // m2 + // + + m_3 = dagger ( gAcc.template getLink( GInd::getSiteMu ( site,nu ) ) ) * // m1 + gAcc.template getLink( GInd::getSiteMu ( site,mu ) ); // m2 + + result += tr_d ( m_3, m_0 ); } } return result; diff --git a/src/gauge/gaugefield.h b/src/gauge/gaugefield.h index 09ae11e8..46da4e79 100644 --- a/src/gauge/gaugefield.h +++ b/src/gauge/gaugefield.h @@ -139,7 +139,7 @@ struct convert_prec { convert_prec(Gaugefield &gaugeIn) : gAcc_source(gaugeIn.getAccessor()) {} - __host__ __device__ GSU3 operator()(gSiteMu site) { + __device__ __host__ GSU3 operator()(gSiteMu site) { 
return gAcc_source.template getLink(site); } }; diff --git a/src/gauge/gaugefield_device.cpp b/src/gauge/gaugefield_device.cpp index 0a98eee5..be982400 100644 --- a/src/gauge/gaugefield_device.cpp +++ b/src/gauge/gaugefield_device.cpp @@ -19,7 +19,7 @@ struct fill_with_rand __host__ __device__ void initialize(__attribute__((unused)) gSite site){ } - __host__ __device__ GSU3 operator()(gSite site, __attribute__((unused)) size_t mu){ + __device__ __host__ GSU3 operator()(gSite site, __attribute__((unused)) size_t mu){ my_mat.random(&_rand_state[site.isite]); return my_mat; } @@ -35,7 +35,7 @@ struct fill_with_gauss { __host__ __device__ void initialize(__attribute__((unused)) gSite site) { } - __host__ __device__ GSU3 operator()(gSite site, __attribute__((unused)) size_t mu) { + __device__ __host__ GSU3 operator()(gSite site, __attribute__((unused)) size_t mu) { my_mat.gauss(&_rand_state[site.isite]); return my_mat; } @@ -47,7 +47,7 @@ struct UnitKernel{ gaugeAccessor gaugeAcc; explicit UnitKernel(Gaugefield& gauge) : gaugeAcc(gauge.getAccessor()){} - __host__ __device__ GSU3 operator()(gSiteMu siteMu){ + __device__ __host__ GSU3 operator()(gSiteMu siteMu){ typedef GIndexer GInd; GSU3 temp; temp=gaugeAcc.template getLink(siteMu); diff --git a/src/modules/observables/FieldStrengthTensor.h b/src/modules/observables/FieldStrengthTensor.h index 9ae59f84..fef56346 100644 --- a/src/modules/observables/FieldStrengthTensor.h +++ b/src/modules/observables/FieldStrengthTensor.h @@ -26,7 +26,7 @@ struct plaqClover { plaqClover(gaugeAccessor acc) : acc(acc) {} - __host__ __device__ inline GSU3 operator()(gSite site, int mu, int nu) { + __device__ __host__ inline GSU3 operator()(gSite site, int mu, int nu) { return Plaq_P(acc, site, mu, nu) + Plaq_Q(acc, site, mu, nu) @@ -43,7 +43,7 @@ struct rectClover { rectClover(gaugeAccessor acc) : acc(acc) {} - __host__ __device__ inline GSU3 operator()(gSite site, int mu, int nu) { + __device__ __host__ inline GSU3 operator()(gSite site, 
int mu, int nu) { gSite origin = site; gSite up = GInd::site_up(site, nu); gSite twoUp = GInd::site_up(up, nu); @@ -154,7 +154,7 @@ struct FieldStrengthTensor { FieldStrengthTensor(gaugeAccessor acc) : acc(acc), plClov(acc) {} - __host__ __device__ inline GSU3 operator()(gSite site, int mu, int nu) { + __device__ __host__ inline GSU3 operator()(gSite site, int mu, int nu) { //define a unitary matrix for the addition in the end GSU3 unityGSU3 = gsu3_one(); @@ -186,7 +186,7 @@ struct FieldStrengthTensor_imp { FieldStrengthTensor_imp(gaugeAccessor acc) : acc(acc), plClov(acc), rcClov(acc) {} - __host__ __device__ inline GSU3 operator()(gSite site, int mu, int nu) { + __device__ __host__ inline GSU3 operator()(gSite site, int mu, int nu) { //define a unitary matrix for the addition in the end GSU3 unityGSU3 = gsu3_one(); diff --git a/src/spinor/spinorfield.h b/src/spinor/spinorfield.h index 4d32ea90..cac145c2 100644 --- a/src/spinor/spinorfield.h +++ b/src/spinor/spinorfield.h @@ -450,7 +450,7 @@ struct convert_spinor_precision { convert_spinor_precision(Spinorfield &spinorIn) : spinor_source(spinorIn.getAccessor()) {} - __host__ __device__ auto operator()(gSiteStack site) { + __device__ __host__ auto operator()(gSiteStack site) { return spinor_source.template getElement(site); }