Modifications with tile and unroll for GridMini #5

Open · wants to merge 8 commits into base: clang-omp
7 changes: 5 additions & 2 deletions Grid/threads/Pragmas.h
@@ -36,6 +36,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define strong_inline __attribute__((always_inline)) inline
#define UNROLL _Pragma("unroll")

#define OMP_UROLL_FACT 4
//////////////////////////////////////////////////////////////////////////////////
// New primitives; explicit host thread calls, and accelerator data parallel calls
//////////////////////////////////////////////////////////////////////////////////
@@ -132,13 +133,15 @@ extern uint32_t gpu_threads;
#define accelerator_for(iterator,num,nsimd, ... ) \
{ \
uint32_t nteams=(num+gpu_threads-1)/gpu_threads; \
_Pragma("omp target teams distribute parallel for num_teams(nteams) thread_limit(gpu_threads)") \
uint32_t unroll_factor = OMP_UROLL_FACT; \
_Pragma("omp target teams distribute parallel for num_teams(nteams) thread_limit(gpu_threads) unroll partial(unroll_factor)") \
naked_for(iterator, num, { __VA_ARGS__ }); \
}
#define accelerator_forNB(iterator,num,nsimd, ... ) \
{ \
uint32_t nteams=(num+gpu_threads-1)/gpu_threads; \
_Pragma("omp target teams distribute parallel for num_teams(nteams) thread_limit(gpu_threads)") \
uint32_t unroll_factor = OMP_UROLL_FACT; \
_Pragma("omp target teams distribute parallel for num_teams(nteams) thread_limit(gpu_threads) unroll partial(unroll_factor)") \
naked_for(iterator, num, { __VA_ARGS__ }); \
}

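A note on the unroll clause: in the OpenMP 5.1 specification the unroll transformation is a separate directive whose partial() argument is a constant expression, so whether the combined "parallel for ... unroll partial(unroll_factor)" spelling above is accepted depends on the clang-omp branch being built against. A minimal standalone sketch of the same idea in the plain 5.1 form, with illustrative names (mul_arrays, n, x, y, z) that are not part of this PR and a literal factor mirroring OMP_UROLL_FACT:

// Sketch only: OpenMP 5.1 unroll directive applied to an offloaded loop.
// Compiler support for composing the two directives varies.
#include <cstdint>

void mul_arrays(int64_t n, const double *x, const double *y, double *z)
{
  #pragma omp target teams distribute parallel for map(to: x[0:n], y[0:n]) map(from: z[0:n])
  #pragma omp unroll partial(4)   // partial() takes a constant; 4 mirrors OMP_UROLL_FACT
  for (int64_t s = 0; s < n; ++s)
    z[s] = x[s] * y[s];
}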
3 changes: 2 additions & 1 deletion Makefile
@@ -11,9 +11,10 @@ MAIN=Benchmark_su3
#CXX=pgc++
#CXXFLAGS=-fast --c++14 -acc -Mnollvm -Minfo=accel -ta=tesla:cc70,managed -Mlarge_arrays --no_exceptions

## Add flags to disable compiler-automated loop unrolling
##Clang
CXX=clang++
CXXFLAGS=-std=c++14 -g -fopenmp -fopenmp-cuda-mode -O3 -fopenmp-targets=nvptx64-nvidia-cuda -lcudart
CXXFLAGS=-std=c++14 -g -fopenmp -fopenmp-cuda-mode -O3 -fopenmp-targets=nvptx64-nvidia-cuda -lcudart -fno-exceptions -march=native -fopenmp-version=51 -fno-unroll-loops -fno-vectorize -llvm_info -Rpass=loop-unroll
CXXFLAGS += -DOMPTARGET
CXXFLAGS +=-DOMPTARGET_MANAGED
#CXXFLAGS += -DVECTOR_LOOPS
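The new flags turn off clang's own loop unrolling and vectorization (-fno-unroll-loops, -fno-vectorize) so that the explicit tile/unroll variants in the benchmark are what actually gets measured, while -Rpass=loop-unroll prints remarks for any unrolling that still happens. An optional follow-up, sketched below and not part of this PR, can also surface loops where unrolling was considered but not performed:

## Optional (sketch, not in this PR): remark flags to confirm that
## -fno-unroll-loops is being respected for the benchmark loops.
#CXXFLAGS += -Rpass-missed=loop-unroll -Rpass-analysis=loop-unroll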
38 changes: 36 additions & 2 deletions benchmarks/Benchmark_su3.cc
@@ -31,6 +31,15 @@ Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
using namespace std;
using namespace Grid;


#define TILE_SZ 4

#define UNROLL_FACTOR 2

#define TILE
#define UNROLL
//#define OMP_TILE
//#define OMP_UNROLL
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
@@ -132,8 +141,33 @@ int main (int argc, char ** argv)

for(int64_t i=0;i<Nloop;i++){
#pragma omp target teams distribute parallel for
for(int64_t s=0;s<vol;s++){
zv[s]=xv[s]*yv[s];


#ifdef UNROLL
for(int64_t s=0;s<vol;s+=UNROLL_FACTOR) {
zv[s]=xv[s]*yv[s];
zv[s+1]=xv[s+1]*yv[s+1];
}
#endif
#ifdef OMP_UNROLL
#pragma omp unroll partial(UNROLL_FACTOR)
for(int64_t s=0;s<vol;s++)
zv[s]=xv[s]*yv[s];
#endif


#ifdef TILE
for(int64_t s=0;s<vol;s+=TILE_SZ) {
for (int64_t t = s; t< min(s+TILE_SZ, vol); t++)
zv[t]=xv[t]*yv[t];
}
#endif

#ifdef OMP_TILE
#pragma omp tile sizes(TILE_SZ)
for(int64_t s=0;s<vol;s++)
zv[s]=xv[s]*yv[s];
#endif
}
}

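One detail worth flagging in the UNROLL variant above: the loop steps by UNROLL_FACTOR but touches both s and s+1, so if vol is not a multiple of UNROLL_FACTOR the last iteration reads and writes one element past the end of the arrays. A minimal sketch of the same manual unroll with an explicit remainder loop; the function name and the double element type are illustrative only, and host-accessible managed memory is assumed as in the -DOMPTARGET_MANAGED build:

// Sketch only (not part of the PR): manual unroll-by-two with a tail loop.
#include <cstdint>

void mul_unroll2(int64_t vol, const double *xv, const double *yv, double *zv)
{
  // Main body: two elements per iteration; the bound vol-1 keeps s+1 in range.
  #pragma omp target teams distribute parallel for
  for (int64_t s = 0; s < vol - 1; s += 2) {
    zv[s]   = xv[s]   * yv[s];
    zv[s+1] = xv[s+1] * yv[s+1];
  }
  // Remainder: at most one element, handled on the host under managed memory.
  for (int64_t s = vol - (vol % 2); s < vol; ++s)
    zv[s] = xv[s] * yv[s];
}

The TILE variant does not have this issue, since it already clamps the inner bound with min(s+TILE_SZ, vol).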