Modifications with tile and unroll for GridMini #5

Open · wants to merge 8 commits into base: clang-omp
7 changes: 5 additions & 2 deletions Grid/threads/Pragmas.h
@@ -36,6 +36,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define strong_inline __attribute__((always_inline)) inline
#define UNROLL _Pragma("unroll")

#define OMP_UROLL_FACT 4
//////////////////////////////////////////////////////////////////////////////////
// New primitives; explicit host thread calls, and accelerator data parallel calls
//////////////////////////////////////////////////////////////////////////////////
@@ -132,13 +133,15 @@ extern uint32_t gpu_threads;
#define accelerator_for(iterator,num,nsimd, ... ) \
{ \
uint32_t nteams=(num+gpu_threads-1)/gpu_threads; \
_Pragma("omp target teams distribute parallel for num_teams(nteams) thread_limit(gpu_threads)") \
uint32_t unroll_factor = OMP_UROLL_FACT; \
_Pragma("omp target teams distribute parallel for num_teams(nteams) thread_limit(gpu_threads) unroll partial(unroll_factor)") \
naked_for(iterator, num, { __VA_ARGS__ }); \
}
#define accelerator_forNB(iterator,num,nsimd, ... ) \
{ \
uint32_t nteams=(num+gpu_threads-1)/gpu_threads; \
_Pragma("omp target teams distribute parallel for num_teams(nteams) thread_limit(gpu_threads)") \
uint32_t unroll_factor = OMP_UROLL_FACT; \
_Pragma("omp target teams distribute parallel for num_teams(nteams) thread_limit(gpu_threads) unroll partial(unroll_factor)") \
naked_for(iterator, num, { __VA_ARGS__ }); \
}

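A note on the unroll clause: in the OpenMP 5.1 specification the unroll transformation is a separate directive whose partial() argument is a constant expression, so whether the combined "parallel for ... unroll partial(unroll_factor)" spelling above is accepted depends on the clang-omp branch being built against. A minimal standalone sketch of the same idea in the plain 5.1 form, with illustrative names (mul_arrays, n, x, y, z) that are not part of this PR and a literal factor mirroring OMP_UROLL_FACT:

// Sketch only: OpenMP 5.1 unroll directive applied to an offloaded loop.
// Compiler support for composing the two directives varies.
#include <cstdint>

void mul_arrays(int64_t n, const double *x, const double *y, double *z)
{
  #pragma omp target teams distribute parallel for map(to: x[0:n], y[0:n]) map(from: z[0:n])
  #pragma omp unroll partial(4)   // partial() takes a constant; 4 mirrors OMP_UROLL_FACT
  for (int64_t s = 0; s < n; ++s)
    z[s] = x[s] * y[s];
}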
3 changes: 2 additions & 1 deletion Makefile
@@ -11,9 +11,10 @@ MAIN=Benchmark_su3
#CXX=pgc++
#CXXFLAGS=-fast --c++14 -acc -Mnollvm -Minfo=accel -ta=tesla:cc70,managed -Mlarge_arrays --no_exceptions

## Add flags to disable compiler-automated loop unrolling
##Clang
CXX=clang++
CXXFLAGS=-std=c++14 -g -fopenmp -fopenmp-cuda-mode -O3 -fopenmp-targets=nvptx64-nvidia-cuda -lcudart
CXXFLAGS=-std=c++14 -g -fopenmp -fopenmp-cuda-mode -O3 -fopenmp-targets=nvptx64-nvidia-cuda -lcudart -fno-exceptions -march=native -fopenmp-version=51 -fno-unroll-loops -fno-vectorize -llvm_info -Rpass=loop-unroll
CXXFLAGS += -DOMPTARGET
CXXFLAGS +=-DOMPTARGET_MANAGED
#CXXFLAGS += -DVECTOR_LOOPS
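The new flags turn off clang's own loop unrolling and vectorization (-fno-unroll-loops, -fno-vectorize) so that the explicit tile/unroll variants in the benchmark are what actually gets measured, while -Rpass=loop-unroll prints remarks for any unrolling that still happens. An optional follow-up, sketched below and not part of this PR, can also surface loops where unrolling was considered but not performed:

## Optional (sketch, not in this PR): remark flags to confirm that
## -fno-unroll-loops is being respected for the benchmark loops.
#CXXFLAGS += -Rpass-missed=loop-unroll -Rpass-analysis=loop-unroll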
38 changes: 36 additions & 2 deletions benchmarks/Benchmark_su3.cc
@@ -31,6 +31,15 @@ Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
using namespace std;
using namespace Grid;


#define TILE_SZ 4

#define UNROLL_FACTOR 2

#define TILE
#define UNROLL
//#define OMP_TILE
//#define OMP_UNROLL
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
@@ -132,8 +141,33 @@ int main (int argc, char ** argv)

for(int64_t i=0;i<Nloop;i++){
#pragma omp target teams distribute parallel for
for(int64_t s=0;s<vol;s++){
zv[s]=xv[s]*yv[s];


#ifdef UNROLL
for(int64_t s=0;s<vol;s+=UNROLL_FACTOR) {
zv[s]=xv[s]*yv[s];
zv[s+1]=xv[s+1]*yv[s+1];
}
#endif
#ifdef OMP_UNROLL
#pragma omp unroll partial(UNROLL_FACTOR)
for(int64_t s=0;s<vol;s++)
zv[s]=xv[s]*yv[s];
#endif


#ifdef TILE
for(int64_t s=0;s<vol;s+=TILE_SZ) {
for (int64_t t = s; t< min(s+TILE_SZ, vol); t++)
zv[t]=xv[t]*yv[t];
}
#endif

#ifdef OMP_TILE
#pragma omp tile sizes(TILE_SZ)
for(int64_t s=0;s<vol;s++)
zv[s]=xv[s]*yv[s];
#endif
}
}

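One detail worth flagging in the UNROLL variant above: the loop steps by UNROLL_FACTOR but touches both s and s+1, so if vol is not a multiple of UNROLL_FACTOR the last iteration reads and writes one element past the end of the arrays. A minimal sketch of the same manual unroll with an explicit remainder loop; the function name and the double element type are illustrative only, and host-accessible managed memory is assumed as in the -DOMPTARGET_MANAGED build:

// Sketch only (not part of the PR): manual unroll-by-two with a tail loop.
#include <cstdint>

void mul_unroll2(int64_t vol, const double *xv, const double *yv, double *zv)
{
  // Main body: two elements per iteration; the bound vol-1 keeps s+1 in range.
  #pragma omp target teams distribute parallel for
  for (int64_t s = 0; s < vol - 1; s += 2) {
    zv[s]   = xv[s]   * yv[s];
    zv[s+1] = xv[s+1] * yv[s+1];
  }
  // Remainder: at most one element, handled on the host under managed memory.
  for (int64_t s = vol - (vol % 2); s < vol; ++s)
    zv[s] = xv[s] * yv[s];
}

The TILE variant does not have this issue, since it already clamps the inner bound with min(s+TILE_SZ, vol).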