diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..0bdb553c --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "wmma_extension"] + path = wmma_extension + url = https://github.com/wmmae/wmma_extension diff --git a/Makefile b/Makefile index 3e44cdc1..3b41fa03 100755 --- a/Makefile +++ b/Makefile @@ -27,7 +27,11 @@ OVERLAP = ON ifeq ($(DEVICE), $(filter $(DEVICE),GPU CUDA)) -TARGETS_SUPPORTED := $(shell ./test_cuda.sh nvcc "$(GPU_INCLUDE_PATH)" "$(GPU_LIBRARY_PATH)" "$(TARGETS)" "$(DEVICE)") +MIN_COMPUTE:=50 +ifeq ($(TENSOR), ON) +MIN_COMPUTE:=80 +endif +TARGETS_SUPPORTED := $(shell ./test_cuda.sh nvcc "$(GPU_INCLUDE_PATH)" "$(GPU_LIBRARY_PATH)" "$(TARGETS)" "$(MIN_COMPUTE)") # if user specifies DEVICE=GPU the test result determines wether CUDA will be used or not ifeq ($(TARGETS_SUPPORTED),) ifeq ($(DEVICE),CUDA) diff --git a/Makefile.Cuda b/Makefile.Cuda index 472816d4..d6c8901d 100644 --- a/Makefile.Cuda +++ b/Makefile.Cuda @@ -16,6 +16,13 @@ CPP = g++ UNAME := $(shell uname) TARGETS = 52 60 61 70 80 + +ifeq ($(TENSOR), ON) + NVTENSOR=-DUSE_NVTENSOR + NVTENSOR+=-I./wmma_extension/include + TARGETS = 80 +endif + CUDA_TARGETS=$(foreach target,$(TARGETS),-gencode arch=compute_$(target),code=sm_$(target)) ifeq ($(DEVICE), CPU) @@ -54,7 +61,6 @@ ifeq ($(OVERLAP), ON) PIPELINE=-DUSE_PIPELINE -fopenmp endif - BIN := $(wildcard $(TARGET)*) # ------------------------------------------------------ @@ -63,6 +69,9 @@ BIN := $(wildcard $(TARGET)*) NUMWI= ifeq ($(NUMWI), 32) +ifeq ($(TENSOR), ON) +$(error NUMWI needs to be at least 64 with TENSOR=ON) +endif NWI=-DN32WI TARGET:=$(TARGET)_32wi else ifeq ($(NUMWI), 64) @@ -177,7 +186,7 @@ TOOL_CFLAGS+=-DVERSION=\"$(GIT_VERSION)\" # ------------------------------------------------------ kernels: $(KERNEL_SRC) - $(NVCC) $(NWI) $(REP) $(CUDA_FLAGS) $(IFLAGS) $(CUDA_INCLUDES) -c $(KRNL_DIR)/kernels.cu + $(NVCC) $(NWI) $(REP) $(CUDA_FLAGS) $(IFLAGS) $(CUDA_INCLUDES) $(NVTENSOR) -c $(KRNL_DIR)/kernels.cu otool: @echo "Building" $(TOOL_TARGET) "..." @@ -185,7 +194,7 @@ otool: $(shell ls $(HOST_SRC_DIR)/*.cpp) \ $(TOOL_CFLAGS) \ -o$(BIN_DIR)/$(TOOL_TARGET) \ - $(PIPELINE) $(OPT) -DTOOLMODE $(REP) + $(PIPELINE) $(NVTENSOR) $(OPT) -DTOOLMODE $(REP) odock: check-env-all kernels @echo "Building" $(TARGET) "..." @@ -194,7 +203,7 @@ odock: check-env-all kernels $(CFLAGS) \ $(LIB_CUDA) \ -o$(BIN_DIR)/$(TARGET) \ - $(DEV) $(NWI) $(PIPELINE) $(OPT) $(DD) $(REP) $(KFLAGS) + $(DEV) $(NWI) $(PIPELINE) $(NVTENSOR) $(OPT) $(DD) $(REP) $(KFLAGS) # Example # 1ac8: for testing gradients of translation and rotation genes diff --git a/cuda/calcMergeEneGra.cu b/cuda/calcMergeEneGra.cu index 1e2f0d54..7b19a894 100644 --- a/cuda/calcMergeEneGra.cu +++ b/cuda/calcMergeEneGra.cu @@ -642,23 +642,77 @@ __device__ void gpu_calc_energrad( torque_rot.z += tr.z; } - // Do a reduction over the total gradient containing prepared "gradient_intra_*" values +#ifdef USE_NVTENSOR + /* Begin: Reduction using tensor units */ + + // Implementation based on M.Sc. thesis by Gabin Schieffer at KTH: + // "Accelerating a Molecular Docking Application by Leveraging Modern Heterogeneous Computing Systemx" + // https://www.diva-portal.org/smash/get/diva2:1786161/FULLTEXT01.pdf + + // 1. Convert data-to-be-reduced from float to half + // and place it in a shared memory array + __shared__ __align__(256) float data_to_be_reduced[4*NUM_OF_THREADS_PER_BLOCK]; + data_to_be_reduced[4*threadIdx.x] = torque_rot.x; + data_to_be_reduced[4*threadIdx.x + 1] = torque_rot.y; + data_to_be_reduced[4*threadIdx.x + 2] = torque_rot.z; + data_to_be_reduced[4*threadIdx.x + 3] = energy; + + // 2. Perform reduction via tensor units + reduce_via_tensor_units(data_to_be_reduced); + + // 3. Retrieve results from shared memory + torque_rot.x = data_to_be_reduced[0]; + torque_rot.y = data_to_be_reduced[1]; + torque_rot.z = data_to_be_reduced[2]; + energy = data_to_be_reduced[3]; + + /* End: Reduction using tensor units */ +#else + // Reduction over the total gradient containing prepared "gradient_intra_*" values REDUCEFLOATSUM(torque_rot.x, pFloatAccumulator); REDUCEFLOATSUM(torque_rot.y, pFloatAccumulator); REDUCEFLOATSUM(torque_rot.z, pFloatAccumulator); + // Reduction over partial energies and prepared "gradient_intra_*" values + REDUCEFLOATSUM(energy, pFloatAccumulator); +#endif + // TODO // ------------------------------------------------------- // Obtaining energy and translation-related gradients // ------------------------------------------------------- - // reduction over partial energies and prepared "gradient_intra_*" values - REDUCEFLOATSUM(energy, pFloatAccumulator); + #if defined (DEBUG_ENERGY_KERNEL) REDUCEFLOATSUM(intraE, pFloatAccumulator); #endif + +#ifdef USE_NVTENSOR + /* Begin: Reduction using tensor units */ + + // Implementation based on M.Sc. thesis by Gabin Schieffer at KTH: + // "Accelerating a Molecular Docking Application by Leveraging Modern Heterogeneous Computing Systemx" + // https://www.diva-portal.org/smash/get/diva2:1786161/FULLTEXT01.pdf + + // 1. Convert data-to-be-reduced from float to half + // and place it in a shared memory array + data_to_be_reduced[4*threadIdx.x] = gx; + data_to_be_reduced[4*threadIdx.x + 1] = gy; + data_to_be_reduced[4*threadIdx.x + 2] = gz; + + // 2. Perform reduction via tensor units + reduce_via_tensor_units(data_to_be_reduced); + + // 3. Retrieve results from shared memory + gx = data_to_be_reduced[0]; + gy = data_to_be_reduced[1]; + gz = data_to_be_reduced[2]; + + /* End: Reduction using tensor units */ +#else REDUCEFLOATSUM(gx, pFloatAccumulator); REDUCEFLOATSUM(gy, pFloatAccumulator); REDUCEFLOATSUM(gz, pFloatAccumulator); +#endif global_energy = energy; #ifndef FLOAT_GRADIENTS diff --git a/cuda/kernels.cu b/cuda/kernels.cu index c98c69b5..f1560cbf 100644 --- a/cuda/kernels.cu +++ b/cuda/kernels.cu @@ -93,6 +93,90 @@ __device__ inline int64_t ullitolli(uint64_t u) #define ATOMICADDF32(pAccumulator, value) atomicAdd(pAccumulator, (value)) #define ATOMICSUBF32(pAccumulator, value) atomicAdd(pAccumulator, -(value)) +#ifdef USE_NVTENSOR +/* Begin: Reduction using tensor units */ + +// Implementation based on M.Sc. thesis by Gabin Schieffer at KTH: +// "Accelerating a Molecular Docking Application by Leveraging Modern Heterogeneous Computing Systemx" +// https://www.diva-portal.org/smash/get/diva2:1786161/FULLTEXT01.pdf + + /* + * WMMA Extension for single precision matmul using Tensor Cores + * and error correction technique (TCEC) + * https://github.com/wmmae/wmma_extension/blob/main/docs/mma_f32.md + */ + #include + using tf32 = nvcuda::wmma::precision::tf32; + +/* + * Tensor Cores + * https://developer.nvidia.com/blog/programming-tensor-cores-cuda-9 + * + * Don't forget to compile specifying the architecture, e.g., sm_86. + * For AutoDock-GPU, this can be done via the TARGETS option. + * make DEVICE=GPU TESTLS=ad NUMWI=64 TARGETS=86 test + * https://stackoverflow.com/a/53634598/1616865 + */ +#include +using namespace nvcuda; + +#define TILE_SIZE (16 * 16) + +constexpr int rowscols_M = 16; // Number of rows (or cols) in the M dimension +constexpr int rowscols_N = 16; // Number of rows (or cols) in the N dimension +constexpr int rowscols_K = 16; // Number of rows (or cols) in the K dimension + +__device__ void reduce_via_tensor_units(float *data_to_be_reduced) { + __syncthreads(); + + if (threadIdx.x <= 31) { // Only one warp performs reduction + __shared__ __align__ (256) float Q_square[TILE_SIZE]; // storage for 16x16 matrix and 4x4 tiles of I4 matrix after + + // Declaring and filling fragments - Those are *not* shared + mtk::wmma::tcec::fragment frag_P; + mtk::wmma::tcec::fragment frag_V; + + mtk::wmma::tcec::fragment frag_Q; + mtk::wmma::tcec::fragment frag_W; + mtk::wmma::tcec::fragment frag_C; + + mtk::wmma::tcec::fill_fragment(frag_P, 1.0f); // P: only ones + mtk::wmma::tcec::fill_fragment(frag_V, 0.0f); // Output: initialize to zeros + mtk::wmma::tcec::fill_fragment(frag_C, 0.0f); // Final result + + // 1. Accumulate the values: V <- AP + V + for(uint i = 0; i < (4 * NUM_OF_THREADS_PER_BLOCK)/TILE_SIZE; i++){ + const unsigned int offset = i * TILE_SIZE; + + mtk::wmma::tcec::fragment frag_A; + mtk::wmma::tcec::load_matrix_sync(frag_A, data_to_be_reduced + offset, 16); + mtk::wmma::tcec::mma_sync(frag_V, frag_A, frag_P, frag_V); + } + + // W <- V (required since we need V as a "wmma::matrix_b") + mtk::wmma::tcec::store_matrix_sync(Q_square, frag_V, 16, wmma::mem_col_major); + mtk::wmma::tcec::load_matrix_sync(frag_W, Q_square, 16); + + // 2. Perform line sum: C <- QW + C (zero) + // a) create a 4x4 tiled matrix containing 4x4 identity matrix in each tile: + // - TENSOR=ON requires NUMWI to be larger than 32, so the following works and neatly gets rid of an additional function: + const unsigned int k = (threadIdx.x<<3); + const unsigned int kk = 16 - (threadIdx.x>>1); + for(uint i = 0; i < 8; i++) Q_square[k + i] = ((i + kk) & 3) ? 0.0f : 1.0f; + mtk::wmma::tcec::load_matrix_sync(frag_Q, Q_square, 16); + // b) perform sum + mtk::wmma::tcec::mma_sync(frag_C, frag_Q, frag_W, frag_C); + + // 3. Store result in shared memory + mtk::wmma::tcec::store_matrix_sync(data_to_be_reduced, frag_C, 16, wmma::mem_col_major); + } + + __syncthreads(); +} + +/* End: Reduction using tensor units */ +#endif + #define REDUCEFLOATSUM(value, pAccumulator) \ if (threadIdx.x == 0) \ { \ diff --git a/host/src/performdocking.cpp b/host/src/performdocking.cpp index 2375386c..a6e3d661 100644 --- a/host/src/performdocking.cpp +++ b/host/src/performdocking.cpp @@ -450,6 +450,13 @@ void setup_gpu_for_docking( cudaDeviceProp props; RTERROR(cudaGetDevice(&(cData.devnum)),"ERROR in cudaGetDevice:"); RTERROR(cudaGetDeviceProperties(&props,cData.devnum),"ERROR in cudaGetDeviceProperties:"); +#ifdef USE_NVTENSOR + if(props.major < 8){ + printf("Error: Compute capability 8.0 or higher is needed for tensor core sum reductions.\n"); + printf(" Available device %s has compute capability %d.%d.\n", props.name, props.major, props.minor); + exit(-1); + } +#endif tData.device_name = (char*) malloc(strlen(props.name)+32); // make sure array is large enough to hold device number text too strcpy(tData.device_name, props.name); if(gpuCount>1) snprintf(&tData.device_name[strlen(props.name)], strlen(props.name)+32, " (#%d / %d)",cData.devnum+1,gpuCount); diff --git a/test_cuda.sh b/test_cuda.sh index 12fb3d86..e39cceb0 100755 --- a/test_cuda.sh +++ b/test_cuda.sh @@ -26,7 +26,7 @@ if [[ "$4" != "" ]]; then done TARGETS="$4" else - TARGETS=`awk -F'_' '{ if(\$2>50) print \$2 }' <<< "$TARGETS_SUPPORTED" | tr "\n" " "` + TARGETS=`awk -F'_' "{ if(\\$2>=$5) print \\$2 }" <<< "$TARGETS_SUPPORTED" | tr "\n" " "` fi printf "Compiling for targets: %s\n" "$TARGETS" >&2 cd "$script_dir" diff --git a/wmma_extension b/wmma_extension new file mode 160000 index 00000000..7175b5f4 --- /dev/null +++ b/wmma_extension @@ -0,0 +1 @@ +Subproject commit 7175b5f4feea2a16b8d5b3045a357db010583c5e