diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c
index f01ec95f8c..5991559cd3 100644
--- a/backends/cuda-shared/ceed-cuda-shared-basis.c
+++ b/backends/cuda-shared/ceed-cuda-shared-basis.c
@@ -18,13 +18,6 @@
 #include "../cuda/ceed-cuda-compile.h"
 #include "ceed-cuda-shared.h"
 
-//------------------------------------------------------------------------------
-// Device initalization
-//------------------------------------------------------------------------------
-int CeedInit_CudaInterp(CeedScalar *d_B, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B);
-int CeedInit_CudaGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr);
-int CeedInit_CudaCollocatedGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr);
-
 //------------------------------------------------------------------------------
 // Apply tensor basis
 //------------------------------------------------------------------------------
@@ -58,8 +51,7 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add,
   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
   CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
 
-  CeedCallBackend(CeedInit_CudaInterp(data->d_interp_1d, P_1d, Q_1d, &data->c_B));
-  void *interp_args[] = {(void *)&num_elem, &data->c_B, &d_u, &d_v};
+  void *interp_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_u, &d_v};
 
   if (dim == 1) {
     // avoid >512 total threads
@@ -104,14 +96,14 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add,
   CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
-  CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
+  CeedInt     thread_1d = CeedIntMax(Q_1d, P_1d);
+  CeedScalar *d_grad_1d = data->d_grad_1d;
 
   if (data->d_collo_grad_1d) {
-    CeedCallBackend(CeedInit_CudaCollocatedGrad(data->d_interp_1d, data->d_collo_grad_1d, P_1d, Q_1d, &data->c_B, &data->c_G));
-  } else {
-    CeedCallBackend(CeedInit_CudaGrad(data->d_interp_1d, data->d_grad_1d, P_1d, Q_1d, &data->c_B, &data->c_G));
+    d_grad_1d = data->d_collo_grad_1d;
   }
-  void *grad_args[] = {(void *)&num_elem, &data->c_B, &data->c_G, &d_u, &d_v};
+  void *grad_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_grad_1d, &d_u, &d_v};
+
   if (dim == 1) {
     // avoid >512 total threads
     CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1));
@@ -328,8 +320,7 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
   CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
 
-  CeedCallBackend(CeedInit_CudaInterp(data->d_chebyshev_interp_1d, P_1d, Q_1d, &data->c_B));
-  void *interp_args[] = {(void *)&num_elem, &data->c_B, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+  void *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
 
   if (dim == 1) {
     // avoid >512 total threads
@@ -364,7 +355,6 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
   CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
 
-  CeedCallBackend(CeedInit_CudaInterp(data->d_chebyshev_interp_1d, P_1d, Q_1d, &data->c_B));
   void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
 
   if (dim == 1) {
@@ -456,8 +446,7 @@ static int CeedBasisApplyNonTensorCore_Cuda_shared(CeedBasis basis, bool apply_a
   CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
   CeedInt thread = CeedIntMax(Q, P);
 
-  CeedCallBackend(CeedInit_CudaInterp(data->d_interp_1d, P, Q, &data->c_B));
-  void *interp_args[] = {(void *)&num_elem, &data->c_B, &d_u, &d_v};
+  void *interp_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_u, &d_v};
 
   {
     // avoid >512 total threads
@@ -480,8 +469,7 @@ static int CeedBasisApplyNonTensorCore_Cuda_shared(CeedBasis basis, bool apply_a
   CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
   CeedInt thread = CeedIntMax(Q, P);
 
-  CeedCallBackend(CeedInit_CudaInterp(data->d_grad_1d, P, Q * dim, &data->c_G));
-  void *grad_args[] = {(void *)&num_elem, &data->c_G, &d_u, &d_v};
+  void *grad_args[] = {(void *)&num_elem, &data->d_grad_1d, &d_u, &d_v};
 
   {
     // avoid >512 total threads
@@ -642,6 +630,10 @@ int CeedBasisCreateH1_Cuda_shared(CeedElemTopology topo, CeedInt dim, CeedInt nu
   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
   CeedCallBackend(CeedCalloc(1, &data));
 
+  // Check max sizes
+  CeedCheck(dim <= 3, ceed, CEED_ERROR_BACKEND, "Backend does not implement nontensor bases with dim > 3");
+  CeedCheck(num_nodes * num_qpts * dim < 52 * 52 * 3, ceed, CEED_ERROR_BACKEND, "Backend does not implement nontensor bases with P * Q this large");
+
   // Copy basis data to GPU
   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad));
diff --git a/backends/cuda-shared/kernels/cuda-shared-basis.cu b/backends/cuda-shared/kernels/cuda-shared-basis.cu
deleted file mode 100644
index f654f7ddda..0000000000
--- a/backends/cuda-shared/kernels/cuda-shared-basis.cu
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED: http://github.com/ceed
-
-#include
-#include
-
-const int sizeMax = 16;
-__constant__ CeedScalar c_B[sizeMax * sizeMax];
-__constant__ CeedScalar c_G[sizeMax * sizeMax];
-
-//------------------------------------------------------------------------------
-// Interp device initialization
-//------------------------------------------------------------------------------
-extern "C" int CeedInit_CudaInterp(CeedScalar *d_B, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr) {
-  const int bytes = P_1d * Q_1d * sizeof(CeedScalar);
-
-  cudaMemcpyToSymbol(c_B, d_B, bytes, 0, cudaMemcpyDeviceToDevice);
-  cudaGetSymbolAddress((void **)c_B_ptr, c_B);
-  return CEED_ERROR_SUCCESS;
-}
-
-//------------------------------------------------------------------------------
-// Grad device initialization
-//------------------------------------------------------------------------------
-extern "C" int CeedInit_CudaGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr) {
-  const int bytes = P_1d * Q_1d * sizeof(CeedScalar);
-
-  cudaMemcpyToSymbol(c_B, d_B, bytes, 0, cudaMemcpyDeviceToDevice);
-  cudaGetSymbolAddress((void **)c_B_ptr, c_B);
-  cudaMemcpyToSymbol(c_G, d_G, bytes, 0, cudaMemcpyDeviceToDevice);
-  cudaGetSymbolAddress((void **)c_G_ptr, c_G);
-  return CEED_ERROR_SUCCESS;
-}
-
-//------------------------------------------------------------------------------
-// Collocated grad device initialization
-//------------------------------------------------------------------------------
-extern "C" int CeedInit_CudaCollocatedGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr) {
-  const int bytes_interp = P_1d * Q_1d * sizeof(CeedScalar);
-  const int bytes_grad   = Q_1d * Q_1d * sizeof(CeedScalar);
-
-  cudaMemcpyToSymbol(c_B, d_B, bytes_interp, 0, cudaMemcpyDeviceToDevice);
-  cudaGetSymbolAddress((void **)c_B_ptr, c_B);
-  cudaMemcpyToSymbol(c_G, d_G, bytes_grad, 0, cudaMemcpyDeviceToDevice);
-  cudaGetSymbolAddress((void **)c_G_ptr, c_G);
-  return CEED_ERROR_SUCCESS;
-}
-
-//------------------------------------------------------------------------------
diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index 90357211ad..a94ef0c868 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -170,6 +170,7 @@ static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add,
     d_grad_1d = data->d_collo_grad_1d;
   }
   void *grad_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_grad_1d, &d_u, &d_v};
+
   if (dim == 1) {
     CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
     elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h
index 4495ed0381..2187632e0d 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h
@@ -28,9 +28,15 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B,
   CeedScalar r_U[BASIS_NUM_COMP];
   CeedScalar r_V[BASIS_NUM_COMP];
 
+  // load interp into shared memory
+  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
+  LoadMatrix(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     ReadElementStrided1d(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
-    InterpNonTensor(data, r_U, c_B, r_V);
+    InterpNonTensor(data, r_U, s_B, r_V);
     WriteElementStrided1d(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
   }
 }
@@ -49,9 +55,15 @@ extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedSca
   CeedScalar r_U[BASIS_NUM_COMP];
   CeedScalar r_V[BASIS_NUM_COMP];
 
+  // load interp into shared memory
+  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
+  LoadMatrix(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     ReadElementStrided1d(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
-    InterpTransposeNonTensor(data, r_U, c_B, r_V);
+    InterpTransposeNonTensor(data, r_U, s_B, r_V);
     WriteElementStrided1d(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
   }
 }
@@ -70,9 +82,15 @@ extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const Ceed
   CeedScalar r_U[BASIS_NUM_COMP];
   CeedScalar r_V[BASIS_NUM_COMP];
 
+  // load interp into shared memory
+  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
+  LoadMatrix(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     ReadElementStrided1d(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
-    InterpTransposeNonTensor(data, r_U, c_B, r_V);
+    InterpTransposeNonTensor(data, r_U, s_B, r_V);
     SumElementStrided1d(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
   }
 }
@@ -93,9 +111,15 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_G, c
   CeedScalar r_U[BASIS_NUM_COMP];
   CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];
 
+  // load grad into shared memory
+  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
+  LoadMatrix(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     ReadElementStrided1d(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
-    GradNonTensor(data, r_U, c_G, r_V);
+    GradNonTensor(data, r_U, s_G, r_V);
     WriteElementStrided1d(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
   }
 }
@@ -114,9 +138,15 @@ extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScala
   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
   CeedScalar r_V[BASIS_NUM_COMP];
 
+  // load grad into shared memory
+  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
+  LoadMatrix(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     ReadElementStrided1d(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
-    GradTransposeNonTensor(data, r_U, c_G, r_V);
+    GradTransposeNonTensor(data, r_U, s_G, r_V);
     WriteElementStrided1d(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
   }
 }
@@ -135,9 +165,15 @@ extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedSc
   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
   CeedScalar r_V[BASIS_NUM_COMP];
 
+  // load grad into shared memory
+  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
+  LoadMatrix(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     ReadElementStrided1d(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
-    GradTransposeNonTensor(data, r_U, c_G, r_V);
+    GradTransposeNonTensor(data, r_U, s_G, r_V);
     SumElementStrided1d(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
   }
 }
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h
index 49e7eca873..066f95ed58 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h
@@ -9,6 +9,14 @@
 /// Internal header for CUDA shared memory basis read/write templates
 #include
 
+//------------------------------------------------------------------------------
+// Load matrices for basis actions
+//------------------------------------------------------------------------------
+template <int P, int Q>
+inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
+  for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i];
+}
+
 //------------------------------------------------------------------------------
 // 1D
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h
index cd9021611a..d0cc602be9 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h
@@ -36,19 +36,24 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScal
   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP];
 
+  // load interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix(data, c_B, s_B);
+  __syncthreads();
+
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     // Map to coefficients
     if (BASIS_DIM == 1) {
       ReadElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
-      Interp1d(data, r_U, c_B, r_C);
+      Interp1d(data, r_U, s_B, r_C);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      InterpTensor2d(data, r_U, c_B, r_C);
+      InterpTensor2d(data, r_U, s_B, r_C);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      InterpTensor3d(data, r_U, c_B, r_C);
+      InterpTensor3d(data, r_U, s_B, r_C);
     }
 
     // Map to points
@@ -87,6 +92,11 @@ extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const
   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
 
+  // load interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix(data, c_B, s_B);
+  __syncthreads();
+
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     // Clear register
@@ -112,13 +122,13 @@
 
     // Map from coefficients
     if (BASIS_DIM == 1) {
-      InterpTranspose1d(data, r_C, c_B, r_V);
+      InterpTranspose1d(data, r_C, s_B, r_V);
       SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
-      InterpTransposeTensor2d(data, r_C, c_B, r_V);
+      InterpTransposeTensor2d(data, r_C, s_B, r_V);
       SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
-      InterpTransposeTensor3d(data, r_C, c_B, r_V);
+      InterpTransposeTensor3d(data, r_C, s_B, r_V);
       SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
@@ -144,19 +154,24 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar
   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];
 
+  // load interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix(data, c_B, s_B);
+  __syncthreads();
+
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     // Map to coefficients
     if (BASIS_DIM == 1) {
       ReadElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
-      Interp1d(data, r_U, c_B, r_C);
+      Interp1d(data, r_U, s_B, r_C);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      InterpTensor2d(data, r_U, c_B, r_C);
+      InterpTensor2d(data, r_U, s_B, r_C);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      InterpTensor3d(data, r_U, c_B, r_C);
+      InterpTensor3d(data, r_U, s_B, r_C);
     }
 
     // Map to points
@@ -195,6 +210,11 @@ extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const C
   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
 
+  // load interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix(data, c_B, s_B);
+  __syncthreads();
+
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     // Clear register
@@ -221,13 +241,13 @@
 
     // Map from coefficients
     if (BASIS_DIM == 1) {
-      InterpTranspose1d(data, r_C, c_B, r_V);
+      InterpTranspose1d(data, r_C, s_B, r_V);
       SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
-      InterpTransposeTensor2d(data, r_C, c_B, r_V);
+      InterpTransposeTensor2d(data, r_C, s_B, r_V);
       SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
-      InterpTransposeTensor3d(data, r_C, c_B, r_V);
+      InterpTransposeTensor3d(data, r_C, s_B, r_V);
       SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h
index 9b80043996..a70481fb55 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h
@@ -28,19 +28,25 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B,
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
 
+  // load interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
-      Interp1d(data, r_U, c_B, r_V);
+      Interp1d(data, r_U, s_B, r_V);
       WriteElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      InterpTensor2d(data, r_U, c_B, r_V);
+      InterpTensor2d(data, r_U, s_B, r_V);
       WriteElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      InterpTensor3d(data, r_U, c_B, r_V);
+      InterpTensor3d(data, r_U, s_B, r_V);
       WriteElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
     }
@@ -61,19 +67,25 @@ extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedSca
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
 
+  // load interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
-      InterpTranspose1d(data, r_U, c_B, r_V);
+      InterpTranspose1d(data, r_U, s_B, r_V);
       WriteElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
-      InterpTransposeTensor2d(data, r_U, c_B, r_V);
+      InterpTransposeTensor2d(data, r_U, s_B, r_V);
       WriteElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
-      InterpTransposeTensor3d(data, r_U, c_B, r_V);
+      InterpTransposeTensor3d(data, r_U, s_B, r_V);
       WriteElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
@@ -94,19 +106,25 @@ extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const Ceed
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
 
+  // load interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
-      InterpTranspose1d(data, r_U, c_B, r_V);
+      InterpTranspose1d(data, r_U, s_B, r_V);
       SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
-      InterpTransposeTensor2d(data, r_U, c_B, r_V);
+      InterpTransposeTensor2d(data, r_U, s_B, r_V);
       SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
-      InterpTransposeTensor3d(data, r_U, c_B, r_V);
+      InterpTransposeTensor3d(data, r_U, s_B, r_V);
       SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
@@ -130,21 +148,29 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_B, c
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
 
+  // load interp_1d and grad_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix(data, c_B, s_B);
+  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
+  LoadMatrix(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
-      Grad1d(data, r_U, c_B, c_G, r_V);
+      Grad1d(data, r_U, s_B, s_G, r_V);
       WriteElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      GradTensor2d(data, r_U, c_B, c_G, r_V);
+      GradTensor2d(data, r_U, s_B, s_G, r_V);
       WriteElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      if (BASIS_HAS_COLLOCATED_GRAD) GradTensorCollocated3d(data, r_U, c_B, c_G, r_V);
-      else GradTensor3d(data, r_U, c_B, c_G, r_V);
+      if (BASIS_HAS_COLLOCATED_GRAD) GradTensorCollocated3d(data, r_U, s_B, s_G, r_V);
+      else GradTensor3d(data, r_U, s_B, s_G, r_V);
       WriteElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
     }
@@ -165,21 +191,29 @@ extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScala
   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
 
+  // load interp_1d and grad_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix(data, c_B, s_B);
+  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
+  LoadMatrix(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
      ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
-      GradTranspose1d(data, r_U, c_B, c_G, r_V);
+      GradTranspose1d(data, r_U, s_B, s_G, r_V);
       WriteElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
-      GradTransposeTensor2d(data, r_U, c_B, c_G, r_V);
+      GradTransposeTensor2d(data, r_U, s_B, s_G, r_V);
       WriteElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
-      if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d(data, r_U, c_B, c_G, r_V);
-      else GradTransposeTensor3d(data, r_U, c_B, c_G, r_V);
+      if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d(data, r_U, s_B, s_G, r_V);
+      else GradTransposeTensor3d(data, r_U, s_B, s_G, r_V);
       WriteElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
@@ -200,21 +234,29 @@ extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedSc
   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
 
+  // load interp_1d and grad_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix(data, c_B, s_B);
+  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
+  LoadMatrix(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
-      GradTranspose1d(data, r_U, c_B, c_G, r_V);
+      GradTranspose1d(data, r_U, s_B, s_G, r_V);
       SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
-      GradTransposeTensor2d(data, r_U, c_B, c_G, r_V);
+      GradTransposeTensor2d(data, r_U, s_B, s_G, r_V);
       SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
-      if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d(data, r_U, c_B, c_G, r_V);
-      else GradTransposeTensor3d(data, r_U, c_B, c_G, r_V);
+      if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d(data, r_U, s_B, s_G, r_V);
+      else GradTransposeTensor3d(data, r_U, s_B, s_G, r_V);
       SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
@@ -236,6 +278,7 @@ extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__re
 
   CeedScalar r_W[BASIS_DIM > 2 ? BASIS_Q_1D : 1];
 
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       Weight1d(data, q_weight_1d, r_W);
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h
index a5313ec925..47b4eae92f 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h
@@ -12,11 +12,9 @@
 //------------------------------------------------------------------------------
 // Helper function: load matrices for basis actions
 //------------------------------------------------------------------------------
-template <int SIZE>
-inline __device__ void loadMatrix(const CeedScalar *d_B, CeedScalar *B) {
-  CeedInt tid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-
-  for (CeedInt i = tid; i < SIZE; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i];
+template <int P, int Q>
+inline __device__ void LoadMatrix(SharedData_Hip &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
+  for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i];
 }
 
 //------------------------------------------------------------------------------
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
index 9f5e947a07..753d5e1af7 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
@@ -21,15 +21,10 @@
 // Interp
 //------------------------------------------------------------------------------
 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
-    void InterpAtPoints(const CeedInt num_elem, const CeedScalar *d_chebyshev_interp_1d, const CeedInt *points_per_elem,
-                        const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+    void InterpAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X,
+                        const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
 
-  // load chebyshev_interp_1d into shared memory
-  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
-  loadMatrix(d_chebyshev_interp_1d, s_B);
-  __syncthreads();
-
   SharedData_Hip data;
   data.t_id_x = threadIdx.x;
   data.t_id_y = threadIdx.y;
@@ -42,6 +37,11 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP];
 
+  // load chebyshev_interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix(data, c_B, s_B);
+  __syncthreads();
+
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     // Map to coefficients
@@ -77,15 +77,10 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
 }
 
 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
-    void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *d_chebyshev_interp_1d, const CeedInt *points_per_elem,
-                                 const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+    void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X,
+                                 const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
 
-  // load chebyshev_interp_1d into shared memory
-  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
-  loadMatrix(d_chebyshev_interp_1d, s_B);
-  __syncthreads();
-
   SharedData_Hip data;
   data.t_id_x = threadIdx.x;
   data.t_id_y = threadIdx.y;
@@ -98,6 +93,11 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
 
+  // load chebyshev_interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix(data, c_B, s_B);
+  __syncthreads();
+
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     // Clear register
@@ -140,15 +140,10 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
 // Grad
 //------------------------------------------------------------------------------
 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
-    void GradAtPoints(const CeedInt num_elem, const CeedScalar *d_chebyshev_interp_1d, const CeedInt *points_per_elem,
-                      const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+    void GradAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X,
+                      const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
 
-  // load chebyshev_interp_1d into shared memory
-  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
-  loadMatrix(d_chebyshev_interp_1d, s_B);
-  __syncthreads();
-
   SharedData_Hip data;
   data.t_id_x = threadIdx.x;
   data.t_id_y = threadIdx.y;
@@ -161,6 +156,11 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];
 
+  // load chebyshev_interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix(data, c_B, s_B);
+  __syncthreads();
+
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     // Map to coefficients
@@ -196,15 +196,10 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
 }
 
 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
-    void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *d_chebyshev_interp_1d, const CeedInt *points_per_elem,
-                               const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+    void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X,
+                               const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
 
-  // load chebyshev_interp_1d into shared memory
-  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
-  loadMatrix(d_chebyshev_interp_1d, s_B);
-  __syncthreads();
-
   SharedData_Hip data;
   data.t_id_x = threadIdx.x;
   data.t_id_y = threadIdx.y;
@@ -217,6 +212,11 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
 
+  // load chebyshev_interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix(data, c_B, s_B);
+  __syncthreads();
+
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     // Clear register
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
index d84f5555c8..fda6ee2cfe 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
@@ -16,14 +16,9 @@
 // Interp kernel by dim
 //------------------------------------------------------------------------------
 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
-    void Interp(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+    void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
 
-  // load interp_1d into shared memory
-  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
-  loadMatrix(d_interp_1d, s_B);
-  __syncthreads();
-
   SharedData_Hip data;
   data.t_id_x = threadIdx.x;
   data.t_id_y = threadIdx.y;
@@ -34,6 +29,12 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
 
+  // load interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
@@ -54,14 +55,9 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
 }
 
 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
-    void InterpTranspose(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+    void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
 
-  // load interp_1d into shared memory
-  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
-  loadMatrix(d_interp_1d, s_B);
-  __syncthreads();
-
   SharedData_Hip data;
   data.t_id_x = threadIdx.x;
   data.t_id_y = threadIdx.y;
@@ -72,6 +68,12 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
 
+  // load interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
@@ -92,14 +94,9 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
 }
 
 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
-    void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+    void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
 
-  // load interp_1d into shared memory
-  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
-  loadMatrix(d_interp_1d, s_B);
-  __syncthreads();
-
   SharedData_Hip data;
   data.t_id_x = threadIdx.x;
   data.t_id_y = threadIdx.y;
@@ -110,6 +107,12 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
 
+  // load interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
@@ -132,18 +135,10 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
 //------------------------------------------------------------------------------
 // Grad kernel by dim
 //------------------------------------------------------------------------------
-extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
-    void Grad(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *d_grad_1d, const CeedScalar *__restrict__ d_U,
-              CeedScalar *__restrict__ d_V) {
+extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G,
+                                                                         const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
 
-  // load interp_1d and grad_1d into shared memory
-  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
-  loadMatrix(d_interp_1d, s_B);
-  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
-  loadMatrix(d_grad_1d, s_G);
-  __syncthreads();
-
   SharedData_Hip data;
   data.t_id_x = threadIdx.x;
   data.t_id_y = threadIdx.y;
@@ -154,6 +149,14 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
 
+  // load interp_1d and grad_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix(data, c_B, s_B);
+  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
+  LoadMatrix(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
@@ -176,17 +179,10 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
 }
 
 extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
-    void GradTranspose(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *d_grad_1d, const CeedScalar *__restrict__ d_U,
+    void GradTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
                        CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
 
-  // load interp_1d and grad_1d into shared memory
-  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
-  loadMatrix(d_interp_1d, s_B);
-  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
-  loadMatrix(d_grad_1d, s_G);
-  __syncthreads();
-
   SharedData_Hip data;
   data.t_id_x = threadIdx.x;
   data.t_id_y = threadIdx.y;
@@ -197,6 +193,14 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
 
+  // load interp_1d and grad_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix(data, c_B, s_B);
+  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
+  LoadMatrix(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
@@ -219,17 +223,10 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
 }
 
 extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
-    void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *d_grad_1d, const CeedScalar *__restrict__ d_U,
+    void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
                           CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
 
-  // load interp_1d and grad_1d into shared memory
-  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
-  loadMatrix(d_interp_1d, s_B);
-  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
-  loadMatrix(d_grad_1d, s_G);
-  __syncthreads();
-
   SharedData_Hip data;
   data.t_id_x = threadIdx.x;
   data.t_id_y = threadIdx.y;
@@ -240,6 +237,14 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
 
+  // load interp_1d and grad_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix(data, c_B, s_B);
+  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
+  LoadMatrix(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
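
Note on the pattern introduced above: every kernel now stages its basis matrix in shared memory with a cooperative LoadMatrix copy followed by __syncthreads(), replacing the deleted cudaMemcpyToSymbol/__constant__ staging (which hard-capped matrices at sizeMax = 16 entries per side). The stand-alone sketch below illustrates the same idea outside libCEED's JiT pipeline; the sizes P and Q, the kernel body, and the row-major matrix layout are illustrative assumptions, not libCEED's actual JiT source.

#include <cstdio>
#include <cuda_runtime.h>

// Illustrative compile-time sizes standing in for the JiT-time macros (e.g. BASIS_P, BASIS_Q).
constexpr int P = 4, Q = 6;

// Cooperative copy of a P x Q matrix from global to shared memory:
// every thread in the block strides over the flat array, as LoadMatrix does above.
template <int NP, int NQ>
__device__ void load_matrix(const double *__restrict__ d_B, double *s_B) {
  const int t_id = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  for (int i = t_id; i < NP * NQ; i += blockDim.x * blockDim.y * blockDim.z) s_B[i] = d_B[i];
}

// Apply the P x Q matrix B (row-major) to each element's P nodal values,
// one element per z-slice of the block, grid-stride over elements.
__global__ void interp(int num_elem, const double *d_B, const double *d_U, double *d_V) {
  __shared__ double s_B[P * Q];
  load_matrix<P, Q>(d_B, s_B);  // stage once per block
  __syncthreads();              // all of s_B must be written before any thread reads it

  for (int elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    const int q = threadIdx.x;
    if (q < Q) {
      double sum = 0.0;
      for (int p = 0; p < P; p++) sum += s_B[p * Q + q] * d_U[elem * P + p];
      d_V[elem * Q + q] = sum;
    }
  }
}

int main() {
  const int num_elem = 8;
  double *d_B, *d_U, *d_V;
  cudaMalloc(&d_B, P * Q * sizeof(double));
  cudaMalloc(&d_U, num_elem * P * sizeof(double));
  cudaMalloc(&d_V, num_elem * Q * sizeof(double));
  cudaMemset(d_B, 0, P * Q * sizeof(double));
  cudaMemset(d_U, 0, num_elem * P * sizeof(double));
  interp<<<dim3(2), dim3(Q, 1, 4)>>>(num_elem, d_B, d_U, d_V);  // 2 blocks x 4 elements per block
  printf("launch status: %s\n", cudaGetErrorString(cudaDeviceSynchronize()));
  cudaFree(d_B);
  cudaFree(d_U);
  cudaFree(d_V);
  return 0;
}

The __syncthreads() is load-bearing: the grid-stride element loop reuses s_B once per element, so the copy must be complete before any thread starts contracting against it.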
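
Note on the new CeedCheck bounds in CeedBasisCreateH1_Cuda_shared: with the matrices now staged per block, a nontensor basis has to fit in shared memory, and the largest buffer is s_G[BASIS_P * BASIS_Q * BASIS_DIM]. Reading the guard's numbers back (an inference from the diff, assuming CeedScalar is double): 52 * 52 * 3 = 8112 entries, and 8112 * 8 bytes = 64896 bytes, on the order of the per-block shared memory of recent GPUs, a ceiling the old 16 * 16 __constant__ symbols never approached.

/* Worked check of the guard above (assumption: CeedScalar == double). */
#include <stdio.h>

int main(void) {
  const long max_entries = 52L * 52L * 3L; /* 8112 entries */
  printf("%ld entries -> %ld bytes\n", max_entries, max_entries * (long)sizeof(double)); /* 64896 bytes */
  return 0;
}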