Skip to content

Commit

Permalink
gpu - use gen LoadMatrix in shared
Browse files Browse the repository at this point in the history
  • Loading branch information
jeremylt committed Jan 6, 2025
1 parent 0ff6123 commit 324c97e
Show file tree
Hide file tree
Showing 11 changed files with 241 additions and 191 deletions.
34 changes: 13 additions & 21 deletions backends/cuda-shared/ceed-cuda-shared-basis.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,6 @@
#include "../cuda/ceed-cuda-compile.h"
#include "ceed-cuda-shared.h"

//------------------------------------------------------------------------------
// Device initalization
//------------------------------------------------------------------------------
int CeedInit_CudaInterp(CeedScalar *d_B, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B);
int CeedInit_CudaGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr);
int CeedInit_CudaCollocatedGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr);

//------------------------------------------------------------------------------
// Apply tensor basis
//------------------------------------------------------------------------------
Expand Down Expand Up @@ -58,8 +51,7 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add,
CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);

CeedCallBackend(CeedInit_CudaInterp(data->d_interp_1d, P_1d, Q_1d, &data->c_B));
void *interp_args[] = {(void *)&num_elem, &data->c_B, &d_u, &d_v};
void *interp_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_u, &d_v};

if (dim == 1) {
// avoid >512 total threads
Expand Down Expand Up @@ -104,14 +96,14 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add,

CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
CeedScalar *d_grad_1d = data->d_grad_1d;

if (data->d_collo_grad_1d) {
CeedCallBackend(CeedInit_CudaCollocatedGrad(data->d_interp_1d, data->d_collo_grad_1d, P_1d, Q_1d, &data->c_B, &data->c_G));
} else {
CeedCallBackend(CeedInit_CudaGrad(data->d_interp_1d, data->d_grad_1d, P_1d, Q_1d, &data->c_B, &data->c_G));
d_grad_1d = data->d_collo_grad_1d;
}
void *grad_args[] = {(void *)&num_elem, &data->c_B, &data->c_G, &d_u, &d_v};
void *grad_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_grad_1d, &d_u, &d_v};

if (dim == 1) {
// avoid >512 total threads
CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1));
Expand Down Expand Up @@ -328,8 +320,7 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);

CeedCallBackend(CeedInit_CudaInterp(data->d_chebyshev_interp_1d, P_1d, Q_1d, &data->c_B));
void *interp_args[] = {(void *)&num_elem, &data->c_B, &data->d_points_per_elem, &d_x, &d_u, &d_v};
void *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};

if (dim == 1) {
// avoid >512 total threads
Expand Down Expand Up @@ -364,7 +355,6 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);

CeedCallBackend(CeedInit_CudaInterp(data->d_chebyshev_interp_1d, P_1d, Q_1d, &data->c_B));
void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};

if (dim == 1) {
Expand Down Expand Up @@ -456,8 +446,7 @@ static int CeedBasisApplyNonTensorCore_Cuda_shared(CeedBasis basis, bool apply_a
CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
CeedInt thread = CeedIntMax(Q, P);

CeedCallBackend(CeedInit_CudaInterp(data->d_interp_1d, P, Q, &data->c_B));
void *interp_args[] = {(void *)&num_elem, &data->c_B, &d_u, &d_v};
void *interp_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_u, &d_v};

{
// avoid >512 total threads
Expand All @@ -480,8 +469,7 @@ static int CeedBasisApplyNonTensorCore_Cuda_shared(CeedBasis basis, bool apply_a
CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
CeedInt thread = CeedIntMax(Q, P);

CeedCallBackend(CeedInit_CudaInterp(data->d_grad_1d, P, Q * dim, &data->c_G));
void *grad_args[] = {(void *)&num_elem, &data->c_G, &d_u, &d_v};
void *grad_args[] = {(void *)&num_elem, &data->d_grad_1d, &d_u, &d_v};

{
// avoid >512 total threads
Expand Down Expand Up @@ -641,6 +629,10 @@ int CeedBasisCreateH1_Cuda_shared(CeedElemTopology topo, CeedInt dim, CeedInt nu
CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
CeedCallBackend(CeedCalloc(1, &data));

// Check max sizes
CeedCheck(dim <= 3, ceed, CEED_ERROR_BACKEND, "Backend does not implement nontensor bases with dim > 3");
CeedCheck(num_nodes * num_qpts * dim < 52 * 52 * 3, ceed, CEED_ERROR_BACKEND, "Backend does not implement nontensor bases with P * Q this large");

// Copy basis data to GPU
CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad));
Expand Down
53 changes: 0 additions & 53 deletions backends/cuda-shared/kernels/cuda-shared-basis.cu

This file was deleted.

1 change: 1 addition & 0 deletions backends/hip-shared/ceed-hip-shared-basis.c
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add,
d_grad_1d = data->d_collo_grad_1d;
}
void *grad_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_grad_1d, &d_u, &d_v};

if (dim == 1) {
CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
Expand Down
48 changes: 42 additions & 6 deletions include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,15 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B,
CeedScalar r_U[BASIS_NUM_COMP];
CeedScalar r_V[BASIS_NUM_COMP];

// load interp into shared memory
__shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
__syncthreads();

// Apply basis element by element
for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
InterpNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, c_B, r_V);
InterpNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, s_B, r_V);
WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
}
}
Expand All @@ -49,9 +55,15 @@ extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedSca
CeedScalar r_U[BASIS_NUM_COMP];
CeedScalar r_V[BASIS_NUM_COMP];

// load interp into shared memory
__shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
__syncthreads();

// Apply basis element by element
for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, c_B, r_V);
InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, s_B, r_V);
WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
}
}
Expand All @@ -70,9 +82,15 @@ extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const Ceed
CeedScalar r_U[BASIS_NUM_COMP];
CeedScalar r_V[BASIS_NUM_COMP];

// load interp into shared memory
__shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
__syncthreads();

// Apply basis element by element
for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, c_B, r_V);
InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, s_B, r_V);
SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
}
}
Expand All @@ -93,9 +111,15 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_G, c
CeedScalar r_U[BASIS_NUM_COMP];
CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];

// load grad into shared memory
__shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
__syncthreads();

// Apply basis element by element
for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
GradNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, c_G, r_V);
GradNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, s_G, r_V);
WriteElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
}
}
Expand All @@ -114,9 +138,15 @@ extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScala
CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
CeedScalar r_V[BASIS_NUM_COMP];

// load grad into shared memory
__shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
__syncthreads();

// Apply basis element by element
for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, c_G, r_V);
GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, s_G, r_V);
WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
}
}
Expand All @@ -135,9 +165,15 @@ extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedSc
CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
CeedScalar r_V[BASIS_NUM_COMP];

// load grad into shared memory
__shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
__syncthreads();

// Apply basis element by element
for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, c_G, r_V);
GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, s_G, r_V);
SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,14 @@
/// Internal header for CUDA shared memory basis read/write templates
#include <ceed/types.h>

//------------------------------------------------------------------------------
// Load matrices for basis actions
//------------------------------------------------------------------------------
template <int P, int Q>
inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i];
}

//------------------------------------------------------------------------------
// 1D
//------------------------------------------------------------------------------
Expand Down
Loading

0 comments on commit 324c97e

Please sign in to comment.