diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c index 118d156a84..bccf4d4e39 100644 --- a/backends/cuda-shared/ceed-cuda-shared-basis.c +++ b/backends/cuda-shared/ceed-cuda-shared-basis.c @@ -27,8 +27,8 @@ int CeedInit_CudaCollocatedGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, //------------------------------------------------------------------------------ // Apply basis //------------------------------------------------------------------------------ -int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, - CeedVector v) { +static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector u, CeedVector v) { Ceed ceed; Ceed_Cuda *ceed_Cuda; CeedInt dim, num_comp; @@ -45,7 +45,8 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce // Get read/write access to u, v if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Apply basis operation switch (eval_mode) { @@ -66,7 +67,8 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->InterpTranspose, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args)); + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, 1, + elems_per_block, shared_mem, interp_args)); } else { CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args)); } @@ -78,8 +80,8 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend( - CeedRunKernelDimShared_Cuda(ceed, data->InterpTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, thread_1d, + elems_per_block, shared_mem, interp_args)); } else { CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); } @@ -89,8 +91,8 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend( - CeedRunKernelDimShared_Cuda(ceed, data->InterpTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, thread_1d, + elems_per_block, shared_mem, interp_args)); } else { CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); } @@ -116,7 +118,8 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradTranspose, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args)); + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, 1, + elems_per_block, shared_mem, grad_args)); } else { CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args)); } @@ -128,7 +131,8 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, thread_1d, + elems_per_block, shared_mem, grad_args)); } else { CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); } @@ -138,7 +142,8 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, thread_1d, + elems_per_block, shared_mem, grad_args)); } else { CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); } @@ -186,11 +191,23 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce return CEED_ERROR_SUCCESS; } +static int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyTensorCore_Cuda_shared(basis, false, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAddTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, + CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyTensorCore_Cuda_shared(basis, true, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Basis apply - tensor AtPoints //------------------------------------------------------------------------------ -int CeedBasisApplyAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, - CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { +static int CeedBasisApplyAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { Ceed ceed; CeedInt Q_1d, dim, max_num_points = num_points[0]; const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; @@ -374,8 +391,10 @@ int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_HAS_COLLOCATED_GRAD", has_collocated_grad)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTranspose", &data->GradTranspose)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTransposeAdd", &data->GradTransposeAdd)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); CeedCallBackend(CeedFree(&basis_kernel_path)); CeedCallBackend(CeedFree(&basis_kernel_source)); @@ -384,6 +403,7 @@ int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Cuda_shared)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddTensor_Cuda_shared)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Cuda_shared)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda_shared)); return CEED_ERROR_SUCCESS; diff --git a/backends/cuda-shared/ceed-cuda-shared.h b/backends/cuda-shared/ceed-cuda-shared.h index 40bbaed6a0..96800a0a86 100644 --- a/backends/cuda-shared/ceed-cuda-shared.h +++ b/backends/cuda-shared/ceed-cuda-shared.h @@ -14,8 +14,10 @@ typedef struct { CUmodule module; CUfunction Interp; CUfunction InterpTranspose; + CUfunction InterpTransposeAdd; CUfunction Grad; CUfunction GradTranspose; + CUfunction GradTransposeAdd; CUfunction Weight; CUmodule moduleAtPoints; CeedInt num_points; diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c index ff41f9efe6..29f4cf54b0 100644 --- a/backends/hip-shared/ceed-hip-shared-basis.c +++ b/backends/hip-shared/ceed-hip-shared-basis.c @@ -87,8 +87,8 @@ static int ComputeBasisThreadBlockSizes(const CeedInt dim, const CeedInt P_1d, c //------------------------------------------------------------------------------ // Apply basis //------------------------------------------------------------------------------ -int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, - CeedVector v) { +static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector u, CeedVector v) { Ceed ceed; Ceed_Hip *ceed_Hip; CeedInt dim, num_comp; @@ -105,7 +105,8 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee // Get read/write access to u, v if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Apply basis operation switch (eval_mode) { @@ -125,7 +126,8 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->InterpTranspose, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args)); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, 1, + elems_per_block, shared_mem, interp_args)); } else { CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args)); } @@ -136,8 +138,8 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend( - CeedRunKernelDimShared_Hip(ceed, data->InterpTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, thread_1d, + elems_per_block, shared_mem, interp_args)); } else { CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); } @@ -147,8 +149,8 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend( - CeedRunKernelDimShared_Hip(ceed, data->InterpTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, thread_1d, + elems_per_block, shared_mem, interp_args)); } else { CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); } @@ -174,7 +176,8 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradTranspose, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args)); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, 1, + elems_per_block, shared_mem, grad_args)); } else { CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args)); } @@ -185,7 +188,8 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, thread_1d, + elems_per_block, shared_mem, grad_args)); } else { CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); } @@ -195,7 +199,8 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, thread_1d, + elems_per_block, shared_mem, grad_args)); } else { CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); } @@ -245,11 +250,23 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee return CEED_ERROR_SUCCESS; } +int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyTensorCore_Hip_shared(basis, false, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + +int CeedBasisApplyAddTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyTensorCore_Hip_shared(basis, true, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Basis apply - tensor AtPoints //------------------------------------------------------------------------------ -int CeedBasisApplyAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, - CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { +static int CeedBasisApplyAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { Ceed ceed; CeedInt Q_1d, dim, max_num_points = num_points[0]; const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; @@ -438,8 +455,10 @@ int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, has_collocated_grad)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); + CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Grad", &data->Grad)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "GradTranspose", &data->GradTranspose)); + CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "GradTransposeAdd", &data->GradTransposeAdd)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight)); CeedCallBackend(CeedFree(&basis_kernel_path)); CeedCallBackend(CeedFree(&basis_kernel_source)); @@ -448,6 +467,7 @@ int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Hip_shared)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddTensor_Hip_shared)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Hip_shared)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip_shared)); return CEED_ERROR_SUCCESS; diff --git a/backends/hip-shared/ceed-hip-shared.h b/backends/hip-shared/ceed-hip-shared.h index 8bc9d041a2..fe3384f55d 100644 --- a/backends/hip-shared/ceed-hip-shared.h +++ b/backends/hip-shared/ceed-hip-shared.h @@ -14,8 +14,10 @@ typedef struct { hipModule_t module; hipFunction_t Interp; hipFunction_t InterpTranspose; + hipFunction_t InterpTransposeAdd; hipFunction_t Grad; hipFunction_t GradTranspose; + hipFunction_t GradTransposeAdd; hipFunction_t Weight; hipModule_t moduleAtPoints; CeedInt num_points; diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h index b10ba108f8..56234c28e4 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h @@ -46,6 +46,19 @@ inline __device__ void WriteElementStrided1d(SharedData_Cuda &data, const CeedIn } } +template +inline __device__ void SumElementStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { + if (data.t_id_x < P_1D) { + const CeedInt node = data.t_id_x; + const CeedInt ind = node * strides_node + elem * strides_elem; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + d_v[ind + comp * strides_comp] += r_v[comp]; + } + } +} + //------------------------------------------------------------------------------ // 2D //------------------------------------------------------------------------------ @@ -82,6 +95,19 @@ inline __device__ void WriteElementStrided2d(SharedData_Cuda &data, const CeedIn } } +template +inline __device__ void SumElementStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { + if (data.t_id_x < P_1D && data.t_id_y < P_1D) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1D; + const CeedInt ind = node * strides_node + elem * strides_elem; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + d_v[ind + comp * strides_comp] += r_v[comp]; + } + } +} + //------------------------------------------------------------------------------ // 3D //------------------------------------------------------------------------------ @@ -121,3 +147,18 @@ inline __device__ void WriteElementStrided3d(SharedData_Cuda &data, const CeedIn } } } + +template +inline __device__ void SumElementStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { + if (data.t_id_x < P_1D && data.t_id_y < P_1D) { + for (CeedInt z = 0; z < P_1D; z++) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; + const CeedInt ind = node * strides_node + elem * strides_elem; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + d_v[ind + comp * strides_comp] += r_v[z + comp * P_1D]; + } + } + } +} diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h index d6039d3a33..c295362978 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h @@ -81,6 +81,39 @@ extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedSca } } +extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, + CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Cuda data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + + CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; + + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); + InterpTranspose1d(data, r_U, c_B, r_V); + SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + InterpTransposeTensor2d(data, r_U, c_B, r_V); + SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + InterpTransposeTensor3d(data, r_U, c_B, r_V); + SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } + } +} + //------------------------------------------------------------------------------ // Grad kernel by dim //------------------------------------------------------------------------------ @@ -154,6 +187,41 @@ extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScala } } +extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, + CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Cuda data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + + CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; + + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); + GradTranspose1d(data, r_U, c_B, c_G, r_V); + SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, + r_U); + GradTransposeTensor2d(data, r_U, c_B, c_G, r_V); + SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d(data, r_U, c_B, c_G, r_V); + else GradTransposeTensor3d(data, r_U, c_B, c_G, r_V); + SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } + } +} + //------------------------------------------------------------------------------ // Weight kernels by dim //------------------------------------------------------------------------------ diff --git a/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h index a6d945ac56..379d52d13b 100644 --- a/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h +++ b/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h @@ -56,6 +56,19 @@ inline __device__ void WriteElementStrided1d(SharedData_Hip &data, const CeedInt } } +template +inline __device__ void SumElementStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { + if (data.t_id_x < P_1D) { + const CeedInt node = data.t_id_x; + const CeedInt ind = node * strides_node + elem * strides_elem; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + d_v[ind + comp * strides_comp] += r_v[comp]; + } + } +} + //------------------------------------------------------------------------------ // 2D //------------------------------------------------------------------------------ @@ -92,6 +105,19 @@ inline __device__ void WriteElementStrided2d(SharedData_Hip &data, const CeedInt } } +template +inline __device__ void SumElementStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { + if (data.t_id_x < P_1D && data.t_id_y < P_1D) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1D; + const CeedInt ind = node * strides_node + elem * strides_elem; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + d_v[ind + comp * strides_comp] += r_v[comp]; + } + } +} + //------------------------------------------------------------------------------ // 3D //------------------------------------------------------------------------------ @@ -131,3 +157,18 @@ inline __device__ void WriteElementStrided3d(SharedData_Hip &data, const CeedInt } } } + +template +inline __device__ void SumElementStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { + if (data.t_id_x < P_1D && data.t_id_y < P_1D) { + for (CeedInt z = 0; z < P_1D; z++) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; + const CeedInt ind = node * strides_node + elem * strides_elem; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + d_v[ind + comp * strides_comp] += r_v[z + comp * P_1D]; + } + } + } +} diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h index 0a9a1f3cee..aef40d8a43 100644 --- a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h +++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h @@ -92,6 +92,44 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ } } +extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ + void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + // load interp_1d into shared memory + __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; + loadMatrix(d_interp_1d, s_B); + __syncthreads(); + + SharedData_Hip data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + + CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; + + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); + InterpTranspose1d(data, r_U, s_B, r_V); + SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + InterpTransposeTensor2d(data, r_U, s_B, r_V); + SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + InterpTransposeTensor3d(data, r_U, s_B, r_V); + SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } + } +} + //------------------------------------------------------------------------------ // Grad kernel by dim //------------------------------------------------------------------------------ @@ -181,6 +219,49 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ } } +extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ + void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *d_grad_1d, const CeedScalar *__restrict__ d_U, + CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + // load interp_1d and grad_1d into shared memory + __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; + loadMatrix(d_interp_1d, s_B); + __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)]; + loadMatrix(d_grad_1d, s_G); + __syncthreads(); + + SharedData_Hip data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + + CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; + + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); + GradTranspose1d(data, r_U, s_B, s_G, r_V); + SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, + r_U); + GradTransposeTensor2d(data, r_U, s_B, s_G, r_V); + SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); + if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d(data, r_U, s_B, s_G, r_V); + else GradTransposeTensor3d(data, r_U, s_B, s_G, r_V); + SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } + } +} + //------------------------------------------------------------------------------ // Weight kernels by dim //------------------------------------------------------------------------------