From f5cd8712ccb474e8716657c2a5c6fe39e798853e Mon Sep 17 00:00:00 2001 From: Jeremy L Thompson Date: Mon, 12 Aug 2024 15:39:24 -0600 Subject: [PATCH] basis - add GPU ApplyAddAtPoints --- backends/cuda-ref/ceed-cuda-ref-basis.c | 22 +++++++++++++++---- backends/cuda-shared/ceed-cuda-shared-basis.c | 21 ++++++++++++++---- backends/hip-ref/ceed-hip-ref-basis.c | 22 +++++++++++++++---- backends/hip-shared/ceed-hip-shared-basis.c | 22 +++++++++++++++---- .../cuda/cuda-ref-basis-tensor-at-points.h | 6 +++-- .../hip/hip-ref-basis-tensor-at-points.h | 6 +++-- interface/ceed-basis.c | 1 + 7 files changed, 80 insertions(+), 20 deletions(-) diff --git a/backends/cuda-ref/ceed-cuda-ref-basis.c b/backends/cuda-ref/ceed-cuda-ref-basis.c index 09637e2713..8cf285cbc8 100644 --- a/backends/cuda-ref/ceed-cuda-ref-basis.c +++ b/backends/cuda-ref/ceed-cuda-ref-basis.c @@ -100,8 +100,8 @@ static int CeedBasisApplyAdd_Cuda(CeedBasis basis, const CeedInt num_elem, CeedT //------------------------------------------------------------------------------ // Basis apply - tensor AtPoints //------------------------------------------------------------------------------ -static int CeedBasisApplyAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, - CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { +static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points, + CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { Ceed ceed; CeedInt Q_1d, dim, max_num_points = num_points[0]; const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; @@ -172,10 +172,11 @@ static int CeedBasisApplyAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x)); if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Clear v for transpose operation - if (is_transpose) { + if (is_transpose && !apply_add) { CeedSize length; CeedCallBackend(CeedVectorGetLength(v, &length)); @@ -214,6 +215,18 @@ static int CeedBasisApplyAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, return CEED_ERROR_SUCCESS; } +static int CeedBasisApplyAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAddAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Basis apply - non-tensor //------------------------------------------------------------------------------ @@ -403,6 +416,7 @@ int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda)); return CEED_ERROR_SUCCESS; } diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c index bccf4d4e39..4f0901484d 100644 --- a/backends/cuda-shared/ceed-cuda-shared-basis.c +++ b/backends/cuda-shared/ceed-cuda-shared-basis.c @@ -206,8 +206,8 @@ static int CeedBasisApplyAddTensor_Cuda_shared(CeedBasis basis, const CeedInt nu //------------------------------------------------------------------------------ // Basis apply - tensor AtPoints //------------------------------------------------------------------------------ -static int CeedBasisApplyAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, - CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { +static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points, + CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { Ceed ceed; CeedInt Q_1d, dim, max_num_points = num_points[0]; const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; @@ -278,10 +278,11 @@ static int CeedBasisApplyAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x)); if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Clear v for transpose operation - if (is_transpose) { + if (is_transpose && !apply_add) { CeedSize length; CeedCallBackend(CeedVectorGetLength(v, &length)); @@ -320,6 +321,18 @@ static int CeedBasisApplyAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num return CEED_ERROR_SUCCESS; } +static int CeedBasisApplyAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda_shared(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAddAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda_shared(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Destroy basis //------------------------------------------------------------------------------ diff --git a/backends/hip-ref/ceed-hip-ref-basis.c b/backends/hip-ref/ceed-hip-ref-basis.c index 633a41adff..78e27a1a5f 100644 --- a/backends/hip-ref/ceed-hip-ref-basis.c +++ b/backends/hip-ref/ceed-hip-ref-basis.c @@ -98,8 +98,8 @@ static int CeedBasisApplyAdd_Hip(CeedBasis basis, const CeedInt num_elem, CeedTr //------------------------------------------------------------------------------ // Basis apply - tensor AtPoints //------------------------------------------------------------------------------ -static int CeedBasisApplyAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, - CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { +static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points, + CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { Ceed ceed; CeedInt Q_1d, dim, max_num_points = num_points[0]; const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; @@ -170,10 +170,11 @@ static int CeedBasisApplyAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, c CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x)); if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Clear v for transpose operation - if (is_transpose) { + if (is_transpose && !apply_add) { CeedSize length; CeedCallBackend(CeedVectorGetLength(v, &length)); @@ -212,6 +213,18 @@ static int CeedBasisApplyAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, c return CEED_ERROR_SUCCESS; } +static int CeedBasisApplyAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyAtPointsCore_Hip(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAddAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, + CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyAtPointsCore_Hip(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Basis apply - non-tensor //------------------------------------------------------------------------------ @@ -401,6 +414,7 @@ int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const C CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip)); return CEED_ERROR_SUCCESS; } diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c index 29f4cf54b0..f88a20e6d1 100644 --- a/backends/hip-shared/ceed-hip-shared-basis.c +++ b/backends/hip-shared/ceed-hip-shared-basis.c @@ -265,8 +265,8 @@ int CeedBasisApplyAddTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, //------------------------------------------------------------------------------ // Basis apply - tensor AtPoints //------------------------------------------------------------------------------ -static int CeedBasisApplyAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, - CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { +static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points, + CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { Ceed ceed; CeedInt Q_1d, dim, max_num_points = num_points[0]; const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; @@ -337,10 +337,11 @@ static int CeedBasisApplyAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_ CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x)); if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); + else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Clear v for transpose operation - if (is_transpose) { + if (is_transpose && !apply_add) { CeedSize length; CeedCallBackend(CeedVectorGetLength(v, &length)); @@ -379,6 +380,18 @@ static int CeedBasisApplyAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_ return CEED_ERROR_SUCCESS; } +static int CeedBasisApplyAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + CeedCallBackend(CeedBasisApplyAtPointsCore_Hip_shared(basis, false, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + +static int CeedBasisApplyAddAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, + CeedVector u, CeedVector v) { + CeedCallBackend(CeedBasisApplyAtPointsCore_Hip_shared(basis, true, num_elem, t_mode, eval_mode, u, v)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Destroy basis //------------------------------------------------------------------------------ @@ -469,6 +482,7 @@ int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Hip_shared)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddTensor_Hip_shared)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Hip_shared)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Hip_shared)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip_shared)); return CEED_ERROR_SUCCESS; } diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h index fc65a10912..01d144e669 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h +++ b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h @@ -124,7 +124,8 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt CeedScalar v_k = 0; for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; - out[k] = v_k; + if (d == BASIS_DIM - 1) out[k] += v_k; + else out[k] = v_k; } post *= P; } @@ -283,7 +284,8 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is CeedScalar v_k = 0; for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; - out[k] = v_k; + if (d == BASIS_DIM - 1) out[k] += v_k; + else out[k] = v_k; } post *= P; } diff --git a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h index fc65a10912..01d144e669 100644 --- a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h +++ b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h @@ -124,7 +124,8 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt CeedScalar v_k = 0; for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; - out[k] = v_k; + if (d == BASIS_DIM - 1) out[k] += v_k; + else out[k] = v_k; } post *= P; } @@ -283,7 +284,8 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is CeedScalar v_k = 0; for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; - out[k] = v_k; + if (d == BASIS_DIM - 1) out[k] += v_k; + else out[k] = v_k; } post *= P; } diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c index 23e05781eb..b7bbf67b87 100644 --- a/interface/ceed-basis.c +++ b/interface/ceed-basis.c @@ -1950,6 +1950,7 @@ int CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_elem, const CeedInt *num **/ int CeedBasisApplyAddAtPoints(CeedBasis basis, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { + CeedCheck(t_mode == CEED_TRANSPOSE, CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED, "CeedBasisApplyAddAtPoints only supports CEED_TRANSPOSE"); CeedCall(CeedBasisApplyAtPointsCheckDims(basis, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); if (basis->ApplyAddAtPoints) { CeedCall(basis->ApplyAddAtPoints(basis, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));