diff --git a/Makefile b/Makefile
index fbb9ba415c..7c9174a6cb 100644
--- a/Makefile
+++ b/Makefile
@@ -275,8 +275,6 @@ cuda-gen.cu := $(sort $(wildcard backends/cuda-gen/kernels/*.cu))
 occa.cpp := $(sort $(shell find backends/occa -type f -name *.cpp))
 magma.c := $(sort $(wildcard backends/magma/*.c))
 magma.cpp := $(sort $(wildcard backends/magma/*.cpp))
-magma.cu := $(sort $(wildcard backends/magma/kernels/cuda/*.cu))
-magma.hip := $(sort $(wildcard backends/magma/kernels/hip/*.hip.cpp))
 hip.c := $(sort $(wildcard backends/hip/*.c))
 hip.cpp := $(sort $(wildcard backends/hip/*.cpp))
 hip-ref.c := $(sort $(wildcard backends/hip-ref/*.c))
@@ -491,10 +489,8 @@ ifneq ($(wildcard $(MAGMA_DIR)/lib/libmagma.*),)
   PKG_LIBS += $(magma_link)
   libceed.c += $(magma.c)
   libceed.cpp += $(magma.cpp)
-  libceed.cu += $(magma.cu)
   $(magma.c:%.c=$(OBJDIR)/%.o) $(magma.c:%=%.tidy) : CPPFLAGS += -DADD_ -I$(MAGMA_DIR)/include -I$(CUDA_DIR)/include
   $(magma.cpp:%.cpp=$(OBJDIR)/%.o) $(magma.cpp:%=%.tidy) : CPPFLAGS += -DADD_ -I$(MAGMA_DIR)/include -I$(CUDA_DIR)/include
-  $(magma.cu:%.cu=$(OBJDIR)/%.o) : CPPFLAGS += --compiler-options=-fPIC -DADD_ -I$(MAGMA_DIR)/include -I$(MAGMA_DIR)/magmablas -I$(CUDA_DIR)/include
   MAGMA_BACKENDS = /gpu/cuda/magma /gpu/cuda/magma/det
   endif
 else  # HIP MAGMA
@@ -507,10 +503,8 @@ ifneq ($(wildcard $(MAGMA_DIR)/lib/libmagma.*),)
   PKG_LIBS += $(magma_link)
   libceed.c += $(magma.c)
   libceed.cpp += $(magma.cpp)
-  libceed.hip += $(magma.hip)
   $(magma.c:%.c=$(OBJDIR)/%.o) $(magma.c:%=%.tidy) : CPPFLAGS += $(HIPCONFIG_CPPFLAGS) -I$(MAGMA_DIR)/include -I$(ROCM_DIR)/include -DCEED_MAGMA_USE_HIP -DADD_
   $(magma.cpp:%.cpp=$(OBJDIR)/%.o) $(magma.cpp:%=%.tidy) : CPPFLAGS += $(HIPCONFIG_CPPFLAGS) -I$(MAGMA_DIR)/include -I$(ROCM_DIR)/include -DCEED_MAGMA_USE_HIP -DADD_
-  $(magma.hip:%.hip.cpp=$(OBJDIR)/%.o) : CPPFLAGS += -I$(MAGMA_DIR)/include -I$(MAGMA_DIR)/magmablas -I$(ROCM_DIR)/include -DCEED_MAGMA_USE_HIP -DADD_
   MAGMA_BACKENDS = /gpu/hip/magma /gpu/hip/magma/det
   endif
 endif
diff --git a/backends/magma/ceed-magma-basis.c b/backends/magma/ceed-magma-basis.c
index 3cb651f274..7023544b1b 100644
--- a/backends/magma/ceed-magma-basis.c
+++ b/backends/magma/ceed-magma-basis.c
@@ -20,57 +20,56 @@
 #include "ceed-magma-common.h"
 #include "ceed-magma.h"

-#ifdef __cplusplus
-CEED_INTERN "C"
-#endif
-    int
-    CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector U, CeedVector V) {
+#include "ceed-magma-gemm-nontensor.h"
+#include "ceed-magma-gemm-selector.h"
+
+//------------------------------------------------------------------------------
+// Basis apply - tensor
+//------------------------------------------------------------------------------
+static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u, CeedVector v) {
   Ceed ceed;
   Ceed_Magma *data;
-  CeedInt dim, num_comp, num_dof, P_1d, Q_1d;
-  const CeedScalar *du;
-  CeedScalar *dv;
+  CeedInt dim, num_comp, num_nodes, P_1d, Q_1d, P, Q;
+  const CeedScalar *d_u;
+  CeedScalar *d_v;
   CeedBasis_Magma *impl;

   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
-  CeedCallBackend(CeedBasisGetDimension(basis, &dim));
-  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-  CeedCallBackend(CeedBasisGetNumNodes(basis, &num_dof));
-  CeedCallBackend(CeedGetData(ceed, &data));
-
-  if (U != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du));
-  else CeedCheck(e_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &dv));
-  CeedCallBackend(CeedBasisGetData(basis, &impl));
-
+  CeedCallBackend(CeedBasisGetDimension(basis, &dim));
+  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+  CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes));
   CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
+  P = P_1d;
+  Q = Q_1d;
+  if (t_mode == CEED_TRANSPOSE) {
+    P = Q_1d;
+    Q = P_1d;
+  }

-  CeedDebug256(ceed, 4, "[CeedBasisApply_Magma] vsize=%" CeedInt_FMT ", comp = %" CeedInt_FMT, num_comp * CeedIntPow(P_1d, dim), num_comp);
+  // Read vectors
+  if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
+  else CeedCheck(e_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
+  CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));

+  // Clear v for transpose operation
   if (t_mode == CEED_TRANSPOSE) {
     CeedSize length;

-    CeedCallBackend(CeedVectorGetLength(V, &length));
+    CeedCallBackend(CeedVectorGetLength(v, &length));
     if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
-      magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *)dv, length, data->queue);
+      magmablas_slaset(MagmaFull, length, 1, 0.0, 0.0, (float *)d_v, length, data->queue);
     } else {
-      magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *)dv, length, data->queue);
+      magmablas_dlaset(MagmaFull, length, 1, 0.0, 0.0, (double *)d_v, length, data->queue);
     }
     ceed_magma_queue_sync(data->queue);
   }

+  // Apply basis operation
   switch (e_mode) {
     case CEED_EVAL_INTERP: {
-      CeedInt P = P_1d, Q = Q_1d;
-
-      if (t_mode == CEED_TRANSPOSE) {
-        P = Q_1d;
-        Q = P_1d;
-      }
-
       // Define element sizes for dofs/quad
       CeedInt elem_qpts_size = CeedIntPow(Q_1d, dim);
       CeedInt elem_dofs_size = CeedIntPow(P_1d, dim);
@@ -81,7 +80,7 @@
       //      node                            node

       // --- Define strides for NOTRANSPOSE mode: ---
-      // Input (du) is E-vector, output (dv) is Q-vector
+      // Input (d_u) is E-vector, output (d_v) is Q-vector

       // Element strides
       CeedInt u_elem_stride = elem_dofs_size;
@@ -89,10 +88,8 @@
       // Component strides
       CeedInt u_comp_stride = num_elem * elem_dofs_size;
       CeedInt v_comp_stride = num_elem * elem_qpts_size;
-
-      // --- Swap strides for TRANSPOSE mode: ---
       if (t_mode == CEED_TRANSPOSE) {
-        // Input (du) is Q-vector, output (dv) is E-vector
+        // Input (d_u) is Q-vector, output (d_v) is E-vector
         // Element strides
         v_elem_stride = elem_dofs_size;
         u_elem_stride = elem_qpts_size;
@@ -115,42 +112,37 @@
         case 2:
           num_threads = max_P_Q;
           num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_2D);
-          shared_mem += P * Q * sizeof(CeedScalar);                      // for sT
-          shared_mem += num_t_col * (P * max_P_Q * sizeof(CeedScalar));  // for reforming rU we need PxP, and for the intermediate output we need PxQ
+          shared_mem += P * Q * sizeof(CeedScalar);  // for sT
+          // for reforming rU we need P x P, and for the intermediate output we need P x Q
+          shared_mem += num_t_col * (P * max_P_Q * sizeof(CeedScalar));
           break;
         case 3:
           num_threads = max_P_Q * max_P_Q;
           num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_3D);
           shared_mem += sizeof(CeedScalar) * (P * Q);  // for sT
-          shared_mem += sizeof(CeedScalar) * num_t_col *
-                        (CeedIntMax(P * P * max_P_Q,
-                                    P * Q * Q));  // rU needs P^2xP, the intermediate output needs max(P^2xQ,PQ^2)
+          // rU needs P^2 x P, the intermediate output needs max(P^2 x Q, P x Q^2)
+          shared_mem += sizeof(CeedScalar) * num_t_col * (CeedIntMax(P * P * max_P_Q, P * Q * Q));
+          break;
       }
-      CeedInt grid = (num_elem + num_t_col - 1) / num_t_col;
-      void *args[] = {&impl->d_interp_1d, &du, &u_elem_stride, &u_comp_stride, &dv, &v_elem_stride, &v_comp_stride, &num_elem};
+      CeedInt grid = CeedDivUpInt(num_elem, num_t_col);
+      void *args[] = {&impl->d_interp_1d, &d_u, &u_elem_stride, &u_comp_stride, &d_v, &v_elem_stride, &v_comp_stride, &num_elem};

       if (t_mode == CEED_TRANSPOSE) {
-        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_interp_tr, grid, num_threads, num_t_col, 1, shared_mem, args));
+        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->InterpTranspose, grid, num_threads, num_t_col, 1, shared_mem, args));
       } else {
-        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_interp, grid, num_threads, num_t_col, 1, shared_mem, args));
+        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Interp, grid, num_threads, num_t_col, 1, shared_mem, args));
       }
     } break;
     case CEED_EVAL_GRAD: {
-      CeedInt P = P_1d, Q = Q_1d;
-
-      // In CEED_NOTRANSPOSE mode:
-      //   du is (P^dim x nc), column-major layout (nc = num_comp)
-      //   dv is (Q^dim x nc x dim), column-major layout (nc = num_comp)
-      // In CEED_TRANSPOSE mode, the sizes of du and dv are switched.
-      if (t_mode == CEED_TRANSPOSE) {
-        P = Q_1d;
-        Q = P_1d;
-      }
-
       // Define element sizes for dofs/quad
       CeedInt elem_qpts_size = CeedIntPow(Q_1d, dim);
       CeedInt elem_dofs_size = CeedIntPow(P_1d, dim);

+      // In CEED_NOTRANSPOSE mode:
+      //   d_u is (P^dim x nc), column-major layout (nc = num_comp)
+      //   d_v is (Q^dim x nc x dim), column-major layout (nc = num_comp)
+      // In CEED_TRANSPOSE mode, the sizes of d_u and d_v are switched.
+
       // E-vector ordering -------------- Q-vector ordering
       //                                   dim
       //  component                        component
@@ -158,7 +150,7 @@
       //      node                            node

       // --- Define strides for NOTRANSPOSE mode: ---
-      // Input (du) is E-vector, output (dv) is Q-vector
+      // Input (d_u) is E-vector, output (d_v) is Q-vector

       // Element strides
       CeedInt u_elem_stride = elem_dofs_size;
@@ -169,10 +161,8 @@
       // Dimension strides
       CeedInt u_dim_stride = 0;
       CeedInt v_dim_stride = num_elem * elem_qpts_size * num_comp;
-
-      // --- Swap strides for TRANSPOSE mode: ---
       if (t_mode == CEED_TRANSPOSE) {
-        // Input (du) is Q-vector, output (dv) is E-vector
+        // Input (d_u) is Q-vector, output (d_v) is E-vector
         // Element strides
         v_elem_stride = elem_dofs_size;
         u_elem_stride = elem_qpts_size;
@@ -198,30 +188,30 @@
         case 2:
           num_threads = max_P_Q;
           num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_2D);
-          shared_mem += sizeof(CeedScalar) * 2 * P * Q;                  // for sTinterp and sTgrad
-          shared_mem += sizeof(CeedScalar) * num_t_col * (P * max_P_Q);  // for reforming rU we need PxP, and for the intermediate output we need PxQ
+          shared_mem += sizeof(CeedScalar) * 2 * P * Q;  // for sTinterp and sTgrad
+          // for reforming rU we need P x P, and for the intermediate output we need P x Q
+          shared_mem += sizeof(CeedScalar) * num_t_col * (P * max_P_Q);
           break;
         case 3:
           num_threads = max_P_Q * max_P_Q;
           num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_3D);
           shared_mem += sizeof(CeedScalar) * 2 * P * Q;  // for sTinterp and sTgrad
-          shared_mem += sizeof(CeedScalar) * num_t_col *
-                        CeedIntMax(P * P * P,
-                                   (P * P * Q) + (P * Q * Q));  // rU needs P^2xP, the intermediate outputs need (P^2.Q + P.Q^2)
+          // rU needs P^2 x P, the intermediate outputs need (P^2 x Q + P x Q^2)
+          shared_mem += sizeof(CeedScalar) * num_t_col * CeedIntMax(P * P * P, (P * P * Q) + (P * Q * Q));
+          break;
       }
-      CeedInt grid = (num_elem + num_t_col - 1) / num_t_col;
-      void *args[] = {&impl->d_interp_1d, &impl->d_grad_1d, &du, &u_elem_stride, &u_comp_stride, &u_dim_stride, &dv,
+      CeedInt grid = CeedDivUpInt(num_elem, num_t_col);
+      void *args[] = {&impl->d_interp_1d, &impl->d_grad_1d, &d_u, &u_elem_stride, &u_comp_stride, &u_dim_stride, &d_v,
                       &v_elem_stride, &v_comp_stride, &v_dim_stride, &num_elem};

       if (t_mode == CEED_TRANSPOSE) {
-        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_grad_tr, grid, num_threads, num_t_col, 1, shared_mem, args));
+        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->GradTranspose, grid, num_threads, num_t_col, 1, shared_mem, args));
       } else {
-        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_grad, grid, num_threads, num_t_col, 1, shared_mem, args));
+        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Grad, grid, num_threads, num_t_col, 1, shared_mem, args));
       }
     } break;
     case CEED_EVAL_WEIGHT: {
-      CeedCheck(t_mode != CEED_TRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT inum_compatible with CEED_TRANSPOSE");
-      CeedInt Q = Q_1d;
+      CeedCheck(t_mode != CEED_TRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
       CeedInt elem_dofs_size = CeedIntPow(Q, dim);
       CeedInt num_threads = 1;
       CeedInt num_t_col = 1;
@@ -243,11 +233,12 @@
           num_threads = Q * Q;
           num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_3D);
           shared_mem += sizeof(CeedScalar) * Q;  // for d_q_weight_1d
+          break;
       }
-      CeedInt grid = (num_elem + num_t_col - 1) / num_t_col;
-      void *args[] = {&impl->d_q_weight_1d, &dv, &elem_dofs_size, &num_elem};
+      CeedInt grid = CeedDivUpInt(num_elem, num_t_col);
+      void *args[] = {&impl->d_q_weight_1d, &d_v, &elem_dofs_size, &num_elem};

-      CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_weight, grid, num_threads, num_t_col, 1, shared_mem, args));
+      CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Weight, grid, num_threads, num_t_col, 1, shared_mem, args));
     } break;
     // LCOV_EXCL_START
     case CEED_EVAL_DIV:
@@ -259,400 +250,370 @@
       // LCOV_EXCL_STOP
   }

-  // must sync to ensure completeness
+  // Must sync to ensure completeness
   ceed_magma_queue_sync(data->queue);

+  // Restore vectors
   if (e_mode != CEED_EVAL_WEIGHT) {
-    CeedCallBackend(CeedVectorRestoreArrayRead(U, &du));
+    CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
   }
-  CeedCallBackend(CeedVectorRestoreArray(V, &dv));
+  CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
   return CEED_ERROR_SUCCESS;
 }

-#ifdef __cplusplus
-CEED_INTERN "C"
-#endif
-    int
-    CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector U, CeedVector V) {
+//------------------------------------------------------------------------------
+// Basis apply - non-tensor
+//------------------------------------------------------------------------------
+static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u,
+                                         CeedVector v) {
   Ceed ceed;
   Ceed_Magma *data;
-  CeedInt dim, num_comp, num_dof, num_qpts, NB = 1;
-  const CeedScalar *du;
-  CeedScalar *dv;
+  CeedInt dim, num_comp, num_nodes, num_qpts, P, Q, N;
+  const CeedScalar *d_u;
+  CeedScalar *d_v;
   CeedBasisNonTensor_Magma *impl;
-  CeedMagmaFunction *interp, *grad;

   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
   CeedCallBackend(CeedGetData(ceed, &data));
-  magma_int_t arch = magma_getdevice_arch();
-
+  CeedCallBackend(CeedBasisGetData(basis, &impl));
   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-  CeedCallBackend(CeedBasisGetNumNodes(basis, &num_dof));
+  CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes));
   CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts));
-  CeedInt P = num_dof, Q = num_qpts, N = num_elem * num_comp;
+  P = num_nodes;
+  Q = num_qpts;
+  N = num_elem * num_comp;

-  if (U != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du));
+  // Read vectors
+  if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
   else CeedCheck(e_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &dv));
-
-  CeedCallBackend(CeedBasisGetData(basis, &impl));
-
-  CeedDebug256(ceed, 4, "[CeedBasisApplyNonTensor_Magma] vsize=%" CeedInt_FMT ", comp = %" CeedInt_FMT, num_comp * num_dof, num_comp);
+  CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));

+  // Clear v for transpose operation
   if (t_mode == CEED_TRANSPOSE) {
     CeedSize length;

-    CeedCallBackend(CeedVectorGetLength(V, &length));
+    CeedCallBackend(CeedVectorGetLength(v, &length));
     if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
-      magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *)dv, length, data->queue);
+      magmablas_slaset(MagmaFull, length, 1, 0.0, 0.0, (float *)d_v, length, data->queue);
     } else {
-      magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *)dv, length, data->queue);
+      magmablas_dlaset(MagmaFull, length, 1, 0.0, 0.0, (double *)d_v, length, data->queue);
     }
     ceed_magma_queue_sync(data->queue);
   }

-  CeedInt n_array[MAGMA_NONTENSOR_KERNEL_INSTANCES] = {MAGMA_NONTENSOR_N_VALUES};
-  CeedInt iN = 0;
-  CeedInt diff = abs(n_array[iN] - N);
+  // Apply basis operation
+  if (e_mode != CEED_EVAL_WEIGHT) {
+    if (P < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) {
+      CeedInt n_array[MAGMA_NONTENSOR_KERNEL_INSTANCES] = {MAGMA_NONTENSOR_KERNEL_N_VALUES};
+      CeedInt iN = 0, diff = abs(n_array[iN] - N), idiff;
+      CeedInt M = (t_mode == CEED_TRANSPOSE) ? P : Q, K = (t_mode == CEED_TRANSPOSE) ? Q : P;
+
+      for (CeedInt in = iN + 1; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
+        idiff = abs(n_array[in] - N);
+        if (idiff < diff) {
+          iN = in;
+          diff = idiff;
+        }
+      }

-  for (CeedInt in = iN + 1; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
-    CeedInt idiff = abs(n_array[in] - N);
-    if (idiff < diff) {
-      iN = in;
-      diff = idiff;
-    }
-  }
+      // Compile kernels for N as needed
+      if (!impl->NB_interp[iN]) {
+        Ceed ceed_delegate;
+        char *interp_kernel_path, *grad_kernel_path, *basis_kernel_source;
+        magma_int_t arch = magma_getdevice_arch();
+
+        // Tuning parameters for NB
+        impl->NB_interp[iN] = nontensor_rtc_get_nb(arch, 'n', 1, P, Q, n_array[iN]);
+        impl->NB_interp_t[iN] = nontensor_rtc_get_nb(arch, 't', 1, P, Q, n_array[iN]);
+        impl->NB_grad[iN] = nontensor_rtc_get_nb(arch, 'n', dim, P, Q, n_array[iN]);
+        impl->NB_grad_t[iN] = nontensor_rtc_get_nb(arch, 't', dim, P, Q, n_array[iN]);
+
+        // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data
+        CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate));
+
+        // Compile kernels
+        CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-interp-nontensor.h", &interp_kernel_path));
+        CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
+        CeedCallBackend(CeedLoadSourceToBuffer(ceed, interp_kernel_path, &basis_kernel_source));
+        CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-grad-nontensor.h", &grad_kernel_path));
+        CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, grad_kernel_path, &basis_kernel_source));
+        CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
+        CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module_interp[iN], 7, "BASIS_DIM", dim, "BASIS_P", P, "BASIS_Q",
+                                         Q, "BASIS_NB_INTERP_N", impl->NB_interp[iN], "BASIS_NB_INTERP_T", impl->NB_interp_t[iN], "BASIS_NB_GRAD_N",
+                                         impl->NB_grad[iN], "BASIS_NB_GRAD_T", impl->NB_grad_t[iN]));
+        CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_interp[iN], "magma_interp_nontensor_n", &impl->Interp[iN]));
+        CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_interp[iN], "magma_interp_nontensor_t", &impl->InterpTranspose[iN]));
+        CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_interp[iN], "magma_grad_nontensor_n", &impl->Grad[iN]));
+        CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_interp[iN], "magma_grad_nontensor_t", &impl->GradTranspose[iN]));
+        CeedCallBackend(CeedFree(&interp_kernel_path));
+        CeedCallBackend(CeedFree(&grad_kernel_path));
+        CeedCallBackend(CeedFree(&basis_kernel_source));
+      }

-  NB = nontensor_rtc_get_nb(arch, 'd', e_mode, t_mode, P, n_array[iN], Q);
-  interp = (t_mode == CEED_TRANSPOSE) ? &impl->magma_interp_tr_nontensor[iN] : &impl->magma_interp_nontensor[iN];
-  grad = (t_mode == CEED_TRANSPOSE) ? &impl->magma_grad_tr_nontensor[iN] : &impl->magma_grad_nontensor[iN];
+      // Apply basis operation
+      CeedInt num_t_col = MAGMA_BASIS_NTCOL(M, MAGMA_MAXTHREADS_1D);
+      if (e_mode == CEED_EVAL_INTERP) {
+        CeedInt NB = (t_mode == CEED_TRANSPOSE) ? impl->NB_interp_t[iN] : impl->NB_interp[iN];
+        CeedInt grid = CeedDivUpInt(N, NB * num_t_col);
+        CeedInt shared_mem_A = (t_mode == CEED_TRANSPOSE) ? 0 : K * M * sizeof(CeedScalar);
+        CeedInt shared_mem_B = num_t_col * K * NB * sizeof(CeedScalar);
+        CeedInt shared_mem = (t_mode == CEED_TRANSPOSE) ? (shared_mem_A + shared_mem_B) : CeedIntMax(shared_mem_A, shared_mem_B);
+        void *args[] = {&N, &impl->d_interp, &P, &d_u, &K, &d_v, &M};

-  switch (e_mode) {
-    case CEED_EVAL_INTERP: {
-      CeedInt P = num_dof, Q = num_qpts;
-      if (P < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) {
-        CeedInt M = (t_mode == CEED_TRANSPOSE) ? P : Q;
-        CeedInt K = (t_mode == CEED_TRANSPOSE) ? Q : P;
-        CeedInt num_t_col = MAGMA_NONTENSOR_BASIS_NTCOL(M);
-        CeedInt shared_mem = 0, shared_mem_A = 0, shared_mem_B = 0;
-        shared_mem_B += num_t_col * K * NB * sizeof(CeedScalar);
-        shared_mem_A += (t_mode == CEED_TRANSPOSE) ? 0 : K * M * sizeof(CeedScalar);
-        shared_mem = (t_mode == CEED_TRANSPOSE) ? (shared_mem_A + shared_mem_B) : CeedIntMax(shared_mem_A, shared_mem_B);
-
-        CeedInt grid = MAGMA_CEILDIV(MAGMA_CEILDIV(N, NB), num_t_col);
-        magma_trans_t trans_A = (t_mode == CEED_TRANSPOSE) ? MagmaNoTrans : MagmaTrans;
-        magma_trans_t trans_B = MagmaNoTrans;
-        CeedScalar alpha = 1.0, beta = 0.0;
-
-        void *args[] = {&trans_A, &trans_B, &N, &alpha, &impl->d_interp, &P, &du, &K, &beta, &dv, &M};
-        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, *interp, grid, M, num_t_col, 1, shared_mem, args));
-      } else {
         if (t_mode == CEED_TRANSPOSE) {
-          magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, num_elem * num_comp, Q, 1.0, impl->d_interp, P, du, Q, 0.0, dv, P, data->queue);
+          CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->InterpTranspose[iN], grid, M, num_t_col, 1, shared_mem, args));
         } else {
-          magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, num_elem * num_comp, P, 1.0, impl->d_interp, P, du, P, 0.0, dv, Q, data->queue);
+          CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Interp[iN], grid, M, num_t_col, 1, shared_mem, args));
         }
-      }
-    } break;
+      } else if (e_mode == CEED_EVAL_GRAD) {
+        CeedInt NB = (t_mode == CEED_TRANSPOSE) ? impl->NB_grad_t[iN] : impl->NB_grad[iN];
+        CeedInt grid = CeedDivUpInt(N, NB * num_t_col);
+        CeedInt shared_mem = num_t_col * K * NB * sizeof(CeedScalar) + ((t_mode == CEED_TRANSPOSE) ? 0 : K * M * sizeof(CeedScalar));
+        void *args[] = {&N, &impl->d_grad, &P, &d_u, &K, &d_v, &M};

-    case CEED_EVAL_GRAD: {
-      CeedInt P = num_dof, Q = num_qpts;
-      if (P < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) {
-        CeedInt M = (t_mode == CEED_TRANSPOSE) ? P : Q;
-        CeedInt K = (t_mode == CEED_TRANSPOSE) ? Q : P;
-        CeedInt num_t_col = MAGMA_NONTENSOR_BASIS_NTCOL(M);
-        CeedInt shared_mem = 0, shared_mem_A = 0, shared_mem_B = 0;
-        shared_mem_B += num_t_col * K * NB * sizeof(CeedScalar);
-        shared_mem_A += (t_mode == CEED_TRANSPOSE) ? 0 : K * M * sizeof(CeedScalar);
-        shared_mem = shared_mem_A + shared_mem_B;
-
-        CeedInt grid = MAGMA_CEILDIV(MAGMA_CEILDIV(N, NB), num_t_col);
-        magma_trans_t trans_A = (t_mode == CEED_TRANSPOSE) ? MagmaNoTrans : MagmaTrans;
-        magma_trans_t trans_B = MagmaNoTrans;
-
-        void *args[] = {&trans_A, &trans_B, &N, &impl->d_grad, &P, &du, &K, &dv, &M};
-        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, *grad, grid, M, num_t_col, 1, shared_mem, args));
+        if (t_mode == CEED_TRANSPOSE) {
+          CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->GradTranspose[iN], grid, M, num_t_col, 1, shared_mem, args));
+        } else {
+          CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Grad[iN], grid, M, num_t_col, 1, shared_mem, args));
+        }
       } else {
+        // LCOV_EXCL_START
+        return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV, CEED_EVAL_CURL not supported");
+        // LCOV_EXCL_STOP
+      }
+    } else {
+      if (e_mode == CEED_EVAL_INTERP) {
+        if (t_mode == CEED_TRANSPOSE) {
+          magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, N, Q, 1.0, impl->d_interp, P, d_u, Q, 0.0, d_v, P, data->queue);
+        } else {
+          magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, N, P, 1.0, impl->d_interp, P, d_u, P, 0.0, d_v, Q, data->queue);
+        }
+      } else if (e_mode == CEED_EVAL_GRAD) {
         if (t_mode == CEED_TRANSPOSE) {
-          CeedScalar beta = 0.0;
           for (int d = 0; d < dim; d++) {
-            if (d > 0) beta = 1.0;
-            magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, num_elem * num_comp, Q, 1.0, impl->d_grad + d * P * Q, P,
-                                 du + d * num_elem * num_comp * Q, Q, beta, dv, P, data->queue);
+            const CeedScalar beta = (d > 0) ? 1.0 : 0.0;
+            magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, N, Q, 1.0, impl->d_grad + d * P * Q, P, d_u + d * N * Q, Q, beta, d_v, P,
+                                 data->queue);
           }
         } else {
-          for (int d = 0; d < dim; d++)
-            magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, num_elem * num_comp, P, 1.0, impl->d_grad + d * P * Q, P, du, P, 0.0,
-                                 dv + d * num_elem * num_comp * Q, Q, data->queue);
+          for (int d = 0; d < dim; d++) {
+            magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, N, P, 1.0, impl->d_grad + d * P * Q, P, d_u, P, 0.0, d_v + d * N * Q, Q, data->queue);
+          }
         }
+      } else {
+        // LCOV_EXCL_START
+        return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV, CEED_EVAL_CURL not supported");
+        // LCOV_EXCL_STOP
       }
-    } break;
-
-    case CEED_EVAL_WEIGHT: {
-      CeedCheck(t_mode != CEED_TRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT inum_compatible with CEED_TRANSPOSE");
-
-      int elemsPerBlock = 1;  // basis->Q_1d < 7 ? optElems[basis->Q_1d] : 1;
-      int grid = num_elem / elemsPerBlock + ((num_elem / elemsPerBlock * elemsPerBlock < num_elem) ? 1 : 0);
-
-      magma_weight_nontensor(grid, num_qpts, num_elem, num_qpts, impl->d_q_weight, dv, data->queue);
-    } break;
-
-    // LCOV_EXCL_START
-    case CEED_EVAL_DIV:
-      return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
-    case CEED_EVAL_CURL:
-      return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
-    case CEED_EVAL_NONE:
-      return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context");
-    // LCOV_EXCL_STOP
+    }
+  } else {
+    CeedCheck(t_mode != CEED_TRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
+    CeedInt num_t_col = MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D);
+    CeedInt grid = CeedDivUpInt(num_elem, num_t_col);
+    CeedInt shared_mem = Q * sizeof(CeedScalar) + num_t_col * Q * sizeof(CeedScalar);
+    void *args[] = {&num_elem, &impl->d_q_weight, &d_v, &Q};
+
+    CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Weight, grid, Q, num_t_col, 1, shared_mem, args));
   }

-  // must sync to ensure completeness
+  // Must sync to ensure completeness
   ceed_magma_queue_sync(data->queue);

+  // Restore vectors
   if (e_mode != CEED_EVAL_WEIGHT) {
-    CeedCallBackend(CeedVectorRestoreArrayRead(U, &du));
+    CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
   }
-  CeedCallBackend(CeedVectorRestoreArray(V, &dv));
+  CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
   return CEED_ERROR_SUCCESS;
 }

-#ifdef __cplusplus
-CEED_INTERN "C"
-#endif
-    int
-    CeedBasisDestroy_Magma(CeedBasis basis) {
+//------------------------------------------------------------------------------
+// Destroy tensor basis
+//------------------------------------------------------------------------------
+static int CeedBasisDestroy_Magma(CeedBasis basis) {
   Ceed ceed;
   CeedBasis_Magma *impl;

-  CeedCallBackend(CeedBasisGetData(basis, &impl));
-  CeedCallBackend(magma_free(impl->d_q_ref_1d));
-  CeedCallBackend(magma_free(impl->d_interp_1d));
-  CeedCallBackend(magma_free(impl->d_grad_1d));
-  CeedCallBackend(magma_free(impl->d_q_weight_1d));
   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
+  CeedCallBackend(CeedBasisGetData(basis, &impl));
 #ifdef CEED_MAGMA_USE_HIP
   CeedCallHip(ceed, hipModuleUnload(impl->module));
 #else
   CeedCallCuda(ceed, cuModuleUnload(impl->module));
 #endif
+  CeedCallBackend(magma_free(impl->d_interp_1d));
+  CeedCallBackend(magma_free(impl->d_grad_1d));
+  CeedCallBackend(magma_free(impl->d_q_weight_1d));
   CeedCallBackend(CeedFree(&impl));
   return CEED_ERROR_SUCCESS;
 }

-#ifdef __cplusplus
-CEED_INTERN "C"
-#endif
-    int
-    CeedBasisDestroyNonTensor_Magma(CeedBasis basis) {
+//------------------------------------------------------------------------------
+// Destroy non-tensor basis
+//------------------------------------------------------------------------------
+static int CeedBasisDestroyNonTensor_Magma(CeedBasis basis) {
   Ceed ceed;
   CeedBasisNonTensor_Magma *impl;

-  CeedCallBackend(CeedBasisGetData(basis, &impl));
-  CeedCallBackend(magma_free(impl->d_q_ref));
-  CeedCallBackend(magma_free(impl->d_interp));
-  CeedCallBackend(magma_free(impl->d_grad));
-  CeedCallBackend(magma_free(impl->d_q_weight));
   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
+  CeedCallBackend(CeedBasisGetData(basis, &impl));
 #ifdef CEED_MAGMA_USE_HIP
-  for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
-    CeedCallHip(ceed, hipModuleUnload(impl->module[in]));
-  }
+  CeedCallHip(ceed, hipModuleUnload(impl->module_weight));
 #else
+  CeedCallCuda(ceed, cuModuleUnload(impl->module_weight));
+#endif
   for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
-    CeedCallCuda(ceed, cuModuleUnload(impl->module[in]));
-  }
+    if (impl->module_interp[in]) {
+#ifdef CEED_MAGMA_USE_HIP
+      CeedCallHip(ceed, hipModuleUnload(impl->module_interp[in]));
+#else
+      CeedCallCuda(ceed, cuModuleUnload(impl->module_interp[in]));
 #endif
+    }
+  }
+  CeedCallBackend(magma_free(impl->d_interp));
+  CeedCallBackend(magma_free(impl->d_grad));
+  CeedCallBackend(magma_free(impl->d_q_weight));
   CeedCallBackend(CeedFree(&impl));
   return CEED_ERROR_SUCCESS;
 }

-#ifdef __cplusplus
-CEED_INTERN "C"
-#endif
-    int
-    CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
+//------------------------------------------------------------------------------
+// Create tensor
+//------------------------------------------------------------------------------
+int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
                                   const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) {
   Ceed ceed, ceed_delegate;
   Ceed_Magma *data;
-  char *magma_common_path, *interp_path, *grad_path, *weight_path, *basis_kernel_source;
-  CeedInt num_comp = 0;
+  char *interp_kernel_path, *grad_kernel_path, *weight_kernel_path, *basis_kernel_source;
+  CeedInt num_comp;
   CeedBasis_Magma *impl;

-  CeedCallBackend(CeedCalloc(1, &impl));
   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
-
-  // Check for supported parameters
-  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
   CeedCallBackend(CeedGetData(ceed, &data));
+  CeedCallBackend(CeedCalloc(1, &impl));
+
+  // Copy basis data to GPU
+  CeedCallBackend(magma_malloc((void **)&impl->d_q_weight_1d, Q_1d * sizeof(q_weight_1d[0])));
+  magma_setvector(Q_1d, sizeof(q_weight_1d[0]), q_weight_1d, 1, impl->d_q_weight_1d, 1, data->queue);
+  CeedCallBackend(magma_malloc((void **)&impl->d_interp_1d, Q_1d * P_1d * sizeof(interp_1d[0])));
+  magma_setvector(Q_1d * P_1d, sizeof(interp_1d[0]), interp_1d, 1, impl->d_interp_1d, 1, data->queue);
+  CeedCallBackend(magma_malloc((void **)&impl->d_grad_1d, Q_1d * P_1d * sizeof(grad_1d[0])));
+  magma_setvector(Q_1d * P_1d, sizeof(grad_1d[0]), grad_1d, 1, impl->d_grad_1d, 1, data->queue);
+
+  // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data
+  CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate));

   // Compile kernels
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_defs.h", &magma_common_path));
+  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+  {
+    char *interp_kernel_name_base = "ceed/jit-source/magma/magma-basis-interp";
+    CeedInt interp_kernel_name_len = strlen(interp_kernel_name_base) + 6;
+    char interp_kernel_name[interp_kernel_name_len];
+
+    snprintf(interp_kernel_name, interp_kernel_name_len, "%s-%" CeedInt_FMT "d.h", interp_kernel_name_base, dim);
+    CeedCallBackend(CeedGetJitAbsolutePath(ceed, interp_kernel_name, &interp_kernel_path));
+  }
   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
-  CeedCallBackend(CeedLoadSourceToBuffer(ceed, magma_common_path, &basis_kernel_source));
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_tensor.h", &magma_common_path));
-  CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, magma_common_path, &basis_kernel_source));
-  char *interp_name_base = "ceed/jit-source/magma/interp";
-  CeedInt interp_name_len = strlen(interp_name_base) + 6;
-  char interp_name[interp_name_len];
-
-  snprintf(interp_name, interp_name_len, "%s-%" CeedInt_FMT "d.h", interp_name_base, dim);
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, interp_name, &interp_path));
-  CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, interp_path, &basis_kernel_source));
-  char *grad_name_base = "ceed/jit-source/magma/grad";
-  CeedInt grad_name_len = strlen(grad_name_base) + 6;
-  char grad_name[grad_name_len];
-
-  snprintf(grad_name, grad_name_len, "%s-%" CeedInt_FMT "d.h", grad_name_base, dim);
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, grad_name, &grad_path));
-  CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, grad_path, &basis_kernel_source));
-  char *weight_name_base = "ceed/jit-source/magma/weight";
-  CeedInt weight_name_len = strlen(weight_name_base) + 6;
-  char weight_name[weight_name_len];
-
-  snprintf(weight_name, weight_name_len, "%s-%" CeedInt_FMT "d.h", weight_name_base, dim);
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, weight_name, &weight_path));
-  CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, weight_path, &basis_kernel_source));
+  CeedCallBackend(CeedLoadSourceToBuffer(ceed, interp_kernel_path, &basis_kernel_source));
+  {
+    char *grad_kernel_name_base = "ceed/jit-source/magma/magma-basis-grad";
+    CeedInt grad_kernel_name_len = strlen(grad_kernel_name_base) + 6;
+    char grad_kernel_name[grad_kernel_name_len];
+
+    snprintf(grad_kernel_name, grad_kernel_name_len, "%s-%" CeedInt_FMT "d.h", grad_kernel_name_base, dim);
+    CeedCallBackend(CeedGetJitAbsolutePath(ceed, grad_kernel_name, &grad_kernel_path));
+  }
+  CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, grad_kernel_path, &basis_kernel_source));
+  {
+    char *weight_kernel_name_base = "ceed/jit-source/magma/magma-basis-weight";
+    CeedInt weight_kernel_name_len = strlen(weight_kernel_name_base) + 6;
+    char weight_kernel_name[weight_kernel_name_len];
+
+    snprintf(weight_kernel_name, weight_kernel_name_len, "%s-%" CeedInt_FMT "d.h", weight_kernel_name_base, dim);
+    CeedCallBackend(CeedGetJitAbsolutePath(ceed, weight_kernel_name, &weight_kernel_path));
+  }
+  CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, weight_kernel_path, &basis_kernel_source));
   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
-  // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip
-  // data
-  CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate));
-  CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module, 5, "DIM", dim, "NCOMP", num_comp, "P", P_1d, "Q", Q_1d, "MAXPQ",
-                                   CeedIntMax(P_1d, Q_1d)));
-
-  // Kernel setup
+  CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module, 5, "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_P",
+                                   P_1d, "BASIS_Q", Q_1d, "BASIS_MAX_P_Q", CeedIntMax(P_1d, Q_1d)));
   switch (dim) {
     case 1:
-      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_1d_kernel", &impl->magma_interp));
-      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_1d_kernel", &impl->magma_interp_tr));
-      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_1d_kernel", &impl->magma_grad));
-      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_1d_kernel", &impl->magma_grad_tr));
-      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_1d_kernel", &impl->magma_weight));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_1d_kernel", &impl->Interp));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_1d_kernel", &impl->InterpTranspose));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_1d_kernel", &impl->Grad));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_1d_kernel", &impl->GradTranspose));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_1d_kernel", &impl->Weight));
      break;
     case 2:
-      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_2d_kernel", &impl->magma_interp));
-      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_2d_kernel", &impl->magma_interp_tr));
-      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_2d_kernel", &impl->magma_grad));
-      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_2d_kernel", &impl->magma_grad_tr));
-      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_2d_kernel", &impl->magma_weight));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_2d_kernel", &impl->Interp));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_2d_kernel", &impl->InterpTranspose));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_2d_kernel", &impl->Grad));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_2d_kernel", &impl->GradTranspose));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_2d_kernel", &impl->Weight));
       break;
     case 3:
-      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_3d_kernel", &impl->magma_interp));
-      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_3d_kernel", &impl->magma_interp_tr));
-      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_3d_kernel", &impl->magma_grad));
-      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_3d_kernel", &impl->magma_grad_tr));
-      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_3d_kernel", &impl->magma_weight));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_3d_kernel", &impl->Interp));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_3d_kernel", &impl->InterpTranspose));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_3d_kernel", &impl->Grad));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_3d_kernel", &impl->GradTranspose));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_3d_kernel", &impl->Weight));
+      break;
   }
+  CeedCallBackend(CeedFree(&interp_kernel_path));
+  CeedCallBackend(CeedFree(&grad_kernel_path));
+  CeedCallBackend(CeedFree(&weight_kernel_path));
+  CeedCallBackend(CeedFree(&basis_kernel_source));
+
+  CeedCallBackend(CeedBasisSetData(basis, impl));

   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Magma));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Magma));
-
-  // Copy q_ref_1d to the GPU
-  CeedCallBackend(magma_malloc((void **)&impl->d_q_ref_1d, Q_1d * sizeof(q_ref_1d[0])));
-  magma_setvector(Q_1d, sizeof(q_ref_1d[0]), q_ref_1d, 1, impl->d_q_ref_1d, 1, data->queue);
-
-  // Copy interp_1d to the GPU
-  CeedCallBackend(magma_malloc((void **)&impl->d_interp_1d, Q_1d * P_1d * sizeof(interp_1d[0])));
-  magma_setvector(Q_1d * P_1d, sizeof(interp_1d[0]), interp_1d, 1, impl->d_interp_1d, 1, data->queue);
-
-  // Copy grad_1d to the GPU
-  CeedCallBackend(magma_malloc((void **)&impl->d_grad_1d, Q_1d * P_1d * sizeof(grad_1d[0])));
-  magma_setvector(Q_1d * P_1d, sizeof(grad_1d[0]), grad_1d, 1, impl->d_grad_1d, 1, data->queue);
-
-  // Copy q_weight_1d to the GPU
-  CeedCallBackend(magma_malloc((void **)&impl->d_q_weight_1d, Q_1d * sizeof(q_weight_1d[0])));
-  magma_setvector(Q_1d, sizeof(q_weight_1d[0]), q_weight_1d, 1, impl->d_q_weight_1d, 1, data->queue);
-
-  CeedCallBackend(CeedBasisSetData(basis, impl));
-  CeedCallBackend(CeedFree(&magma_common_path));
-  CeedCallBackend(CeedFree(&interp_path));
-  CeedCallBackend(CeedFree(&grad_path));
-  CeedCallBackend(CeedFree(&weight_path));
-  CeedCallBackend(CeedFree(&basis_kernel_source));
   return CEED_ERROR_SUCCESS;
 }

-#ifdef __cplusplus
-CEED_INTERN "C"
-#endif
-    int
-    CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_dof, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad,
+//------------------------------------------------------------------------------
+// Create non-tensor H^1
+//------------------------------------------------------------------------------
+int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad,
                             const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
   Ceed ceed, ceed_delegate;
   Ceed_Magma *data;
-  char *magma_common_path, *interp_path, *grad_path, *basis_kernel_source;
+  char *weight_kernel_path, *basis_kernel_source;
   CeedBasisNonTensor_Magma *impl;

   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
   CeedCallBackend(CeedGetData(ceed, &data));
-  magma_int_t arch = magma_getdevice_arch();
-
   CeedCallBackend(CeedCalloc(1, &impl));

-  // Compile kernels
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_defs.h", &magma_common_path));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
-  CeedCallBackend(CeedLoadSourceToBuffer(ceed, magma_common_path, &basis_kernel_source));
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_nontensor.h", &magma_common_path));
-  CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, magma_common_path, &basis_kernel_source));
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/interp-nontensor.h", &interp_path));
-  CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, interp_path, &basis_kernel_source));
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/grad-nontensor.h", &grad_path));
-  CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, grad_path, &basis_kernel_source));
-
-  // tuning parameters for nb
-  CeedInt nb_interp_n[MAGMA_NONTENSOR_KERNEL_INSTANCES];
-  CeedInt nb_interp_t[MAGMA_NONTENSOR_KERNEL_INSTANCES];
-  CeedInt nb_grad_n[MAGMA_NONTENSOR_KERNEL_INSTANCES];
-  CeedInt nb_grad_t[MAGMA_NONTENSOR_KERNEL_INSTANCES];
-  CeedInt P = num_dof, Q = num_qpts;
-  CeedInt n_array[MAGMA_NONTENSOR_KERNEL_INSTANCES] = {MAGMA_NONTENSOR_N_VALUES};
-
-  for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
-    nb_interp_n[in] = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_INTERP, CEED_NOTRANSPOSE, P, n_array[in], Q);
-    nb_interp_t[in] = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_INTERP, CEED_TRANSPOSE, P, n_array[in], Q);
-    nb_grad_n[in] = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_GRAD, CEED_NOTRANSPOSE, P, n_array[in], Q);
-    nb_grad_t[in] = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_GRAD, CEED_TRANSPOSE, P, n_array[in], Q);
-  }
+  // Copy basis data to GPU
+  CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0])));
+  magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue);
+  CeedCallBackend(magma_malloc((void **)&impl->d_interp, num_qpts * num_nodes * sizeof(interp[0])));
+  magma_setvector(num_qpts * num_nodes, sizeof(interp[0]), interp, 1, impl->d_interp, 1, data->queue);
+  CeedCallBackend(magma_malloc((void **)&impl->d_grad, num_qpts * num_nodes * dim * sizeof(grad[0])));
+  magma_setvector(num_qpts * num_nodes * dim, sizeof(grad[0]), grad, 1, impl->d_grad, 1, data->queue);

-  // compile
+  // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data
   CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate));
-  for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
-    CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module[in], 7, "DIM", dim, "P", P, "Q", Q, "NB_INTERP_N",
-                                     nb_interp_n[in], "NB_INTERP_T", nb_interp_t[in], "NB_GRAD_N", nb_grad_n[in], "NB_GRAD_T", nb_grad_t[in]));
-  }

-  // get kernels
-  for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
-    CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[in], "magma_interp_nontensor_n", &impl->magma_interp_nontensor[in]));
-    CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[in], "magma_interp_nontensor_t", &impl->magma_interp_tr_nontensor[in]));
-    CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[in], "magma_grad_nontensor_n", &impl->magma_grad_nontensor[in]));
-    CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[in], "magma_grad_nontensor_t", &impl->magma_grad_tr_nontensor[in]));
-  }
+  // Compile weight kernel (the remainder of kernel compilation happens at first call to CeedBasisApply)
+  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-weight-nontensor.h", &weight_kernel_path));
+  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
+  CeedCallBackend(CeedLoadSourceToBuffer(ceed, weight_kernel_path, &basis_kernel_source));
+  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
+  CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module_weight, 1, "BASIS_Q", num_qpts));
+  CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_weight, "magma_weight_nontensor", &impl->Weight));
+  CeedCallBackend(CeedFree(&weight_kernel_path));
+  CeedCallBackend(CeedFree(&basis_kernel_source));
+
+  CeedCallBackend(CeedBasisSetData(basis, impl));

+  // Register backend functions
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma));
-
-  // Copy q_ref to the GPU
-  CeedCallBackend(magma_malloc((void **)&impl->d_q_ref, num_qpts * sizeof(q_ref[0])));
-  magma_setvector(num_qpts, sizeof(q_ref[0]), q_ref, 1, impl->d_q_ref, 1, data->queue);
-
-  // Copy interp to the GPU
-  CeedCallBackend(magma_malloc((void **)&impl->d_interp, num_qpts * num_dof * sizeof(interp[0])));
-  magma_setvector(num_qpts * num_dof, sizeof(interp[0]), interp, 1, impl->d_interp, 1, data->queue);
-
-  // Copy grad to the GPU
-  CeedCallBackend(magma_malloc((void **)&impl->d_grad, num_qpts * num_dof * dim * sizeof(grad[0])));
-  magma_setvector(num_qpts * num_dof * dim, sizeof(grad[0]), grad, 1, impl->d_grad, 1, data->queue);
-
-  // Copy q_weight to the GPU
-  CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0])));
-  magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue);
-
-  CeedCallBackend(CeedBasisSetData(basis, impl));
-  CeedCallBackend(CeedFree(&magma_common_path));
-  CeedCallBackend(CeedFree(&interp_path));
-  CeedCallBackend(CeedFree(&grad_path));
-  CeedCallBackend(CeedFree(&basis_kernel_source));
   return CEED_ERROR_SUCCESS;
 }
+
+//------------------------------------------------------------------------------
diff --git a/backends/magma/ceed-magma-common.c b/backends/magma/ceed-magma-common.c
index dcd7598913..e995d00cd3 100644
--- a/backends/magma/ceed-magma-common.c
+++ b/backends/magma/ceed-magma-common.c
@@ -16,10 +16,10 @@
 // Device information backend init
 //------------------------------------------------------------------------------
 int CeedInit_Magma_common(Ceed ceed, const char *resource) {
+  Ceed_Magma *data;
   const char *device_spec = strstr(resource, ":device_id=");
   const int device_id = (device_spec) ? atoi(device_spec + 11) : -1;
   int current_device_id;
-  Ceed_Magma *data;

   CeedCallBackend(magma_init());
@@ -28,6 +28,7 @@ int CeedInit_Magma_common(Ceed ceed, const char *resource) {
     magma_setdevice(device_id);
     current_device_id = device_id;
   }
+  CeedCallBackend(CeedGetData(ceed, &data));
   data->device_id = current_device_id;

 #ifdef CEED_MAGMA_USE_HIP
diff --git a/backends/magma/magma_gemm_nontensor.c b/backends/magma/ceed-magma-gemm-nontensor.cpp
similarity index 76%
rename from backends/magma/magma_gemm_nontensor.c
rename to backends/magma/ceed-magma-gemm-nontensor.cpp
index e3b600ae49..257f23ab1d 100644
--- a/backends/magma/magma_gemm_nontensor.c
+++ b/backends/magma/ceed-magma-gemm-nontensor.cpp
@@ -5,7 +5,8 @@
 //
 // This file is part of CEED: http://github.com/ceed

-#include "ceed-magma.h"
+#include "ceed-magma-gemm-nontensor.h"
+#include "ceed-magma-gemm-selector.h"

 #ifdef CEED_MAGMA_USE_HIP
 #define devblasDgemmStridedBatched hipblasDgemmStridedBatched
@@ -20,9 +21,9 @@
 #endif

 ////////////////////////////////////////////////////////////////////////////////
-static int magmablas_gemm(magma_trans_t trans_A, magma_trans_t trans_B, magma_int_t m, magma_int_t n, magma_int_t k, CeedScalar alpha,
-                          const CeedScalar *d_A, magma_int_t ldda, const CeedScalar *d_B, magma_int_t lddb, CeedScalar beta, CeedScalar *d_C,
-                          magma_int_t lddc, magma_queue_t queue) {
+static inline int magmablas_gemm(magma_trans_t trans_A, magma_trans_t trans_B, magma_int_t m, magma_int_t n, magma_int_t k, CeedScalar alpha,
+                                 const CeedScalar *d_A, magma_int_t ldda, const CeedScalar *d_B, magma_int_t lddb, CeedScalar beta, CeedScalar *d_C,
+                                 magma_int_t lddc, magma_queue_t queue) {
   if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
     magmablas_sgemm(trans_A, trans_B, m, n, k, (float)alpha, (const float *)d_A, ldda, (const float *)d_B, lddb, (float)beta, (float *)d_C, lddc,
                     queue);
@@ -34,10 +35,10 @@
 }

 ////////////////////////////////////////////////////////////////////////////////
-static int magmablas_gemm_batched_strided(magma_trans_t trans_A, magma_trans_t trans_B, magma_int_t m, magma_int_t n, magma_int_t k, CeedScalar alpha,
-                                          const CeedScalar *d_A, magma_int_t ldda, magma_int_t strideA, const CeedScalar *d_B, magma_int_t lddb,
-                                          magma_int_t strideB, CeedScalar beta, CeedScalar *d_C, magma_int_t lddc, magma_int_t strideC,
-                                          magma_int_t batchCount, magma_queue_t queue) {
+static inline int magmablas_gemm_batched_strided(magma_trans_t trans_A, magma_trans_t trans_B, magma_int_t m, magma_int_t n, magma_int_t k,
+                                                 CeedScalar alpha, const CeedScalar *d_A, magma_int_t ldda, magma_int_t strideA,
+                                                 const CeedScalar *d_B, magma_int_t lddb, magma_int_t strideB, CeedScalar beta, CeedScalar *d_C,
+                                                 magma_int_t lddc, magma_int_t strideC, magma_int_t batchCount, magma_queue_t queue) {
   if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
     magmablas_sgemm_batched_strided(trans_A, trans_B, m, n, k, (float)alpha, (const float *)d_A, ldda, strideA, (const float *)d_B, lddb, strideB,
                                     (float)beta, (float *)d_C, lddc, strideC, batchCount, queue);
@@ -49,9 +50,9 @@
 }

 ////////////////////////////////////////////////////////////////////////////////
-static int devblas_gemm(magma_trans_t trans_A, magma_trans_t trans_B, magma_int_t m, magma_int_t n, magma_int_t k, CeedScalar alpha,
-                        const CeedScalar *d_A, magma_int_t ldda, const CeedScalar *d_B, magma_int_t lddb, CeedScalar beta, CeedScalar *d_C,
-                        magma_int_t lddc, magma_queue_t queue) {
+static inline int devblas_gemm(magma_trans_t trans_A, magma_trans_t trans_B, magma_int_t m, magma_int_t n, magma_int_t k, CeedScalar alpha,
+                               const CeedScalar *d_A, magma_int_t ldda, const CeedScalar *d_B, magma_int_t lddb, CeedScalar beta, CeedScalar *d_C,
+                               magma_int_t lddc, magma_queue_t queue) {
   if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
     magma_sgemm(trans_A, trans_B, m, n, k, (float)alpha, (const float *)d_A, ldda, (const float *)d_B, lddb, (float)beta, (float *)d_C, lddc, queue);
   } else {
@@ -62,10 +63,10 @@
 }

 ////////////////////////////////////////////////////////////////////////////////
-static int devblas_gemm_batched_strided(magma_trans_t trans_A, magma_trans_t trans_B, magma_int_t m, magma_int_t n, magma_int_t k, CeedScalar alpha,
-                                        const CeedScalar *d_A, magma_int_t ldda, magma_int_t strideA, const CeedScalar *d_B, magma_int_t lddb,
-                                        magma_int_t strideB, CeedScalar beta, CeedScalar *d_C, magma_int_t lddc, magma_int_t strideC,
-                                        magma_int_t batchCount, magma_queue_t queue) {
+static inline int devblas_gemm_batched_strided(magma_trans_t trans_A, magma_trans_t trans_B, magma_int_t m, magma_int_t n, magma_int_t k,
+                                               CeedScalar alpha, const CeedScalar *d_A, magma_int_t ldda, magma_int_t strideA, const CeedScalar *d_B,
+                                               magma_int_t lddb, magma_int_t strideB, CeedScalar beta, CeedScalar *d_C, magma_int_t lddc,
+                                               magma_int_t strideC, magma_int_t batchCount, magma_queue_t queue) {
   if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
     devblasSgemmStridedBatched(magma_queue_get_devblas_handle(queue), devblas_trans_const(trans_A), devblas_trans_const(trans_B), (int)m, (int)n,
                                (int)k, (const float *)&alpha, (const float *)d_A, (int)ldda, strideA, (const float *)d_B, (int)lddb, strideB,
diff --git a/backends/magma/ceed-magma-gemm-nontensor.h b/backends/magma/ceed-magma-gemm-nontensor.h
new file mode 100644
index 0000000000..d7e83a5fa3
--- /dev/null
+++ b/backends/magma/ceed-magma-gemm-nontensor.h
@@ -0,0 +1,18 @@
+// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED: http://github.com/ceed
+
+#ifndef CEED_MAGMA_GEMM_NONTENSOR_H
+#define CEED_MAGMA_GEMM_NONTENSOR_H
+
+#include "ceed-magma.h"
+
+////////////////////////////////////////////////////////////////////////////////
+CEED_INTERN int magma_gemm_nontensor(magma_trans_t trans_A, magma_trans_t trans_B, magma_int_t m, magma_int_t n, magma_int_t k, CeedScalar alpha,
+                                     const CeedScalar *d_A, magma_int_t ldda, const CeedScalar *d_B, magma_int_t lddb, CeedScalar beta,
+                                     CeedScalar *d_C, magma_int_t lddc, magma_queue_t queue);
+
+#endif  // CEED_MAGMA_GEMM_NONTENSOR_H
diff --git a/backends/magma/ceed-magma-gemm-selector.cpp b/backends/magma/ceed-magma-gemm-selector.cpp
new file mode 100644
index 0000000000..6f631ef987
--- /dev/null
+++ b/backends/magma/ceed-magma-gemm-selector.cpp
@@ -0,0 +1,139 @@
+// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include +#include +#include + +#include "ceed-magma-gemm-selector.h" + +#include "tuning/indices.h" +#ifdef CEED_MAGMA_USE_HIP +#include "tuning/mi100.h" +#include "tuning/mi250x.h" +#include "tuning/mi250x_grad_rtc.h" +#include "tuning/mi250x_interp_rtc.h" +#else +#include "tuning/a100.h" +#include "tuning/a100_grad_rtc.h" +#include "tuning/a100_interp_rtc.h" +#include "tuning/v100.h" +#endif + +//////////////////////////////////////////////////////////////////////////////// +#ifdef CEED_MAGMA_USE_HIP +static inline auto gemm_selector_get_data(int gpu_arch, char precision, char trans_A) -> decltype(dgemm_nn_mi250x) { + if (gpu_arch >= 910) { + // gfx90a or newer + return (precision == 's') ? ((trans_A == 'n') ? sgemm_nn_mi250x : sgemm_tn_mi250x) : ((trans_A == 'n') ? dgemm_nn_mi250x : dgemm_tn_mi250x); + } else { + // gfx908 or older + return (precision == 's') ? ((trans_A == 'n') ? sgemm_nn_mi100 : sgemm_tn_mi100) : ((trans_A == 'n') ? dgemm_nn_mi100 : dgemm_tn_mi100); + } +} +#else +static inline auto gemm_selector_get_data(int gpu_arch, char precision, char trans_A) -> decltype(dgemm_nn_a100) { + if (gpu_arch >= 800) { + // sm80 or newer + return (precision == 's') ? ((trans_A == 'n') ? sgemm_nn_a100 : sgemm_tn_a100) : ((trans_A == 'n') ? dgemm_nn_a100 : dgemm_tn_a100); + } else { + // sm70 or older + return (precision == 's') ? ((trans_A == 'n') ? sgemm_nn_v100 : sgemm_tn_v100) : ((trans_A == 'n') ? dgemm_nn_v100 : dgemm_tn_v100); + } +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +void gemm_selector(int gpu_arch, char precision, char trans_A, int m, int n, int k, int *n_batch, int *use_magma) { + const auto &data = gemm_selector_get_data(gpu_arch, precision, trans_A); + int ir = -1; + double norm = std::numeric_limits::max(); + + for (size_t i = 0; i < data.size(); i++) { + const int &im = data[i][M_INDEX]; + const int &in = data[i][N_INDEX]; + const int &ik = data[i][K_INDEX]; + + double mdiff = (double)(im - m); + double ndiff = (double)(in - n); + double kdiff = (double)(ik - k); + double nrm = mdiff * mdiff + ndiff * ndiff + kdiff * kdiff; + + if (nrm < norm) { + norm = nrm; + ir = i; + } + + if (im == m && in == n && ik == k) { + // The input (m, n, k) exactly matches a record in `data`, no need to search further + break; + } + } + + if (ir >= 0) { + // If the closest match indicates that n = n_batch, that means calling the regular non-batch GEMM. + // So n_batch is set to n instead of the 'n_batch' entry of the matching record. + int n_ = data[ir][N_INDEX]; + int n_batch_ = data[ir][N_BATCH_INDEX]; + *n_batch = (n_ == n_batch_) ? n : n_batch_; + *use_magma = data[ir][USE_MAGMA_INDEX]; + } else { + *n_batch = n; + *use_magma = 0; + } +} + +////////////////////////////////////////////////////////////////////////////// +#ifdef CEED_MAGMA_USE_HIP +static inline auto nontensor_rtc_get_data(int gpu_arch, char trans_A, int q_comp) -> decltype(dinterp_n_mi250x) { + if (q_comp == 1) { + return (trans_A == 'n') ? dinterp_n_mi250x : dinterp_t_mi250x; + } else { + return (trans_A == 'n') ? dgrad_n_mi250x : dgrad_t_mi250x; + } +} +#else +static inline auto nontensor_rtc_get_data(int gpu_arch, char trans_A, int q_comp) -> decltype(dinterp_n_a100) { + if (q_comp == 1) { + return (trans_A == 'n') ? dinterp_n_a100 : dinterp_t_a100; + } else { + return (trans_A == 'n') ? 
+
+//////////////////////////////////////////////////////////////////////////////
+#ifdef CEED_MAGMA_USE_HIP
+static inline auto nontensor_rtc_get_data(int gpu_arch, char trans_A, int q_comp) -> decltype(dinterp_n_mi250x) {
+  if (q_comp == 1) {
+    return (trans_A == 'n') ? dinterp_n_mi250x : dinterp_t_mi250x;
+  } else {
+    return (trans_A == 'n') ? dgrad_n_mi250x : dgrad_t_mi250x;
+  }
+}
+#else
+static inline auto nontensor_rtc_get_data(int gpu_arch, char trans_A, int q_comp) -> decltype(dinterp_n_a100) {
+  if (q_comp == 1) {
+    return (trans_A == 'n') ? dinterp_n_a100 : dinterp_t_a100;
+  } else {
+    return (trans_A == 'n') ? dgrad_n_a100 : dgrad_t_a100;
+  }
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+CeedInt nontensor_rtc_get_nb(int gpu_arch, char trans_A, int q_comp, int P, int Q, int n) {
+  const auto &data = nontensor_rtc_get_data(gpu_arch, trans_A, q_comp);
+  int ir = -1;
+  double norm = std::numeric_limits<double>::max();
+  CeedInt m = (trans_A == 'n') ? Q : P;
+  CeedInt k = (trans_A == 'n') ? P : Q;
+
+  for (size_t i = 0; i < data.size(); i++) {
+    const int &im = data[i][M_INDEX_RTC];
+    const int &in = data[i][N_INDEX_RTC];
+    const int &ik = data[i][K_INDEX_RTC];
+
+    double mdiff = (double)(im - m);
+    double ndiff = (double)(in - n);
+    double kdiff = (double)(ik - k);
+    double nrm = mdiff * mdiff + ndiff * ndiff + kdiff * kdiff;
+
+    if (nrm < norm) {
+      norm = nrm;
+      ir = i;
+    }
+
+    if (im == m && in == n && ik == k) {
+      // The input (m, n, k) exactly matches a record in `data`, no need to search further
+      break;
+    }
+  }
+
+  return (ir >= 0) ? data[ir][NB_INDEX_RTC] : 1;
+}
diff --git a/backends/magma/ceed-magma-gemm-selector.h b/backends/magma/ceed-magma-gemm-selector.h
new file mode 100644
index 0000000000..ce169b051b
--- /dev/null
+++ b/backends/magma/ceed-magma-gemm-selector.h
@@ -0,0 +1,19 @@
+// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED: http://github.com/ceed
+
+#ifndef CEED_MAGMA_GEMM_SELECTOR_H
+#define CEED_MAGMA_GEMM_SELECTOR_H
+
+#include "ceed-magma.h"
+
+////////////////////////////////////////////////////////////////////////////////
+CEED_INTERN void gemm_selector(int gpu_arch, char precision, char trans_A, int m, int n, int k, int *n_batch, int *use_magma);
+
+////////////////////////////////////////////////////////////////////////////////
+CEED_INTERN CeedInt nontensor_rtc_get_nb(int gpu_arch, char trans_A, int q_comp, int P, int Q, int n);
+
+#endif // CEED_MAGMA_GEMM_SELECTOR_H
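Note that both searches now compare squared Euclidean distances: since x -> x*x is monotone for non-negative x, the nearest record under the squared norm is the same as under the old sqrt-based norm, and the sqrt call is saved. With both entry points declared in the header above, a hedged sketch of the intended use (gpu_arch and num_elem are the caller's; the JIT step is paraphrased, not quoted from the patch):

  // Blocking factor for a non-tensor interp kernel (q_comp == 1); a grad kernel
  // would pass its number of quadrature components instead.
  CeedInt nb = nontensor_rtc_get_nb(gpu_arch, 'n', 1, P, Q, num_elem);
  // nb falls back to 1 when no tuning data matches; otherwise it is presumably
  // baked into the RTC-compiled kernel instance for this (P, Q, num_elem) shape.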
diff --git a/backends/magma/ceed-magma.h b/backends/magma/ceed-magma.h
index fcba887fe3..b5495a61bb 100644
--- a/backends/magma/ceed-magma.h
+++ b/backends/magma/ceed-magma.h
@@ -16,24 +16,15 @@ #define MAGMA_MAXTHREADS_1D 128
 #define MAGMA_MAXTHREADS_2D 128
 #define MAGMA_MAXTHREADS_3D 64
-#define MAGMA_NONTENSOR_MAXTHREADS (128)
-// Define macro for determining number of threads in y-direction
-// for basis kernels
+// Define macro for determining number of threads in y-direction for basis kernels
 #define MAGMA_BASIS_NTCOL(x, maxt) (((maxt) < (x)) ? 1 : ((maxt) / (x)))
-#define MAGMA_NONTENSOR_BASIS_NTCOL(N) (CeedIntMax(1, (MAGMA_NONTENSOR_MAXTHREADS / (N))))
-#define MAGMA_CEILDIV(A, B) (((A) + (B)-1) / (B))
-#define MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P (40)
-#define MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q (40)
-
-// Define macro for computing the total threads in a block
-// for use with __launch_bounds__()
-#define MAGMA_BASIS_BOUNDS(x, maxt) (x * MAGMA_BASIS_NTCOL(x, maxt))
-
-// Define macro for non-tensor kernel instances
-#define MAGMA_NONTENSOR_KERNEL_INSTANCES (5)
-#define MAGMA_NONTENSOR_N_VALUES 10240, 51200, 102400, 512000, 1024000
+// Define macros for non-tensor kernel instances
+#define MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P 40
+#define MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q 40
+#define MAGMA_NONTENSOR_KERNEL_INSTANCES 5
+#define MAGMA_NONTENSOR_KERNEL_N_VALUES 10240, 51200, 102400, 512000, 1024000
 
 #ifdef CEED_MAGMA_USE_HIP
 typedef hipModule_t CeedMagmaModule;
@@ -55,48 +46,38 @@ typedef CUfunction CeedMagmaFunction;
 
 typedef struct {
   CeedMagmaModule module;
-  CeedMagmaFunction magma_interp;
-  CeedMagmaFunction magma_interp_tr;
-  CeedMagmaFunction magma_grad;
-  CeedMagmaFunction magma_grad_tr;
-  CeedMagmaFunction magma_weight;
-  CeedScalar *d_q_ref_1d;
+  CeedMagmaFunction Interp;
+  CeedMagmaFunction InterpTranspose;
+  CeedMagmaFunction Grad;
+  CeedMagmaFunction GradTranspose;
+  CeedMagmaFunction Weight;
   CeedScalar *d_interp_1d;
   CeedScalar *d_grad_1d;
   CeedScalar *d_q_weight_1d;
 } CeedBasis_Magma;
 
 typedef struct {
-  CeedMagmaModule module[MAGMA_NONTENSOR_KERNEL_INSTANCES];
-  CeedMagmaFunction magma_interp_nontensor[MAGMA_NONTENSOR_KERNEL_INSTANCES];
-  CeedMagmaFunction magma_interp_tr_nontensor[MAGMA_NONTENSOR_KERNEL_INSTANCES];
-  CeedMagmaFunction magma_grad_nontensor[MAGMA_NONTENSOR_KERNEL_INSTANCES];
-  CeedMagmaFunction magma_grad_tr_nontensor[MAGMA_NONTENSOR_KERNEL_INSTANCES];
-  CeedScalar *d_q_ref;
+  CeedMagmaModule module_weight, module_interp[MAGMA_NONTENSOR_KERNEL_INSTANCES];
+  CeedMagmaFunction Interp[MAGMA_NONTENSOR_KERNEL_INSTANCES];
+  CeedMagmaFunction InterpTranspose[MAGMA_NONTENSOR_KERNEL_INSTANCES];
+  CeedMagmaFunction Grad[MAGMA_NONTENSOR_KERNEL_INSTANCES];
+  CeedMagmaFunction GradTranspose[MAGMA_NONTENSOR_KERNEL_INSTANCES];
+  CeedMagmaFunction Weight;
+  CeedInt NB_interp[MAGMA_NONTENSOR_KERNEL_INSTANCES], NB_interp_t[MAGMA_NONTENSOR_KERNEL_INSTANCES];
+  CeedInt NB_grad[MAGMA_NONTENSOR_KERNEL_INSTANCES], NB_grad_t[MAGMA_NONTENSOR_KERNEL_INSTANCES];
   CeedScalar *d_interp;
   CeedScalar *d_grad;
   CeedScalar *d_q_weight;
 } CeedBasisNonTensor_Magma;
 
-CEED_INTERN void magma_weight_nontensor(magma_int_t grid, magma_int_t threads, magma_int_t num_elem, magma_int_t Q, CeedScalar *d_q_weight,
-                                        CeedScalar *d_v, magma_queue_t queue);
-
-CEED_INTERN int magma_gemm_nontensor(magma_trans_t trans_A, magma_trans_t trans_B, magma_int_t m, magma_int_t n, magma_int_t k, CeedScalar alpha,
-                                     const CeedScalar *d_A, magma_int_t ldda, const CeedScalar *d_B, magma_int_t lddb, CeedScalar beta,
-                                     CeedScalar *d_C, magma_int_t lddc, magma_queue_t queue);
-
-CEED_INTERN void gemm_selector(int gpu_arch, char precision, char trans_A, int m, int n, int k, int *n_batch, int *use_magma);
-
-CEED_INTERN CeedInt nontensor_rtc_get_nb(int gpu_arch, char precision, CeedEvalMode e_mode, CeedTransposeMode t_mode, int P_, int N, int Q_);
-
-CEED_INTERN magma_int_t magma_isdevptr(const void *A);
-
 CEED_INTERN int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
                                              const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis);
-CEED_INTERN int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_dof, CeedInt num_qpts, const CeedScalar *interp,
+CEED_INTERN int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
                                         const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis);
 
+CEED_INTERN magma_int_t magma_isdevptr(const void *);
+
 // Comment the line below to use the default magma_is_devptr function
 #define magma_is_devptr magma_isdevptr
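Before the deleted implementation below, a worked reading of the macros that remain in ceed-magma.h; the numbers are illustrative only:

  // MAGMA_BASIS_NTCOL(x, maxt) = (maxt < x) ? 1 : maxt / x   (integer division)
  //   MAGMA_BASIS_NTCOL(25, 128)  -> 5  thread columns: five elements share one block
  //   MAGMA_BASIS_NTCOL(130, 128) -> 1  thread column: x alone exceeds maxt
  // The five MAGMA_NONTENSOR_KERNEL_N_VALUES (10240 ... 1024000) pair one-to-one with
  // the MAGMA_NONTENSOR_KERNEL_INSTANCES-sized arrays (NB_interp, NB_grad, ...) in
  // CeedBasisNonTensor_Magma: presumably one compiled instance, and one blocking
  // factor, per representative problem size.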
diff --git a/backends/magma/gemm_selector.cpp b/backends/magma/gemm_selector.cpp
deleted file mode 100644
index ee45bb2f5c..0000000000
--- a/backends/magma/gemm_selector.cpp
+++ /dev/null
@@ -1,165 +0,0 @@
-#include
-#include
-
-#include
-#include
-#include
-
-#include "./gemm_tuning/indices.h"
-#include "ceed-magma.h"
-#ifdef CEED_MAGMA_USE_HIP
-#include "./gemm_tuning/mi100.h"
-#include "./gemm_tuning/mi250x.h"
-#include "./gemm_tuning/mi250x_grad_rtc.h"
-#include "./gemm_tuning/mi250x_interp_rtc.h"
-#else
-#include "./gemm_tuning/a100.h"
-#include "./gemm_tuning/a100_grad_rtc.h"
-#include "./gemm_tuning/a100_interp_rtc.h"
-#include "./gemm_tuning/v100.h"
-#endif
-
-////////////////////////////////////////////////////////////////////////////////
-static void *gemm_selector_get_data(int gpu_arch, char precision, char trans_A) {
-// a default
-#ifdef CEED_MAGMA_USE_HIP
-  void *data = (void *)&sgemm_nn_mi250x;
-#else
-  void *data = (void *)&sgemm_nn_a100;
-#endif
-
-#ifdef CEED_MAGMA_USE_HIP
-  if (gpu_arch >= 910) {
-    // gfx90a or newer
-    data = (precision == 's') ? ((trans_A == 'n') ? (void *)&sgemm_nn_mi250x : (void *)&sgemm_tn_mi250x)
-                              : ((trans_A == 'n') ? (void *)&dgemm_nn_mi250x : (void *)&dgemm_tn_mi250x);
-  } else {
-    // gfx908 or older
-    data = (precision == 's') ? ((trans_A == 'n') ? (void *)&sgemm_nn_mi100 : (void *)&sgemm_tn_mi100)
-                              : ((trans_A == 'n') ? (void *)&dgemm_nn_mi100 : (void *)&dgemm_tn_mi100);
-  }
-#else
-  if (gpu_arch >= 800) {
-    // sm80 or newer
-    data = (precision == 's') ? ((trans_A == 'n') ? (void *)&sgemm_nn_a100 : (void *)&sgemm_tn_a100)
-                              : ((trans_A == 'n') ? (void *)&dgemm_nn_a100 : (void *)&dgemm_tn_a100);
-  } else {
-    // sm70 or older
-    data = (precision == 's') ? ((trans_A == 'n') ? (void *)&sgemm_nn_v100 : (void *)&sgemm_tn_v100)
-                              : ((trans_A == 'n') ? (void *)&dgemm_nn_v100 : (void *)&dgemm_tn_v100);
-  }
-#endif
-
-  return data;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-void gemm_selector(int gpu_arch, char precision, char trans_A, int m, int n, int k, int *n_batch, int *use_magma) {
-  // defaults
-  *n_batch = n;
-  *use_magma = 0;
-  std::vector<std::array<int, RECORD_LENGTH> > *data = NULL;
-  data = (std::vector<std::array<int, RECORD_LENGTH> > *)gemm_selector_get_data(gpu_arch, precision, trans_A);
-
-  int ir = -1;
-  double norm = std::numeric_limits<double>::max();
-  for (size_t i = 0; i < data->size(); i++) {
-    int im = (*data)[i][M_INDEX];
-    int in = (*data)[i][N_INDEX];
-    int ik = (*data)[i][K_INDEX];
-
-    double mdiff = (double)(im - m);
-    double ndiff = (double)(in - n);
-    double kdiff = (double)(ik - k);
-
-    double nrm = sqrt(mdiff * mdiff + ndiff * ndiff + kdiff * kdiff);
-
-    if (nrm < norm) {
-      norm = nrm;
-      ir = i;
-    }
-
-    if (nrm == 0) {
-      // the input (m, n, k) exactly matches a record in `data`
-      // no need to search further
-      break;
-    }
-  }
-
-  if (ir >= 0) {
-    *use_magma = (*data)[ir][USE_MAGMA_INDEX];
-    // if the closest match indicates that n = n_batch,
-    // that means calling the regular non-batch gemm.
-    // So n_batch is set to n instead of the 'n_batch'
-    // entry of the matching record
-    int n_ = (*data)[ir][N_INDEX];
-    int n_batch_ = (*data)[ir][N_BATCH_INDEX];
-    *n_batch = (n_ == n_batch_) ? n : n_batch_;
-  }
-}
-
-////////////////////////////////////////////////////////////////////////////////
-static void *nontensor_rtc_get_data(int gpu_arch, char precision, CeedEvalMode e_mode, CeedTransposeMode t_mode) {
-// a default
-#ifdef CEED_MAGMA_USE_HIP
-  void *data = (void *)&dinterp_n_mi250x;
-#else
-  void *data = (void *)&dinterp_n_a100;
-#endif
-
-#ifdef CEED_MAGMA_USE_HIP
-  if (e_mode == CEED_EVAL_INTERP) {
-    data = (t_mode == CEED_TRANSPOSE) ? (void *)&dinterp_t_mi250x : (void *)&dinterp_n_mi250x;
-  } else if (e_mode == CEED_EVAL_GRAD) {
-    data = (t_mode == CEED_TRANSPOSE) ? (void *)&dgrad_t_mi250x : (void *)&dgrad_n_mi250x;
-  }
-#else
-  if (e_mode == CEED_EVAL_INTERP) {
-    data = (t_mode == CEED_TRANSPOSE) ? (void *)&dinterp_t_a100 : (void *)&dinterp_n_a100;
-  } else if (e_mode == CEED_EVAL_GRAD) {
-    data = (t_mode == CEED_TRANSPOSE) ? (void *)&dgrad_t_a100 : (void *)&dgrad_n_a100;
-  }
-#endif
-
-  return data;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-CeedInt nontensor_rtc_get_nb(int gpu_arch, char precision, CeedEvalMode e_mode, CeedTransposeMode t_mode, int P_, int N, int Q_) {
-  CeedInt P = (t_mode == CEED_TRANSPOSE) ? P_ : Q_;
-  CeedInt Q = (t_mode == CEED_TRANSPOSE) ? Q_ : P_;
-  CeedInt NB = 1;
-
-  std::vector<std::array<int, RECORD_LENGTH_RTC> > *data = NULL;
-  data = (std::vector<std::array<int, RECORD_LENGTH_RTC> > *)nontensor_rtc_get_data(gpu_arch, precision, e_mode, t_mode);
-
-  int ir = -1;
-  double norm = std::numeric_limits<double>::max();
-  for (size_t i = 0; i < data->size(); i++) {
-    int ip = (*data)[i][M_INDEX_RTC];
-    int in = (*data)[i][N_INDEX_RTC];
-    int iq = (*data)[i][K_INDEX_RTC];
-
-    double pdiff = (double)(ip - P);
-    double ndiff = (double)(in - N);
-    double qdiff = (double)(iq - Q);
-    double nrm = sqrt(pdiff * pdiff + ndiff * ndiff + qdiff * qdiff);
-
-    if (nrm < norm) {
-      norm = nrm;
-      ir = i;
-    }
-
-    if (nrm == 0) {
-      // the input (m, n, k) exactly matches a record in `data`
-      // no need to search further
-      break;
-    }
-  }
-
-  if (ir >= 0) {
-    NB = (*data)[ir][NB_INDEX_RTC];
-  }
-
-  return NB;
-}
diff --git a/backends/magma/kernels/common/weight.h b/backends/magma/kernels/common/weight.h
deleted file mode 100644
index 6bda3258e3..0000000000
--- a/backends/magma/kernels/common/weight.h
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED: http://github.com/ceed
-
-#ifndef CEED_MAGMA_WEIGHT_H
-#define CEED_MAGMA_WEIGHT_H
-
-#include
-
-#include "magma_v2.h"
-
-//////////////////////////////////////////////////////////////////////////////////////////
-static __global__ void magma_weight_nontensor_kernel(const CeedInt nelem, const CeedInt Q, const CeedScalar *__restrict__ qweight,
-                                                     CeedScalar *__restrict__ d_V) {
-  const int tid = threadIdx.x;
-  // TODO load qweight in shared memory if blockDim.z > 1?
-  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x * blockDim.z) {
-    d_V[elem * Q + tid] = qweight[tid];
-  }
-}
-
-#endif // CEED_MAGMA_WEIGHT_H
diff --git a/backends/magma/kernels/cuda/weight_generic.cu b/backends/magma/kernels/cuda/weight_generic.cu
deleted file mode 100644
index 567e3c3c02..0000000000
--- a/backends/magma/kernels/cuda/weight_generic.cu
+++ /dev/null
@@ -1,17 +0,0 @@
-// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED: http://github.com/ceed
-
-#include "../common/weight.h"
-
-//////////////////////////////////////////////////////////////////////////////////////////
-// NonTensor weight function
-extern "C" void
-magma_weight_nontensor(magma_int_t grid, magma_int_t threads, magma_int_t nelem, magma_int_t Q,
-                       CeedScalar *dqweight, CeedScalar *dv, magma_queue_t queue)
-{
-  magma_weight_nontensor_kernel<<<grid, threads, 0, magma_queue_get_cuda_stream(queue)>>>(nelem, Q, dqweight, dv);
-}
diff --git a/backends/magma/kernels/hip/weight_generic.hip.cpp b/backends/magma/kernels/hip/weight_generic.hip.cpp
deleted file mode 100644
index 24fd090239..0000000000
--- a/backends/magma/kernels/hip/weight_generic.hip.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include "../common/weight.h" -#include "hip/hip_runtime.h" - -////////////////////////////////////////////////////////////////////////////////////////// -// NonTensor weight function -extern "C" void magma_weight_nontensor(magma_int_t grid, magma_int_t threads, magma_int_t nelem, magma_int_t Q, CeedScalar *dqweight, CeedScalar *dv, - magma_queue_t queue) { - hipLaunchKernelGGL(magma_weight_nontensor_kernel, dim3(grid), dim3(threads), 0, magma_queue_get_hip_stream(queue), nelem, Q, dqweight, dv); -} diff --git a/backends/magma/gemm_tuning/a100.h b/backends/magma/tuning/a100.h similarity index 100% rename from backends/magma/gemm_tuning/a100.h rename to backends/magma/tuning/a100.h diff --git a/backends/magma/gemm_tuning/a100_grad_rtc.h b/backends/magma/tuning/a100_grad_rtc.h similarity index 100% rename from backends/magma/gemm_tuning/a100_grad_rtc.h rename to backends/magma/tuning/a100_grad_rtc.h diff --git a/backends/magma/gemm_tuning/a100_interp_rtc.h b/backends/magma/tuning/a100_interp_rtc.h similarity index 100% rename from backends/magma/gemm_tuning/a100_interp_rtc.h rename to backends/magma/tuning/a100_interp_rtc.h diff --git a/backends/magma/gemm_tuning/indices.h b/backends/magma/tuning/indices.h similarity index 100% rename from backends/magma/gemm_tuning/indices.h rename to backends/magma/tuning/indices.h diff --git a/backends/magma/gemm_tuning/mi100.h b/backends/magma/tuning/mi100.h similarity index 100% rename from backends/magma/gemm_tuning/mi100.h rename to backends/magma/tuning/mi100.h diff --git a/backends/magma/gemm_tuning/mi250x.h b/backends/magma/tuning/mi250x.h similarity index 100% rename from backends/magma/gemm_tuning/mi250x.h rename to backends/magma/tuning/mi250x.h diff --git a/backends/magma/gemm_tuning/mi250x_grad_rtc.h b/backends/magma/tuning/mi250x_grad_rtc.h similarity index 100% rename from backends/magma/gemm_tuning/mi250x_grad_rtc.h rename to backends/magma/tuning/mi250x_grad_rtc.h diff --git a/backends/magma/gemm_tuning/mi250x_interp_rtc.h b/backends/magma/tuning/mi250x_interp_rtc.h similarity index 100% rename from backends/magma/gemm_tuning/mi250x_interp_rtc.h rename to backends/magma/tuning/mi250x_interp_rtc.h diff --git a/backends/magma/gemm_tuning/v100.h b/backends/magma/tuning/v100.h similarity index 100% rename from backends/magma/gemm_tuning/v100.h rename to backends/magma/tuning/v100.h diff --git a/backends/magma/tuning/v100_rtc-b.h b/backends/magma/tuning/v100_rtc-b.h new file mode 100644 index 0000000000..51174bc4f0 --- /dev/null +++ b/backends/magma/tuning/v100_rtc-b.h @@ -0,0 +1,318 @@ +//////////////////////////////////////////////////////////////////////////////// +// auto-generated from data on v100 + +//////////////////////////////////////////////////////////////////////////////// +std::vector > drtc_t_v100 = { + {3, 1, 1024, 1, 1 }, + {3, 1, 1024, 2, 1 }, + {3, 1, 5120, 1, 1 }, + {3, 1, 5120, 2, 1 }, + {3, 1, 10240, 1, 1 }, + {3, 1, 10240, 2, 1 }, + {3, 1, 51200, 1, 1 }, + {3, 1, 51200, 2, 1 }, + {3, 1, 102400, 1, 1 }, + {3, 1, 102400, 2, 1 }, + {3, 1, 512000, 1, 1 }, + {3, 1, 512000, 2, 1 }, + {3, 1, 1024000, 1, 1 }, + {3, 1, 1024000, 2, 1 }, + {6, 3, 1024, 1, 1 }, + {6, 3, 1024, 2, 1 }, + {6, 3, 5120, 1, 1 }, + {6, 3, 5120, 2, 1 }, + {6, 3, 10240, 1, 1 }, + {6, 3, 10240, 2, 1 }, + {6, 3, 51200, 1, 1 }, + {6, 3, 51200, 2, 1 }, + {6, 3, 102400, 1, 1 }, + {6, 3, 102400, 2, 1 }, + {6, 3, 512000, 1, 1 }, + {6, 3, 512000, 2, 1 }, + {6, 
3, 1024000, 1, 1 }, + {6, 3, 1024000, 2, 1 }, + {10, 6, 1024, 1, 1 }, + {10, 6, 1024, 2, 1 }, + {10, 6, 5120, 1, 1 }, + {10, 6, 5120, 2, 1 }, + {10, 6, 10240, 1, 1 }, + {10, 6, 10240, 2, 1 }, + {10, 6, 51200, 1, 1 }, + {10, 6, 51200, 2, 1 }, + {10, 6, 102400, 1, 1 }, + {10, 6, 102400, 2, 1 }, + {10, 6, 512000, 1, 1 }, + {10, 6, 512000, 2, 1 }, + {10, 6, 1024000, 1, 1 }, + {10, 6, 1024000, 2, 2 }, + {15, 12, 1024, 1, 1 }, + {15, 12, 1024, 2, 1 }, + {15, 12, 5120, 1, 1 }, + {15, 12, 5120, 2, 1 }, + {15, 12, 10240, 1, 1 }, + {15, 12, 10240, 2, 1 }, + {15, 12, 51200, 1, 1 }, + {15, 12, 51200, 2, 5 }, + {15, 12, 102400, 1, 1 }, + {15, 12, 102400, 2, 9 }, + {15, 12, 512000, 1, 1 }, + {15, 12, 512000, 2, 9 }, + {15, 12, 1024000, 1, 1 }, + {15, 12, 1024000, 2, 9 }, + {21, 16, 1024, 1, 1 }, + {21, 16, 1024, 2, 1 }, + {21, 16, 5120, 1, 2 }, + {21, 16, 5120, 2, 6 }, + {21, 16, 10240, 1, 5 }, + {21, 16, 10240, 2, 5 }, + {21, 16, 51200, 1, 4 }, + {21, 16, 51200, 2, 4 }, + {21, 16, 102400, 1, 4 }, + {21, 16, 102400, 2, 8 }, + {21, 16, 512000, 1, 4 }, + {21, 16, 512000, 2, 8 }, + {21, 16, 1024000, 1, 4 }, + {21, 16, 1024000, 2, 8 }, + {28, 25, 1024, 1, 1 }, + {28, 25, 1024, 2, 1 }, + {28, 25, 5120, 1, 3 }, + {28, 25, 5120, 2, 4 }, + {28, 25, 10240, 1, 8 }, + {28, 25, 10240, 2, 5 }, + {28, 25, 51200, 1, 3 }, + {28, 25, 51200, 2, 9 }, + {28, 25, 102400, 1, 9 }, + {28, 25, 102400, 2, 9 }, + {28, 25, 512000, 1, 9 }, + {28, 25, 512000, 2, 9 }, + {28, 25, 1024000, 1, 9 }, + {28, 25, 1024000, 2, 9 }, + {36, 33, 1024, 1, 1 }, + {36, 33, 1024, 2, 1 }, + {36, 33, 5120, 1, 6 }, + {36, 33, 5120, 2, 4 }, + {36, 33, 10240, 1, 4 }, + {36, 33, 10240, 2, 4 }, + {36, 33, 51200, 1, 14}, + {36, 33, 51200, 2, 10}, + {36, 33, 102400, 1, 14}, + {36, 33, 102400, 2, 10}, + {36, 33, 512000, 1, 28}, + {36, 33, 512000, 2, 18}, + {36, 33, 1024000, 1, 22}, + {36, 33, 1024000, 2, 20}, + {4, 1, 1024, 1, 1 }, + {4, 1, 1024, 3, 1 }, + {4, 1, 5120, 1, 1 }, + {4, 1, 5120, 3, 1 }, + {4, 1, 10240, 1, 1 }, + {4, 1, 10240, 3, 2 }, + {4, 1, 51200, 1, 2 }, + {4, 1, 51200, 3, 2 }, + {4, 1, 102400, 1, 2 }, + {4, 1, 102400, 3, 2 }, + {4, 1, 512000, 1, 1 }, + {4, 1, 512000, 3, 2 }, + {4, 1, 1024000, 1, 1 }, + {4, 1, 1024000, 3, 2 }, + {10, 4, 1024, 1, 1 }, + {10, 4, 1024, 3, 2 }, + {10, 4, 5120, 1, 1 }, + {10, 4, 5120, 3, 1 }, + {10, 4, 10240, 1, 1 }, + {10, 4, 10240, 3, 1 }, + {10, 4, 51200, 1, 1 }, + {10, 4, 51200, 3, 2 }, + {10, 4, 102400, 1, 1 }, + {10, 4, 102400, 3, 2 }, + {10, 4, 512000, 1, 1 }, + {10, 4, 512000, 3, 2 }, + {10, 4, 1024000, 1, 1 }, + {10, 4, 1024000, 3, 2 }, + {20, 11, 1024, 1, 3 }, + {20, 11, 1024, 3, 3 }, + {20, 11, 5120, 1, 3 }, + {20, 11, 5120, 3, 2 }, + {20, 11, 10240, 1, 3 }, + {20, 11, 10240, 3, 3 }, + {20, 11, 51200, 1, 11}, + {20, 11, 51200, 3, 5 }, + {20, 11, 102400, 1, 2 }, + {20, 11, 102400, 3, 5 }, + {20, 11, 512000, 1, 2 }, + {20, 11, 512000, 3, 5 }, + {20, 11, 1024000, 1, 2 }, + {20, 11, 1024000, 3, 11}, + {35, 24, 1024, 1, 1 }, + {35, 24, 1024, 3, 1 }, + {35, 24, 5120, 1, 4 }, + {35, 24, 5120, 3, 5 }, + {35, 24, 10240, 1, 9 }, + {35, 24, 10240, 3, 5 }, + {35, 24, 51200, 1, 3 }, + {35, 24, 51200, 3, 9 }, + {35, 24, 102400, 1, 3 }, + {35, 24, 102400, 3, 9 }, + {35, 24, 512000, 1, 3 }, + {35, 24, 512000, 3, 9 }, + {35, 24, 1024000, 1, 3 }, + {35, 24, 1024000, 3, 9 } +}; + +//////////////////////////////////////////////////////////////////////////////// +std::vector > drtc_n_v100 = { + {3, 1, 1024, 1, 1 }, + {3, 1, 1024, 2, 1 }, + {3, 1, 5120, 1, 1 }, + {3, 1, 5120, 2, 1 }, + {3, 1, 10240, 1, 1 }, + {3, 1, 10240, 2, 
1 }, + {3, 1, 51200, 1, 1 }, + {3, 1, 51200, 2, 1 }, + {3, 1, 102400, 1, 1 }, + {3, 1, 102400, 2, 1 }, + {3, 1, 512000, 1, 1 }, + {3, 1, 512000, 2, 1 }, + {3, 1, 1024000, 1, 1 }, + {3, 1, 1024000, 2, 1 }, + {6, 3, 1024, 1, 1 }, + {6, 3, 1024, 2, 1 }, + {6, 3, 5120, 1, 1 }, + {6, 3, 5120, 2, 1 }, + {6, 3, 10240, 1, 1 }, + {6, 3, 10240, 2, 1 }, + {6, 3, 51200, 1, 1 }, + {6, 3, 51200, 2, 1 }, + {6, 3, 102400, 1, 1 }, + {6, 3, 102400, 2, 1 }, + {6, 3, 512000, 1, 1 }, + {6, 3, 512000, 2, 1 }, + {6, 3, 1024000, 1, 1 }, + {6, 3, 1024000, 2, 1 }, + {10, 6, 1024, 1, 1 }, + {10, 6, 1024, 2, 1 }, + {10, 6, 5120, 1, 1 }, + {10, 6, 5120, 2, 1 }, + {10, 6, 10240, 1, 1 }, + {10, 6, 10240, 2, 1 }, + {10, 6, 51200, 1, 1 }, + {10, 6, 51200, 2, 2 }, + {10, 6, 102400, 1, 2 }, + {10, 6, 102400, 2, 1 }, + {10, 6, 512000, 1, 1 }, + {10, 6, 512000, 2, 1 }, + {10, 6, 1024000, 1, 1 }, + {10, 6, 1024000, 2, 1 }, + {15, 12, 1024, 1, 1 }, + {15, 12, 1024, 2, 1 }, + {15, 12, 5120, 1, 1 }, + {15, 12, 5120, 2, 2 }, + {15, 12, 10240, 1, 2 }, + {15, 12, 10240, 2, 2 }, + {15, 12, 51200, 1, 3 }, + {15, 12, 51200, 2, 13}, + {15, 12, 102400, 1, 5 }, + {15, 12, 102400, 2, 5 }, + {15, 12, 512000, 1, 3 }, + {15, 12, 512000, 2, 5 }, + {15, 12, 1024000, 1, 3 }, + {15, 12, 1024000, 2, 5 }, + {21, 16, 1024, 1, 1 }, + {21, 16, 1024, 2, 4 }, + {21, 16, 5120, 1, 2 }, + {21, 16, 5120, 2, 2 }, + {21, 16, 10240, 1, 2 }, + {21, 16, 10240, 2, 4 }, + {21, 16, 51200, 1, 5 }, + {21, 16, 51200, 2, 6 }, + {21, 16, 102400, 1, 5 }, + {21, 16, 102400, 2, 5 }, + {21, 16, 512000, 1, 9 }, + {21, 16, 512000, 2, 5 }, + {21, 16, 1024000, 1, 3 }, + {21, 16, 1024000, 2, 11}, + {28, 25, 1024, 1, 6 }, + {28, 25, 1024, 2, 3 }, + {28, 25, 5120, 1, 5 }, + {28, 25, 5120, 2, 5 }, + {28, 25, 10240, 1, 9 }, + {28, 25, 10240, 2, 9 }, + {28, 25, 51200, 1, 6 }, + {28, 25, 51200, 2, 5 }, + {28, 25, 102400, 1, 6 }, + {28, 25, 102400, 2, 5 }, + {28, 25, 512000, 1, 6 }, + {28, 25, 512000, 2, 15}, + {28, 25, 1024000, 1, 6 }, + {28, 25, 1024000, 2, 9 }, + {36, 33, 1024, 1, 5 }, + {36, 33, 1024, 2, 6 }, + {36, 33, 5120, 1, 5 }, + {36, 33, 5120, 2, 11}, + {36, 33, 10240, 1, 9 }, + {36, 33, 10240, 2, 23}, + {36, 33, 51200, 1, 9 }, + {36, 33, 51200, 2, 13}, + {36, 33, 102400, 1, 9 }, + {36, 33, 102400, 2, 13}, + {36, 33, 512000, 1, 9 }, + {36, 33, 512000, 2, 13}, + {36, 33, 1024000, 1, 9 }, + {36, 33, 1024000, 2, 13}, + {4, 1, 1024, 1, 1 }, + {4, 1, 1024, 3, 1 }, + {4, 1, 5120, 1, 1 }, + {4, 1, 5120, 3, 1 }, + {4, 1, 10240, 1, 1 }, + {4, 1, 10240, 3, 1 }, + {4, 1, 51200, 1, 1 }, + {4, 1, 51200, 3, 1 }, + {4, 1, 102400, 1, 1 }, + {4, 1, 102400, 3, 1 }, + {4, 1, 512000, 1, 1 }, + {4, 1, 512000, 3, 1 }, + {4, 1, 1024000, 1, 1 }, + {4, 1, 1024000, 3, 1 }, + {10, 4, 1024, 1, 1 }, + {10, 4, 1024, 3, 1 }, + {10, 4, 5120, 1, 1 }, + {10, 4, 5120, 3, 1 }, + {10, 4, 10240, 1, 1 }, + {10, 4, 10240, 3, 1 }, + {10, 4, 51200, 1, 1 }, + {10, 4, 51200, 3, 5 }, + {10, 4, 102400, 1, 1 }, + {10, 4, 102400, 3, 1 }, + {10, 4, 512000, 1, 1 }, + {10, 4, 512000, 3, 1 }, + {10, 4, 1024000, 1, 1 }, + {10, 4, 1024000, 3, 1 }, + {20, 11, 1024, 1, 3 }, + {20, 11, 1024, 3, 3 }, + {20, 11, 5120, 1, 3 }, + {20, 11, 5120, 3, 6 }, + {20, 11, 10240, 1, 3 }, + {20, 11, 10240, 3, 3 }, + {20, 11, 51200, 1, 6 }, + {20, 11, 51200, 3, 5 }, + {20, 11, 102400, 1, 5 }, + {20, 11, 102400, 3, 5 }, + {20, 11, 512000, 1, 10}, + {20, 11, 512000, 3, 7 }, + {20, 11, 1024000, 1, 6 }, + {20, 11, 1024000, 3, 7 }, + {35, 24, 1024, 1, 1 }, + {35, 24, 1024, 3, 6 }, + {35, 24, 5120, 1, 5 }, + {35, 24, 5120, 3, 5 }, + {35, 24, 10240, 1, 
6 }, + {35, 24, 10240, 3, 9 }, + {35, 24, 51200, 1, 5 }, + {35, 24, 51200, 3, 6 }, + {35, 24, 102400, 1, 5 }, + {35, 24, 102400, 3, 10}, + {35, 24, 512000, 1, 5 }, + {35, 24, 512000, 3, 8 }, + {35, 24, 1024000, 1, 5 }, + {35, 24, 1024000, 3, 8 } +}; diff --git a/backends/magma/tuning/v100_rtc-new.h b/backends/magma/tuning/v100_rtc-new.h new file mode 100644 index 0000000000..f76b8e10ce --- /dev/null +++ b/backends/magma/tuning/v100_rtc-new.h @@ -0,0 +1,626 @@ +//////////////////////////////////////////////////////////////////////////////// +// auto-generated from data on v100 + +//////////////////////////////////////////////////////////////////////////////// +std::vector > drtc_t_v100 = { + {3, 1, 1024, 1, 1 }, + {3, 1, 1024, 2, 1 }, + {3, 1, 5120, 1, 1 }, + {3, 1, 5120, 2, 1 }, + {3, 1, 10240, 1, 1 }, + {3, 1, 10240, 2, 1 }, + {3, 1, 51200, 1, 1 }, + {3, 1, 51200, 2, 1 }, + {3, 1, 102400, 1, 1 }, + {3, 1, 102400, 2, 1 }, + {3, 1, 512000, 1, 1 }, + {3, 1, 512000, 2, 1 }, + {3, 1, 1024000, 1, 1 }, + {3, 1, 1024000, 2, 1 }, + {6, 3, 1024, 1, 1 }, + {6, 3, 1024, 2, 1 }, + {6, 3, 5120, 1, 1 }, + {6, 3, 5120, 2, 1 }, + {6, 3, 10240, 1, 1 }, + {6, 3, 10240, 2, 1 }, + {6, 3, 51200, 1, 1 }, + {6, 3, 51200, 2, 1 }, + {6, 3, 102400, 1, 1 }, + {6, 3, 102400, 2, 1 }, + {6, 3, 512000, 1, 1 }, + {6, 3, 512000, 2, 1 }, + {6, 3, 1024000, 1, 1 }, + {6, 3, 1024000, 2, 1 }, + {10, 6, 1024, 1, 1 }, + {10, 6, 1024, 2, 1 }, + {10, 6, 5120, 1, 1 }, + {10, 6, 5120, 2, 1 }, + {10, 6, 10240, 1, 1 }, + {10, 6, 10240, 2, 1 }, + {10, 6, 51200, 1, 1 }, + {10, 6, 51200, 2, 1 }, + {10, 6, 102400, 1, 1 }, + {10, 6, 102400, 2, 1 }, + {10, 6, 512000, 1, 1 }, + {10, 6, 512000, 2, 2 }, + {10, 6, 1024000, 1, 1 }, + {10, 6, 1024000, 2, 2 }, + {15, 12, 1024, 1, 1 }, + {15, 12, 1024, 2, 1 }, + {15, 12, 5120, 1, 1 }, + {15, 12, 5120, 2, 1 }, + {15, 12, 10240, 1, 1 }, + {15, 12, 10240, 2, 1 }, + {15, 12, 51200, 1, 1 }, + {15, 12, 51200, 2, 5 }, + {15, 12, 102400, 1, 1 }, + {15, 12, 102400, 2, 9 }, + {15, 12, 512000, 1, 1 }, + {15, 12, 512000, 2, 9 }, + {15, 12, 1024000, 1, 1 }, + {15, 12, 1024000, 2, 9 }, + {21, 16, 1024, 1, 1 }, + {21, 16, 1024, 2, 1 }, + {21, 16, 5120, 1, 1 }, + {21, 16, 5120, 2, 3 }, + {21, 16, 10240, 1, 2 }, + {21, 16, 10240, 2, 5 }, + {21, 16, 51200, 1, 4 }, + {21, 16, 51200, 2, 4 }, + {21, 16, 102400, 1, 4 }, + {21, 16, 102400, 2, 8 }, + {21, 16, 512000, 1, 4 }, + {21, 16, 512000, 2, 8 }, + {21, 16, 1024000, 1, 4 }, + {21, 16, 1024000, 2, 4 }, + {28, 25, 1024, 1, 1 }, + {28, 25, 1024, 2, 1 }, + {28, 25, 5120, 1, 3 }, + {28, 25, 5120, 2, 4 }, + {28, 25, 10240, 1, 7 }, + {28, 25, 10240, 2, 8 }, + {28, 25, 51200, 1, 3 }, + {28, 25, 51200, 2, 4 }, + {28, 25, 102400, 1, 9 }, + {28, 25, 102400, 2, 9 }, + {28, 25, 512000, 1, 9 }, + {28, 25, 512000, 2, 9 }, + {28, 25, 1024000, 1, 9 }, + {28, 25, 1024000, 2, 9 }, + {36, 33, 1024, 1, 1 }, + {36, 33, 1024, 2, 1 }, + {36, 33, 5120, 1, 6 }, + {36, 33, 5120, 2, 6 }, + {36, 33, 10240, 1, 4 }, + {36, 33, 10240, 2, 4 }, + {36, 33, 51200, 1, 7 }, + {36, 33, 51200, 2, 18}, + {36, 33, 102400, 1, 20}, + {36, 33, 102400, 2, 18}, + {36, 33, 512000, 1, 22}, + {36, 33, 512000, 2, 17}, + {36, 33, 1024000, 1, 28}, + {36, 33, 1024000, 2, 17}, + {3, 3, 1024, 1, 1 }, + {3, 3, 1024, 2, 1 }, + {3, 3, 5120, 1, 1 }, + {3, 3, 5120, 2, 1 }, + {3, 3, 10240, 1, 1 }, + {3, 3, 10240, 2, 1 }, + {3, 3, 51200, 1, 1 }, + {3, 3, 51200, 2, 1 }, + {3, 3, 102400, 1, 1 }, + {3, 3, 102400, 2, 1 }, + {3, 3, 512000, 1, 1 }, + {3, 3, 512000, 2, 1 }, + {3, 3, 1024000, 1, 1 }, + {3, 3, 1024000, 2, 1 }, + {6, 6, 
1024, 1, 1 }, + {6, 6, 1024, 2, 1 }, + {6, 6, 5120, 1, 1 }, + {6, 6, 5120, 2, 1 }, + {6, 6, 10240, 1, 1 }, + {6, 6, 10240, 2, 1 }, + {6, 6, 51200, 1, 1 }, + {6, 6, 51200, 2, 1 }, + {6, 6, 102400, 1, 1 }, + {6, 6, 102400, 2, 1 }, + {6, 6, 512000, 1, 1 }, + {6, 6, 512000, 2, 1 }, + {6, 6, 1024000, 1, 1 }, + {6, 6, 1024000, 2, 1 }, + {10, 12, 1024, 1, 3 }, + {10, 12, 1024, 2, 5 }, + {10, 12, 5120, 1, 1 }, + {10, 12, 5120, 2, 3 }, + {10, 12, 10240, 1, 3 }, + {10, 12, 10240, 2, 3 }, + {10, 12, 51200, 1, 1 }, + {10, 12, 51200, 2, 6 }, + {10, 12, 102400, 1, 1 }, + {10, 12, 102400, 2, 6 }, + {10, 12, 512000, 1, 1 }, + {10, 12, 512000, 2, 6 }, + {10, 12, 1024000, 1, 1 }, + {10, 12, 1024000, 2, 6 }, + {15, 16, 1024, 1, 3 }, + {15, 16, 1024, 2, 3 }, + {15, 16, 5120, 1, 3 }, + {15, 16, 5120, 2, 3 }, + {15, 16, 10240, 1, 3 }, + {15, 16, 10240, 2, 3 }, + {15, 16, 51200, 1, 3 }, + {15, 16, 51200, 2, 3 }, + {15, 16, 102400, 1, 3 }, + {15, 16, 102400, 2, 3 }, + {15, 16, 512000, 1, 3 }, + {15, 16, 512000, 2, 16}, + {15, 16, 1024000, 1, 3 }, + {15, 16, 1024000, 2, 16}, + {21, 25, 1024, 1, 1 }, + {21, 25, 1024, 2, 1 }, + {21, 25, 5120, 1, 3 }, + {21, 25, 5120, 2, 3 }, + {21, 25, 10240, 1, 5 }, + {21, 25, 10240, 2, 6 }, + {21, 25, 51200, 1, 6 }, + {21, 25, 51200, 2, 6 }, + {21, 25, 102400, 1, 6 }, + {21, 25, 102400, 2, 6 }, + {21, 25, 512000, 1, 6 }, + {21, 25, 512000, 2, 6 }, + {21, 25, 1024000, 1, 6 }, + {21, 25, 1024000, 2, 6 }, + {28, 33, 1024, 1, 1 }, + {28, 33, 1024, 2, 1 }, + {28, 33, 5120, 1, 6 }, + {28, 33, 5120, 2, 4 }, + {28, 33, 10240, 1, 4 }, + {28, 33, 10240, 2, 4 }, + {28, 33, 51200, 1, 6 }, + {28, 33, 51200, 2, 6 }, + {28, 33, 102400, 1, 6 }, + {28, 33, 102400, 2, 6 }, + {28, 33, 512000, 1, 6 }, + {28, 33, 512000, 2, 6 }, + {28, 33, 1024000, 1, 6 }, + {28, 33, 1024000, 2, 6 }, + {4, 1, 1024, 1, 1 }, + {4, 1, 1024, 3, 1 }, + {4, 1, 5120, 1, 1 }, + {4, 1, 5120, 3, 1 }, + {4, 1, 10240, 1, 1 }, + {4, 1, 10240, 3, 1 }, + {4, 1, 51200, 1, 4 }, + {4, 1, 51200, 3, 3 }, + {4, 1, 102400, 1, 5 }, + {4, 1, 102400, 3, 3 }, + {4, 1, 512000, 1, 1 }, + {4, 1, 512000, 3, 4 }, + {4, 1, 1024000, 1, 1 }, + {4, 1, 1024000, 3, 2 }, + {10, 4, 1024, 1, 1 }, + {10, 4, 1024, 3, 1 }, + {10, 4, 5120, 1, 1 }, + {10, 4, 5120, 3, 1 }, + {10, 4, 10240, 1, 1 }, + {10, 4, 10240, 3, 1 }, + {10, 4, 51200, 1, 1 }, + {10, 4, 51200, 3, 2 }, + {10, 4, 102400, 1, 1 }, + {10, 4, 102400, 3, 2 }, + {10, 4, 512000, 1, 1 }, + {10, 4, 512000, 3, 2 }, + {10, 4, 1024000, 1, 1 }, + {10, 4, 1024000, 3, 2 }, + {20, 11, 1024, 1, 1 }, + {20, 11, 1024, 3, 1 }, + {20, 11, 5120, 1, 1 }, + {20, 11, 5120, 3, 2 }, + {20, 11, 10240, 1, 3 }, + {20, 11, 10240, 3, 3 }, + {20, 11, 51200, 1, 7 }, + {20, 11, 51200, 3, 5 }, + {20, 11, 102400, 1, 2 }, + {20, 11, 102400, 3, 5 }, + {20, 11, 512000, 1, 2 }, + {20, 11, 512000, 3, 11}, + {20, 11, 1024000, 1, 2 }, + {20, 11, 1024000, 3, 11}, + {20, 14, 1024, 1, 1 }, + {20, 14, 1024, 3, 1 }, + {20, 14, 5120, 1, 1 }, + {20, 14, 5120, 3, 6 }, + {20, 14, 10240, 1, 3 }, + {20, 14, 10240, 3, 6 }, + {20, 14, 51200, 1, 3 }, + {20, 14, 51200, 3, 3 }, + {20, 14, 102400, 1, 3 }, + {20, 14, 102400, 3, 3 }, + {20, 14, 512000, 1, 3 }, + {20, 14, 512000, 3, 13}, + {20, 14, 1024000, 1, 3 }, + {20, 14, 1024000, 3, 13}, + {35, 24, 1024, 1, 1 }, + {35, 24, 1024, 3, 1 }, + {35, 24, 5120, 1, 3 }, + {35, 24, 5120, 3, 9 }, + {35, 24, 10240, 1, 9 }, + {35, 24, 10240, 3, 15}, + {35, 24, 51200, 1, 3 }, + {35, 24, 51200, 3, 9 }, + {35, 24, 102400, 1, 3 }, + {35, 24, 102400, 3, 9 }, + {35, 24, 512000, 1, 3 }, + {35, 24, 512000, 3, 22}, + {35, 
24, 1024000, 1, 3 }, + {35, 24, 1024000, 3, 22}, + {4, 4, 1024, 1, 1 }, + {4, 4, 1024, 3, 1 }, + {4, 4, 5120, 1, 1 }, + {4, 4, 5120, 3, 1 }, + {4, 4, 10240, 1, 1 }, + {4, 4, 10240, 3, 1 }, + {4, 4, 51200, 1, 1 }, + {4, 4, 51200, 3, 1 }, + {4, 4, 102400, 1, 1 }, + {4, 4, 102400, 3, 1 }, + {4, 4, 512000, 1, 1 }, + {4, 4, 512000, 3, 1 }, + {4, 4, 1024000, 1, 1 }, + {4, 4, 1024000, 3, 1 }, + {10, 11, 1024, 1, 1 }, + {10, 11, 1024, 3, 1 }, + {10, 11, 5120, 1, 1 }, + {10, 11, 5120, 3, 1 }, + {10, 11, 10240, 1, 1 }, + {10, 11, 10240, 3, 3 }, + {10, 11, 51200, 1, 1 }, + {10, 11, 51200, 3, 2 }, + {10, 11, 102400, 1, 1 }, + {10, 11, 102400, 3, 2 }, + {10, 11, 512000, 1, 1 }, + {10, 11, 512000, 3, 2 }, + {10, 11, 1024000, 1, 1 }, + {10, 11, 1024000, 3, 2 }, + {10, 14, 1024, 1, 1 }, + {10, 14, 1024, 3, 1 }, + {10, 14, 5120, 1, 1 }, + {10, 14, 5120, 3, 1 }, + {10, 14, 10240, 1, 1 }, + {10, 14, 10240, 3, 2 }, + {10, 14, 51200, 1, 3 }, + {10, 14, 51200, 3, 11}, + {10, 14, 102400, 1, 3 }, + {10, 14, 102400, 3, 3 }, + {10, 14, 512000, 1, 1 }, + {10, 14, 512000, 3, 3 }, + {10, 14, 1024000, 1, 1 }, + {10, 14, 1024000, 3, 3 }, + {20, 24, 1024, 1, 1 }, + {20, 24, 1024, 3, 1 }, + {20, 24, 5120, 1, 2 }, + {20, 24, 5120, 3, 3 }, + {20, 24, 10240, 1, 5 }, + {20, 24, 10240, 3, 5 }, + {20, 24, 51200, 1, 3 }, + {20, 24, 51200, 3, 19}, + {20, 24, 102400, 1, 15}, + {20, 24, 102400, 3, 19}, + {20, 24, 512000, 1, 9 }, + {20, 24, 512000, 3, 19}, + {20, 24, 1024000, 1, 9 }, + {20, 24, 1024000, 3, 19} +}; + +//////////////////////////////////////////////////////////////////////////////// +std::vector > drtc_n_v100 = { + {3, 1, 1024, 1, 1 }, + {3, 1, 1024, 2, 1 }, + {3, 1, 5120, 1, 1 }, + {3, 1, 5120, 2, 1 }, + {3, 1, 10240, 1, 1 }, + {3, 1, 10240, 2, 1 }, + {3, 1, 51200, 1, 1 }, + {3, 1, 51200, 2, 1 }, + {3, 1, 102400, 1, 1 }, + {3, 1, 102400, 2, 1 }, + {3, 1, 512000, 1, 1 }, + {3, 1, 512000, 2, 1 }, + {3, 1, 1024000, 1, 1 }, + {3, 1, 1024000, 2, 1 }, + {6, 3, 1024, 1, 1 }, + {6, 3, 1024, 2, 1 }, + {6, 3, 5120, 1, 1 }, + {6, 3, 5120, 2, 1 }, + {6, 3, 10240, 1, 1 }, + {6, 3, 10240, 2, 1 }, + {6, 3, 51200, 1, 1 }, + {6, 3, 51200, 2, 1 }, + {6, 3, 102400, 1, 1 }, + {6, 3, 102400, 2, 1 }, + {6, 3, 512000, 1, 1 }, + {6, 3, 512000, 2, 1 }, + {6, 3, 1024000, 1, 1 }, + {6, 3, 1024000, 2, 1 }, + {10, 6, 1024, 1, 1 }, + {10, 6, 1024, 2, 1 }, + {10, 6, 5120, 1, 1 }, + {10, 6, 5120, 2, 1 }, + {10, 6, 10240, 1, 1 }, + {10, 6, 10240, 2, 1 }, + {10, 6, 51200, 1, 1 }, + {10, 6, 51200, 2, 1 }, + {10, 6, 102400, 1, 2 }, + {10, 6, 102400, 2, 1 }, + {10, 6, 512000, 1, 1 }, + {10, 6, 512000, 2, 1 }, + {10, 6, 1024000, 1, 1 }, + {10, 6, 1024000, 2, 1 }, + {15, 12, 1024, 1, 1 }, + {15, 12, 1024, 2, 2 }, + {15, 12, 5120, 1, 3 }, + {15, 12, 5120, 2, 1 }, + {15, 12, 10240, 1, 4 }, + {15, 12, 10240, 2, 2 }, + {15, 12, 51200, 1, 3 }, + {15, 12, 51200, 2, 11}, + {15, 12, 102400, 1, 3 }, + {15, 12, 102400, 2, 3 }, + {15, 12, 512000, 1, 9 }, + {15, 12, 512000, 2, 5 }, + {15, 12, 1024000, 1, 9 }, + {15, 12, 1024000, 2, 5 }, + {21, 16, 1024, 1, 1 }, + {21, 16, 1024, 2, 1 }, + {21, 16, 5120, 1, 5 }, + {21, 16, 5120, 2, 2 }, + {21, 16, 10240, 1, 4 }, + {21, 16, 10240, 2, 4 }, + {21, 16, 51200, 1, 5 }, + {21, 16, 51200, 2, 5 }, + {21, 16, 102400, 1, 9 }, + {21, 16, 102400, 2, 5 }, + {21, 16, 512000, 1, 9 }, + {21, 16, 512000, 2, 7 }, + {21, 16, 1024000, 1, 5 }, + {21, 16, 1024000, 2, 7 }, + {28, 25, 1024, 1, 1 }, + {28, 25, 1024, 2, 5 }, + {28, 25, 5120, 1, 5 }, + {28, 25, 5120, 2, 5 }, + {28, 25, 10240, 1, 7 }, + {28, 25, 10240, 2, 5 }, + {28, 25, 51200, 1, 
6 }, + {28, 25, 51200, 2, 5 }, + {28, 25, 102400, 1, 6 }, + {28, 25, 102400, 2, 10}, + {28, 25, 512000, 1, 6 }, + {28, 25, 512000, 2, 14}, + {28, 25, 1024000, 1, 16}, + {28, 25, 1024000, 2, 9 }, + {36, 33, 1024, 1, 6 }, + {36, 33, 1024, 2, 5 }, + {36, 33, 5120, 1, 11}, + {36, 33, 5120, 2, 8 }, + {36, 33, 10240, 1, 6 }, + {36, 33, 10240, 2, 9 }, + {36, 33, 51200, 1, 9 }, + {36, 33, 51200, 2, 9 }, + {36, 33, 102400, 1, 9 }, + {36, 33, 102400, 2, 10}, + {36, 33, 512000, 1, 9 }, + {36, 33, 512000, 2, 13}, + {36, 33, 1024000, 1, 9 }, + {36, 33, 1024000, 2, 13}, + {3, 3, 1024, 1, 1 }, + {3, 3, 1024, 2, 1 }, + {3, 3, 5120, 1, 1 }, + {3, 3, 5120, 2, 1 }, + {3, 3, 10240, 1, 1 }, + {3, 3, 10240, 2, 1 }, + {3, 3, 51200, 1, 1 }, + {3, 3, 51200, 2, 1 }, + {3, 3, 102400, 1, 1 }, + {3, 3, 102400, 2, 1 }, + {3, 3, 512000, 1, 1 }, + {3, 3, 512000, 2, 1 }, + {3, 3, 1024000, 1, 1 }, + {3, 3, 1024000, 2, 1 }, + {6, 6, 1024, 1, 1 }, + {6, 6, 1024, 2, 1 }, + {6, 6, 5120, 1, 1 }, + {6, 6, 5120, 2, 1 }, + {6, 6, 10240, 1, 1 }, + {6, 6, 10240, 2, 1 }, + {6, 6, 51200, 1, 1 }, + {6, 6, 51200, 2, 1 }, + {6, 6, 102400, 1, 1 }, + {6, 6, 102400, 2, 1 }, + {6, 6, 512000, 1, 1 }, + {6, 6, 512000, 2, 1 }, + {6, 6, 1024000, 1, 1 }, + {6, 6, 1024000, 2, 1 }, + {10, 12, 1024, 1, 3 }, + {10, 12, 1024, 2, 3 }, + {10, 12, 5120, 1, 3 }, + {10, 12, 5120, 2, 3 }, + {10, 12, 10240, 1, 5 }, + {10, 12, 10240, 2, 3 }, + {10, 12, 51200, 1, 5 }, + {10, 12, 51200, 2, 3 }, + {10, 12, 102400, 1, 5 }, + {10, 12, 102400, 2, 3 }, + {10, 12, 512000, 1, 5 }, + {10, 12, 512000, 2, 2 }, + {10, 12, 1024000, 1, 5 }, + {10, 12, 1024000, 2, 2 }, + {15, 16, 1024, 1, 3 }, + {15, 16, 1024, 2, 3 }, + {15, 16, 5120, 1, 3 }, + {15, 16, 5120, 2, 3 }, + {15, 16, 10240, 1, 4 }, + {15, 16, 10240, 2, 6 }, + {15, 16, 51200, 1, 7 }, + {15, 16, 51200, 2, 3 }, + {15, 16, 102400, 1, 5 }, + {15, 16, 102400, 2, 3 }, + {15, 16, 512000, 1, 5 }, + {15, 16, 512000, 2, 3 }, + {15, 16, 1024000, 1, 5 }, + {15, 16, 1024000, 2, 4 }, + {21, 25, 1024, 1, 3 }, + {21, 25, 1024, 2, 1 }, + {21, 25, 5120, 1, 5 }, + {21, 25, 5120, 2, 13}, + {21, 25, 10240, 1, 7 }, + {21, 25, 10240, 2, 7 }, + {21, 25, 51200, 1, 5 }, + {21, 25, 51200, 2, 4 }, + {21, 25, 102400, 1, 6 }, + {21, 25, 102400, 2, 4 }, + {21, 25, 512000, 1, 6 }, + {21, 25, 512000, 2, 4 }, + {21, 25, 1024000, 1, 6 }, + {21, 25, 1024000, 2, 4 }, + {28, 33, 1024, 1, 2 }, + {28, 33, 1024, 2, 5 }, + {28, 33, 5120, 1, 11}, + {28, 33, 5120, 2, 8 }, + {28, 33, 10240, 1, 9 }, + {28, 33, 10240, 2, 9 }, + {28, 33, 51200, 1, 6 }, + {28, 33, 51200, 2, 9 }, + {28, 33, 102400, 1, 6 }, + {28, 33, 102400, 2, 9 }, + {28, 33, 512000, 1, 6 }, + {28, 33, 512000, 2, 15}, + {28, 33, 1024000, 1, 6 }, + {28, 33, 1024000, 2, 15}, + {4, 1, 1024, 1, 1 }, + {4, 1, 1024, 3, 1 }, + {4, 1, 5120, 1, 1 }, + {4, 1, 5120, 3, 1 }, + {4, 1, 10240, 1, 1 }, + {4, 1, 10240, 3, 1 }, + {4, 1, 51200, 1, 1 }, + {4, 1, 51200, 3, 1 }, + {4, 1, 102400, 1, 1 }, + {4, 1, 102400, 3, 1 }, + {4, 1, 512000, 1, 1 }, + {4, 1, 512000, 3, 1 }, + {4, 1, 1024000, 1, 1 }, + {4, 1, 1024000, 3, 1 }, + {10, 4, 1024, 1, 1 }, + {10, 4, 1024, 3, 1 }, + {10, 4, 5120, 1, 1 }, + {10, 4, 5120, 3, 1 }, + {10, 4, 10240, 1, 1 }, + {10, 4, 10240, 3, 1 }, + {10, 4, 51200, 1, 1 }, + {10, 4, 51200, 3, 5 }, + {10, 4, 102400, 1, 1 }, + {10, 4, 102400, 3, 1 }, + {10, 4, 512000, 1, 1 }, + {10, 4, 512000, 3, 1 }, + {10, 4, 1024000, 1, 1 }, + {10, 4, 1024000, 3, 1 }, + {20, 11, 1024, 1, 1 }, + {20, 11, 1024, 3, 1 }, + {20, 11, 5120, 1, 6 }, + {20, 11, 5120, 3, 2 }, + {20, 11, 10240, 1, 6 }, + {20, 11, 10240, 
3, 3 }, + {20, 11, 51200, 1, 6 }, + {20, 11, 51200, 3, 5 }, + {20, 11, 102400, 1, 6 }, + {20, 11, 102400, 3, 5 }, + {20, 11, 512000, 1, 5 }, + {20, 11, 512000, 3, 6 }, + {20, 11, 1024000, 1, 6 }, + {20, 11, 1024000, 3, 6 }, + {20, 14, 1024, 1, 1 }, + {20, 14, 1024, 3, 2 }, + {20, 14, 5120, 1, 8 }, + {20, 14, 5120, 3, 2 }, + {20, 14, 10240, 1, 5 }, + {20, 14, 10240, 3, 5 }, + {20, 14, 51200, 1, 5 }, + {20, 14, 51200, 3, 5 }, + {20, 14, 102400, 1, 5 }, + {20, 14, 102400, 3, 5 }, + {20, 14, 512000, 1, 5 }, + {20, 14, 512000, 3, 5 }, + {20, 14, 1024000, 1, 5 }, + {20, 14, 1024000, 3, 5 }, + {35, 24, 1024, 1, 1 }, + {35, 24, 1024, 3, 2 }, + {35, 24, 5120, 1, 5 }, + {35, 24, 5120, 3, 5 }, + {35, 24, 10240, 1, 6 }, + {35, 24, 10240, 3, 9 }, + {35, 24, 51200, 1, 5 }, + {35, 24, 51200, 3, 6 }, + {35, 24, 102400, 1, 5 }, + {35, 24, 102400, 3, 14}, + {35, 24, 512000, 1, 18}, + {35, 24, 512000, 3, 9 }, + {35, 24, 1024000, 1, 18}, + {35, 24, 1024000, 3, 9 }, + {4, 4, 1024, 1, 1 }, + {4, 4, 1024, 3, 1 }, + {4, 4, 5120, 1, 1 }, + {4, 4, 5120, 3, 1 }, + {4, 4, 10240, 1, 1 }, + {4, 4, 10240, 3, 1 }, + {4, 4, 51200, 1, 1 }, + {4, 4, 51200, 3, 1 }, + {4, 4, 102400, 1, 5 }, + {4, 4, 102400, 3, 3 }, + {4, 4, 512000, 1, 1 }, + {4, 4, 512000, 3, 1 }, + {4, 4, 1024000, 1, 1 }, + {4, 4, 1024000, 3, 1 }, + {10, 11, 1024, 1, 1 }, + {10, 11, 1024, 3, 1 }, + {10, 11, 5120, 1, 1 }, + {10, 11, 5120, 3, 1 }, + {10, 11, 10240, 1, 2 }, + {10, 11, 10240, 3, 2 }, + {10, 11, 51200, 1, 1 }, + {10, 11, 51200, 3, 1 }, + {10, 11, 102400, 1, 1 }, + {10, 11, 102400, 3, 1 }, + {10, 11, 512000, 1, 1 }, + {10, 11, 512000, 3, 1 }, + {10, 11, 1024000, 1, 1 }, + {10, 11, 1024000, 3, 1 }, + {10, 14, 1024, 1, 1 }, + {10, 14, 1024, 3, 1 }, + {10, 14, 5120, 1, 1 }, + {10, 14, 5120, 3, 1 }, + {10, 14, 10240, 1, 2 }, + {10, 14, 10240, 3, 7 }, + {10, 14, 51200, 1, 1 }, + {10, 14, 51200, 3, 1 }, + {10, 14, 102400, 1, 1 }, + {10, 14, 102400, 3, 1 }, + {10, 14, 512000, 1, 1 }, + {10, 14, 512000, 3, 1 }, + {10, 14, 1024000, 1, 1 }, + {10, 14, 1024000, 3, 1 }, + {20, 24, 1024, 1, 6 }, + {20, 24, 1024, 3, 3 }, + {20, 24, 5120, 1, 3 }, + {20, 24, 5120, 3, 5 }, + {20, 24, 10240, 1, 7 }, + {20, 24, 10240, 3, 9 }, + {20, 24, 51200, 1, 11}, + {20, 24, 51200, 3, 19}, + {20, 24, 102400, 1, 10}, + {20, 24, 102400, 3, 5 }, + {20, 24, 512000, 1, 14}, + {20, 24, 512000, 3, 6 }, + {20, 24, 1024000, 1, 13}, + {20, 24, 1024000, 3, 5 } +}; diff --git a/include/ceed/jit-source/magma/grad-1d.h b/include/ceed/jit-source/magma/grad-1d.h deleted file mode 100644 index 5eea0ee2f9..0000000000 --- a/include/ceed/jit-source/magma/grad-1d.h +++ /dev/null @@ -1,129 +0,0 @@ -// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -// macros to abstract access of shared memory and reg. file -#define sT(i, j) sT[(j)*P_ + (i)] - -////////////////////////////////////////////////////////////////////////////////////////// -// grad basis action (1D) -template -static __device__ __inline__ void magma_grad_1d_device(const T *sT, magma_trans_t transT, T *sU[NCOMP_], T *sV[NCOMP_], const int tx) { - // Assumptions - // 1. 1D threads of size max(P_,Q_) - // 2. sU[i] is 1xP_: in shared memory - // 3. sV[i] is 1xQ_: in shared memory - // 4. P_roduct per component is one row (1xP_) times T matrix (P_xQ_) => one row (1xQ_) - // 5. 
Each thread computes one entry in sV[i] - // 6. Must sync before and after call - // 7. Note that the layout for U and V is different from 2D/3D problem - - T rv; - if (tx < Q_) { - for (int icomp = 0; icomp < NCOMP_; icomp++) { - rv = (transT == MagmaTrans) ? sV[icomp][tx] : 0.0; - for (int i = 0; i < P_; i++) { - rv += sU[icomp][i] * sT(i, tx); - } - sV[icomp][tx] = rv; - } - } -} - -////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ, MAGMA_MAXTHREADS_1D)) __global__ - void magma_gradn_1d_kernel(const CeedScalar *dTinterp, const CeedScalar *dTgrad, const CeedScalar *dU, const int estrdU, const int cstrdU, - const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaNoTrans; - - if (elem_id >= nelem) return; - - CeedScalar *sU[NCOMP]; - CeedScalar *sV[NCOMP]; - - // shift global memory pointers by elem stride - dU += elem_id * estrdU; - dV += elem_id * estrdV; - - // assign shared memory pointers - CeedScalar *sT = (CeedScalar *)(shared_data); - CeedScalar *sW = sT + P * Q; - sU[0] = sW + ty * NCOMP * (P + Q); - sV[0] = sU[0] + (NCOMP * 1 * P); - for (int icomp = 1; icomp < NCOMP; icomp++) { - sU[icomp] = sU[icomp - 1] + (1 * P); - sV[icomp] = sV[icomp - 1] + (1 * Q); - } - - // read T - if (ty == 0) { - dread_T_gm2sm(tx, transT, dTgrad, sT); - } - - // read U - read_1d(dU, cstrdU, sU, tx); - - __syncthreads(); - magma_grad_1d_device(sT, transT, sU, sV, tx); - __syncthreads(); - - // write V - write_1d(sV, dV, cstrdV, tx); -} - -////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ, MAGMA_MAXTHREADS_1D)) __global__ - void magma_gradt_1d_kernel(const CeedScalar *dTinterp, const CeedScalar *dTgrad, const CeedScalar *dU, const int estrdU, const int cstrdU, - const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaTrans; - - if (elem_id >= nelem) return; - - CeedScalar *sU[NCOMP]; - CeedScalar *sV[NCOMP]; - - // shift global memory pointers by elem stride - dU += elem_id * estrdU; - dV += elem_id * estrdV; - - // assign shared memory pointers - CeedScalar *sT = (CeedScalar *)(shared_data); - CeedScalar *sW = sT + Q * P; - sU[0] = sW + ty * NCOMP * (Q + P); - sV[0] = sU[0] + (NCOMP * 1 * Q); - for (int icomp = 1; icomp < NCOMP; icomp++) { - sU[icomp] = sU[icomp - 1] + (1 * Q); - sV[icomp] = sV[icomp - 1] + (1 * P); - } - - // read T - if (ty == 0) { - dread_T_gm2sm(tx, transT, dTgrad, sT); - } - - // read U - read_1d(dU, cstrdU, sU, tx); - - // read V - read_1d(dV, cstrdV, sV, tx); - - __syncthreads(); - magma_grad_1d_device(sT, transT, sU, sV, tx); - __syncthreads(); - - // write V - write_1d(sV, dV, cstrdV, tx); -} diff --git a/include/ceed/jit-source/magma/grad-2d.h b/include/ceed/jit-source/magma/grad-2d.h deleted file mode 100644 index 1f2763ac9f..0000000000 --- a/include/ceed/jit-source/magma/grad-2d.h +++ /dev/null @@ -1,180 +0,0 @@ -// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED 
contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -// macros to abstract access of shared memory and reg. file -#define sT(i, j) sT[(j)*P_ + (i)] -#define sTmp(i, j, ldw) sTmp[(j) * (ldw) + (i)] - -////////////////////////////////////////////////////////////////////////////////////////// -// grad basis action (2D) -// This function is called two times at a higher level for 2D -// DIM_U -- for the size of rU[DIM_U * NCOMP_ * MAXP_Q_] -// DIM_V -- for the size of rV[DIM_V * NCOMP_ * MAXP_Q_] -// iDIM_ -- the index of the outermost loop over dimensions in grad -// iDIM_U -- which dim index of rU is accessed (always 0 for notrans, 0 or 1 for trans) -// iDIM_V -- which dim index of rV is accessed (0 or 1 for notrans, always 0 for trans) -// the scalar beta is used to specify whether to accumulate to rV, or overwrite it -template -static __device__ __inline__ void magma_grad_2d_device(const T *sTinterp, const T *sTgrad, T rU[DIM_U][NCOMP_][rUsize], T rV[DIM_V][NCOMP_][rVsize], - T beta, const int tx, T rTmp, T *swork) { - // Assumptions - // 0. This device routine applies grad for one dim only (iDIM_), so it should be called twice for 2D - // 1. 1D threads of size max(P_,Q_) - // 2. input: rU[DIM_U x NCOMP_ x P_] in registers (per thread) - // 3. output: rV[DIM_V x NCOMP_ x Q_] in registers (per thread) - // 4. Two products per each (dim,component) pair - // 4.1 Batch P_ of (1xP_) matrices times (P_xQ_) matrix => Batch P_ of (1xQ_) matrices - // 4.2 Batch 1 of (Q_xP_) matrix times (P_xQ_) matrix => (Q_xQ_) matrix - // 6. Each thread computes one row of the output of each product - // 7. Sync is recommended before and after the call - - for (int icomp = 0; icomp < NCOMP_; icomp++) { - // 1st product -- Batch P_ of (1xP_) matrices [reg] x (P_xQ_) [shmem] => Batch P_ of (1xQ_) matrices - // the batch output P_ x (1xQ_) is written on the fly to shmem - if (tx < P_) { - const int batchid = tx; - const int sld = 1; - const T *sT = (iDIM_ == 0) ? sTgrad : sTinterp; - T *sTmp = swork + batchid * (1 * Q_); - for (int j = 0; j < Q_; j++) { - rTmp = 0.0; - for (int i = 0; i < P_; i++) { - rTmp += rU[iDIM_U][icomp][i] * sT(i, j); - } - sTmp(0, j, sld) = rTmp; - } - } // end of: if (tx < P_) - __syncthreads(); - - // 2nd product -- Batch 1 of a (Q_xP_) matrix [shmem] x (P_xQ_) [shmem] => (Q_xQ_) matrix [reg] - if (tx < Q_) { - const int batchid = 0; - const int sld = Q_; - const T *sT = (iDIM_ == 1) ? 
sTgrad : sTinterp; - T *sTmp = swork + batchid * (Q_ * P_); - for (int j = 0; j < Q_; j++) { - rTmp = 0.0; - for (int i = 0; i < P_; i++) { - rTmp += sTmp(tx, i, sld) * sT(i, j); - } - rV[iDIM_V][icomp][j] *= beta; - rV[iDIM_V][icomp][j] += rTmp; - } - } - __syncthreads(); - } // loop over NCOMP_ -} - -////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ, MAGMA_MAXTHREADS_2D)) __global__ - void magma_gradn_2d_kernel(const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, const CeedScalar *dU, const int estrdU, const int cstrdU, - const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaNoTrans; - - if (elem_id >= nelem) return; - - CeedScalar rU[1][NCOMP][P] = {0.0}; // here DIMU = 1, but might be different for a fused operator - CeedScalar rV[1][NCOMP][Q] = {0.0}; // here DIMV = 1, but might be different for a fused operator - CeedScalar rTmp = 0.0; - - // shift global memory pointers by elem stride - dU += elem_id * estrdU; - dV += elem_id * estrdV; - - // assign shared memory pointers - CeedScalar *sTinterp = (CeedScalar *)(shared_data); - CeedScalar *sTgrad = sTinterp + P * Q; - CeedScalar *sTmp = sTgrad + P * Q; - sTmp += ty * (P * MAXPQ); - - // read T - if (ty == 0) { - dread_T_gm2sm(tx, transT, dinterp1d, sTinterp); - dread_T_gm2sm(tx, transT, dgrad1d, sTgrad); - } - - // No need to read V ( required only in transposed grad ) - const CeedScalar beta = 0.0; - - /* read U (idim = 0 for dU, iDIM = 0 for rU) -- - there is a sync at the end of this function */ - readU_2d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); - - /* first call (iDIM = 0, iDIMU = 0, iDIMV = 0) -- - output from rV[0][][] into dV (idim = 0) */ - magma_grad_2d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); - /* there is a sync at the end of magma_grad_2d_device */ - writeV_2d(dV + (0 * dstrdV), cstrdV, rV, tx); - - /* second call (iDIM = 1, iDIMU = 0, iDIMV = 0) -- - output from rV[0][][] into dV (idim = 1) */ - magma_grad_2d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); - /* there is a sync at the end of magma_grad_2d_device */ - writeV_2d(dV + (1 * dstrdV), cstrdV, rV, tx); -} - -////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ, MAGMA_MAXTHREADS_2D)) __global__ - void magma_gradt_2d_kernel(const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, const CeedScalar *dU, const int estrdU, const int cstrdU, - const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaTrans; - - if (elem_id >= nelem) return; - - CeedScalar rU[1][NCOMP][Q] = {0.0}; // here DIMU = 1, but might be different for a fused operator - CeedScalar rV[1][NCOMP][P] = {0.0}; // here DIMV = 1, but might be different for a fused operator - CeedScalar rTmp = 0.0; - - // shift global memory pointers by elem stride - dU += elem_id * estrdU; - dV += elem_id * estrdV; - - // assign shared memory pointers - CeedScalar *sTinterp = (CeedScalar *)(shared_data); - CeedScalar *sTgrad = 
sTinterp + Q * P; - CeedScalar *sTmp = sTgrad + Q * P; - sTmp += ty * (Q * MAXPQ); - - // read T - if (ty == 0) { - dread_T_gm2sm(tx, transT, dinterp1d, sTinterp); - dread_T_gm2sm(tx, transT, dgrad1d, sTgrad); - } - __syncthreads(); - - /* read V (since this is transposed mode -- - idim = 0 for dV, iDIM = 0 for rV) */ - const CeedScalar beta = 1.0; - readV_2d(dV + (0 * dstrdV), cstrdV, rV, tx); - - /* read U (idim = 0 for dU, iDIM = 0 for rU) -- - there is a sync at the end of this function */ - readU_2d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); - /* first call (iDIM = 0, iDIMU = 0, iDIMV = 0) */ - magma_grad_2d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); - /* there is a sync at the end of magma_grad_2d_device */ - - /* read U (idim = 1 for dU, iDIM = 0 for rU) -- - there is a sync at the end of this function */ - readU_2d(dU + (1 * dstrdU), cstrdU, rU, sTmp, tx); - /* second call (iDIM = 1, iDIMU = 0, iDIMV = 0) */ - magma_grad_2d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); - /* there is a sync at the end of magma_grad_2d_device */ - - // write V - writeV_2d(dV + (0 * dstrdV), cstrdV, rV, tx); -} diff --git a/include/ceed/jit-source/magma/grad-3d.h b/include/ceed/jit-source/magma/grad-3d.h deleted file mode 100644 index 072c1da2f8..0000000000 --- a/include/ceed/jit-source/magma/grad-3d.h +++ /dev/null @@ -1,215 +0,0 @@ -// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -// macros to abstract access of shared memory and reg. file -#define sT(i, j) sT[(j)*P_ + (i)] -#define sTmp(i, j, ldw) sTmp[(j) * (ldw) + (i)] -#define sTmp2(i, j, ldw) sTmp2[(j) * (ldw) + (i)] - -////////////////////////////////////////////////////////////////////////////////////////// -// grad basis action (3D) -// This function is called three times at a higher level for 3D -// DIM_U -- for the size of rU[DIM_U * NCOMP_ * MAXP_Q_] -// DIM_V -- for the size of rV[DIM_V * NCOMP_ * MAXP_Q_] -// iDIM_ -- the index of the outermost loop over dimensions in grad -// iDIM_U -- which dim index of rU is accessed (always 0 for notrans, 0, 1, or 2 for trans) -// iDIM_V -- which dim index of rV is accessed (0, 1, or 2 for notrans, always 0 for trans) -// the scalar beta is used to specify whether to accumulate to rV, or overwrite it -template -static __device__ __inline__ void magma_grad_3d_device(const T *sTinterp, const T *sTgrad, T rU[DIM_U][NCOMP_][rUsize], T rV[DIM_V][NCOMP_][rVsize], - T beta, const int tx, T rTmp, T *swork) { - // Assumptions - // 0. This device routine applies grad for one dim only (iDIM_), so it should be thrice for 3D - // 1. 1D threads of size max(P_,Q_)^2 - // 2. input: rU[DIM_U x NCOMP_ x rUsize] in registers (per thread) - // 3. output: rV[DIM_V x NCOMP_ x rVsize] in registers (per thread) - // 4. Three products per each (dim,component) pair - // 4.1 Batch P_^2 of (1xP_) matrices times (P_xQ_) matrix => Batch P_^2 of (1xQ_) matrices - // 4.2 Batch P_ of (Q_xP_) matrices times (P_xQ_) matrix => Batch P_ of (Q_xQ_) matrices - // 4.3 Batch 1 of (Q_^2xP_) matrix times (P_xQ_) matrix => (Q_^2xQ_) matrix - // 6. Each thread computes one row of the output of each product - // 7. 
Sync is recommended before and after the call - - T *sW1 = swork; - T *sW2 = sW1 + P_ * P_ * Q_; - for (int icomp = 0; icomp < NCOMP_; icomp++) { - // Batch P_^2 of (1xP_) matrices [reg] times (P_xQ_) matrix [shmem] => Batch P_^2 of (1xQ_) matrices [shmem] - if (tx < (P_ * P_)) { - const int batchid = tx; - const int sld = 1; - const T *sT = (iDIM_ == 0) ? sTgrad : sTinterp; - T *sTmp = sW1 + batchid * (1 * Q_); - for (int j = 0; j < Q_; j++) { - rTmp = 0.0; - for (int i = 0; i < P_; i++) { - rTmp += rU[iDIM_U][icomp][i] * sT(i, j); - } - sTmp(0, j, sld) = rTmp; - } - } // end of: if (tx < P_*P_) - __syncthreads(); - - // Batch P_ of (Q_xP_) matrices [shmem] times (P_xQ_) matrix [shmem] => Batch P_ of (Q_xQ_) matrices [reg] - if (tx < (P_ * Q_)) { - const int batchid = tx / Q_; - const int tx_ = tx % Q_; - const int sld = Q_; - const T *sT = (iDIM_ == 1) ? sTgrad : sTinterp; - T *sTmp = sW1 + batchid * (Q_ * P_); // sTmp is input - T *sTmp2 = sW2 + batchid * (Q_ * Q_); // sTmp2 is output - for (int j = 0; j < Q_; j++) { - rTmp = 0.0; - for (int i = 0; i < P_; i++) { - rTmp += sTmp(tx_, i, sld) * sT(i, j); - } - sTmp2(tx_, j, sld) = rTmp; - } - } - __syncthreads(); - - // Batch 1 of (Q_^2xP_) matrices [shmem] times (P_xQ_) matrix [shmem] => Batch 1 of (Q_^2xQ_) matrices [reg] - if (tx < (Q_ * Q_)) { - // No need to declare batchid = (tx / Q_^2) = always zero - // No need to declare tx_ = (tx_ % Q_^2) = always tx - const int sld = Q_ * Q_; - const T *sT = (iDIM_ == 2) ? sTgrad : sTinterp; - T *sTmp = sW2; // sTmp is input - for (int j = 0; j < Q_; j++) { - rTmp = 0.0; - for (int i = 0; i < P_; i++) { - rTmp += sTmp(tx, i, sld) * sT(i, j); - } - rV[iDIM_V][icomp][j] *= beta; - rV[iDIM_V][icomp][j] += rTmp; - } - } - __syncthreads(); - } // loop over NCOMP_ -} - -////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ *MAXPQ, MAGMA_MAXTHREADS_3D)) __global__ - void magma_gradn_3d_kernel(const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, const CeedScalar *dU, const int estrdU, const int cstrdU, - const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaNoTrans; - - if (elem_id >= nelem) return; - - CeedScalar rU[1][NCOMP][P] = {0.0}; // here DIMU = 1, but might be different for a fused operator - CeedScalar rV[1][NCOMP][Q] = {0.0}; // here DIMV = 1, but might be different for a fused operator - CeedScalar rTmp = 0.0; - - // shift global memory pointers by elem stride - dU += elem_id * estrdU; - dV += elem_id * estrdV; - - // assign shared memory pointers - CeedScalar *sTinterp = (CeedScalar *)(shared_data); - CeedScalar *sTgrad = sTinterp + P * Q; - CeedScalar *sTmp = sTgrad + P * Q; - sTmp += ty * (max(P * P * P, (P * P * Q) + (P * Q * Q))); - - // read T - if (ty == 0) { - dread_T_gm2sm(tx, transT, dinterp1d, sTinterp); - dread_T_gm2sm(tx, transT, dgrad1d, sTgrad); - } - __syncthreads(); - - // No need to read V ( required only in transposed grad ) - const CeedScalar beta = 0.0; - - /* read U (idim = 0 for dU, iDIM = 0 for rU) -- - there is a sync at the end of this function */ - readU_3d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); - - /* first call (iDIM = 0, iDIMU = 0, iDIMV = 0) -- - output from rV[0][][] into dV (idim = 0) */ - magma_grad_3d_device(sTinterp, 
sTgrad, rU, rV, beta, tx, rTmp, sTmp); - /* there is a sync at the end of magma_grad_3d_device */ - writeV_3d(dV + (0 * dstrdV), cstrdV, rV, tx); - - /* second call (iDIM = 1, iDIMU = 0, iDIMV = 0) -- - output from rV[0][][] into dV (idim = 1) */ - magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); - /* there is a sync at the end of magma_grad_3d_device */ - writeV_3d(dV + (1 * dstrdV), cstrdV, rV, tx); - - /* third call (iDIM = 2, iDIMU = 0, iDIMV = 0) -- - output from rV[0][][] into dV (idim = 2) */ - magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); - /* there is a sync at the end of magma_grad_3d_device */ - writeV_3d(dV + (2 * dstrdV), cstrdV, rV, tx); -} - -////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ *MAXPQ, MAGMA_MAXTHREADS_3D)) __global__ - void magma_gradt_3d_kernel(const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, const CeedScalar *dU, const int estrdU, const int cstrdU, - const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaTrans; - - if (elem_id >= nelem) return; - - CeedScalar rU[1][NCOMP][Q] = {0.0}; // here DIMU = 1, but might be different for a fused operator - CeedScalar rV[1][NCOMP][P] = {0.0}; // here DIMV = 1, but might be different for a fused operator - CeedScalar rTmp = 0.0; - - // shift global memory pointers by elem stride - dU += elem_id * estrdU; - dV += elem_id * estrdV; - - // assign shared memory pointers - CeedScalar *sTinterp = (CeedScalar *)(shared_data); - CeedScalar *sTgrad = sTinterp + Q * P; - CeedScalar *sTmp = sTgrad + Q * P; - sTmp += ty * (max(Q * Q * Q, (Q * Q * P) + (Q * P * P))); - - // read T - if (ty == 0) { - dread_T_gm2sm(tx, transT, dinterp1d, sTinterp); - dread_T_gm2sm(tx, transT, dgrad1d, sTgrad); - } - __syncthreads(); - - // read V (since this is transposed mode) - const CeedScalar beta = 1.0; - readV_3d(dV + (0 * dstrdV), cstrdV, rV, tx); - - /* read U (idim = 0 for dU, iDIM = 0 for rU) -- - there is a sync at the end of this function */ - readU_3d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); - /* then first call (iDIM = 0, iDIMU = 0, iDIMV = 0) */ - magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); - /* there is a sync at the end of magma_grad_3d_device */ - - /* read U (idim = 1 for dU, iDIM = 0 for rU) -- - there is a sync at the end of this function */ - readU_3d(dU + (1 * dstrdU), cstrdU, rU, sTmp, tx); - /* then second call (iDIM = 1, iDIMU = 0, iDIMV = 0) */ - magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); - /* there is a sync at the end of magma_grad_3d_device */ - - /* read U (idim = 2 for dU, iDIM = 0 for rU) -- - there is a sync at the end of this function */ - readU_3d(dU + (2 * dstrdU), cstrdU, rU, sTmp, tx); - /* then third call (iDIM = 2, iDIMU = 0, iDIMV = 0) */ - magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); - /* there is a sync at the end of magma_grad_3d_device */ - - // write V - writeV_3d(dV + (0 * dstrdV), cstrdV, rV, tx); -} diff --git a/include/ceed/jit-source/magma/grad-nontensor.h b/include/ceed/jit-source/magma/grad-nontensor.h deleted file mode 100644 index 164f2c75a5..0000000000 --- a/include/ceed/jit-source/magma/grad-nontensor.h +++ /dev/null @@ -1,120 +0,0 
@@ -// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#ifndef CEED_MAGMA_GRAD_NONTENSOR_H -#define CEED_MAGMA_GRAD_NONTENSOR_H - -//////////////////////////////////////////////////////////////////////////////// -// Different A's and C's, same B -extern "C" __global__ __launch_bounds__(Q *MAGMA_NONTENSOR_BASIS_NTCOL(Q)) void magma_grad_nontensor_n(magma_trans_t transA, magma_trans_t transB, - int n, CeedScalar const *dA, int ldda, - CeedScalar const *dB, int lddb, CeedScalar *dC, - int lddc) { - MAGMA_DEVICE_SHARED(CeedScalar, shared_data); - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int bx = blockIdx.x; - const int id = bx * blockDim.y + ty; - const int nblocks = MAGMA_CEILDIV(n, NB_GRAD_N); - const int myn = min(NB_GRAD_N, n - id * NB_GRAD_N); - - const double alpha = MAGMA_D_ONE; - - dB += id * NB_GRAD_N * lddb; - dC += id * NB_GRAD_N * lddc; - - // A is P x Q - const int slda = P; - const int sldb = P; - CeedScalar *sA = (CeedScalar *)(shared_data); - CeedScalar *sB = sA + Q * P; - sB += ty * sldb * NB_GRAD_N; - - // read B once for all C's - if (id < nblocks) { - read_B_g2s_1D_nosync(tx, myn, dB, lddb, sB, sldb); - } - __syncthreads(); - - // unrolling this loop yields dramatic performance drop using hipcc - // let the compiler decide (no pragma unroll) - for (int idim = 0; idim < DIM; idim++) { - // read A (P x Q) using all threads - CeedScalar rA[P] = {MAGMA_D_ZERO}; - read_A_trans_g2r_1D_nosync(tx, ty, dA, ldda, sA, slda, rA); - - __syncthreads(); - - // init rC - CeedScalar rC[NB_GRAD_N] = {MAGMA_D_ZERO}; - if (id < nblocks) { - mul_rAsBrC_1D_nosync(tx, alpha, rA, sB, sldb, rC); - } - __syncthreads(); - - if (id < nblocks) { - write_C_r2g_1D_nosync(tx, myn, rC, dC, lddc); - } - - dA += Q * P; - dC += Q * n; - } -} - -//////////////////////////////////////////////////////////////////////////////// -// Different A's and B's, same C -extern "C" __global__ __launch_bounds__(P *MAGMA_NONTENSOR_BASIS_NTCOL(P)) void magma_grad_nontensor_t(magma_trans_t transA, magma_trans_t transB, - int n, CeedScalar const *dA, int ldda, - CeedScalar const *dB, int lddb, CeedScalar *dC, - int lddc) { - MAGMA_DEVICE_SHARED(CeedScalar, shared_data); - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int bx = blockIdx.x; - const int id = bx * blockDim.y + ty; - const int nblocks = MAGMA_CEILDIV(n, NB_GRAD_T); - const int myn = min(NB_GRAD_T, n - id * NB_GRAD_T); - if (id >= nblocks) return; - - dB += id * NB_GRAD_T * lddb; - dC += id * NB_GRAD_T * lddc; - - const double alpha = MAGMA_D_ONE; - - // A is P x Q - const int sldb = Q; - CeedScalar *sB = (CeedScalar *)(shared_data); - sB += ty * sldb * NB_GRAD_T; - - // init rC - CeedScalar rC[NB_GRAD_T] = {MAGMA_D_ZERO}; - - CeedScalar rA[Q] = {MAGMA_D_ZERO}; - - // unrolling this loop yields dramatic performance drop using hipcc - // let the compiler decide (no pragma unroll) - for (int idim = 0; idim < DIM; idim++) { - __syncthreads(); - // read A - read_A_notrans_g2r_1D_nosync(tx, dA, ldda, NULL, 0, rA); - - // read B - read_B_g2s_1D_nosync(tx, myn, dB, lddb, sB, sldb); - __syncthreads(); - - mul_rAsBrC_1D_nosync(tx, alpha, rA, sB, sldb, rC); - - // advance A and B - dA += P * Q; - dB += Q * n; - } - write_C_r2g_1D_nosync(tx, myn, rC, dC, lddc); -} - -#endif // 
CEED_MAGMA_GRAD_NONTENSOR_H diff --git a/include/ceed/jit-source/magma/interp-1d.h b/include/ceed/jit-source/magma/interp-1d.h deleted file mode 100644 index 3ca89e3c92..0000000000 --- a/include/ceed/jit-source/magma/interp-1d.h +++ /dev/null @@ -1,129 +0,0 @@ -// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -// macros to abstract access of shared memory and reg. file -#define sT(i, j) sT[(j)*P_ + (i)] - -////////////////////////////////////////////////////////////////////////////////////////// -// interp basis action (1D) -template -static __device__ __inline__ void magma_interp_1d_device(const T *sT, magma_trans_t transT, T *sU[NCOMP_], T *sV[NCOMP_], const int tx) { - // Assumptions - // 1. 1D threads of size max(P_,Q_) - // 2. sU[i] is 1xP_: in shared memory - // 3. sV[i] is 1xQ_: in shared memory - // 4. P_roduct per component is one row (1xP_) times T matrix (P_xQ_) => one row (1xQ_) - // 5. Each thread computes one entry in sV[i] - // 6. Must sync before and after call - // 7. Note that the layout for U and V is different from 2D/3D problem - - T rv; - if (tx < Q_) { - for (int icomp = 0; icomp < NCOMP_; icomp++) { - rv = (transT == MagmaTrans) ? sV[icomp][tx] : 0.0; - for (int i = 0; i < P_; i++) { - rv += sU[icomp][i] * sT(i, tx); // sT[tx * P_ + i]; - } - sV[icomp][tx] = rv; - } - } -} - -////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ, MAGMA_MAXTHREADS_1D)) __global__ - void magma_interpn_1d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV, - const int cstrdV, const int nelem) { - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaNoTrans; - - if (elem_id >= nelem) return; - - CeedScalar *sU[NCOMP]; - CeedScalar *sV[NCOMP]; - - // shift global memory pointers by elem stride - dU += elem_id * estrdU; - dV += elem_id * estrdV; - - // assign shared memory pointers - CeedScalar *sT = (CeedScalar *)(shared_data); - CeedScalar *sW = sT + P * Q; - sU[0] = sW + ty * NCOMP * (P + Q); - sV[0] = sU[0] + (NCOMP * 1 * P); - for (int icomp = 1; icomp < NCOMP; icomp++) { - sU[icomp] = sU[icomp - 1] + (1 * P); - sV[icomp] = sV[icomp - 1] + (1 * Q); - } - - // read T - if (ty == 0) { - dread_T_gm2sm(tx, transT, dT, sT); - } - - // read U - read_1d(dU, cstrdU, sU, tx); - - __syncthreads(); - magma_interp_1d_device(sT, transT, sU, sV, tx); - __syncthreads(); - - // write V - write_1d(sV, dV, cstrdV, tx); -} - -////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ, MAGMA_MAXTHREADS_1D)) __global__ - void magma_interpt_1d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV, - const int cstrdV, const int nelem) { - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaTrans; - - if (elem_id >= nelem) return; - - CeedScalar *sU[NCOMP]; - CeedScalar *sV[NCOMP]; - - // shift 
global memory pointers by elem stride - dU += elem_id * estrdU; - dV += elem_id * estrdV; - - // assign shared memory pointers - CeedScalar *sT = (CeedScalar *)(shared_data); - CeedScalar *sW = sT + Q * P; - sU[0] = sW + ty * NCOMP * (Q + P); - sV[0] = sU[0] + (NCOMP * 1 * Q); - for (int icomp = 1; icomp < NCOMP; icomp++) { - sU[icomp] = sU[icomp - 1] + (1 * Q); - sV[icomp] = sV[icomp - 1] + (1 * P); - } - - // read T - if (ty == 0) { - dread_T_gm2sm(tx, transT, dT, sT); - } - - // read U - read_1d(dU, cstrdU, sU, tx); - - // read V - read_1d(dV, cstrdV, sV, tx); - - __syncthreads(); - magma_interp_1d_device(sT, transT, sU, sV, tx); - __syncthreads(); - - // write V - write_1d(sV, dV, cstrdV, tx); -} diff --git a/include/ceed/jit-source/magma/interp-2d.h b/include/ceed/jit-source/magma/interp-2d.h deleted file mode 100644 index 901128baa2..0000000000 --- a/include/ceed/jit-source/magma/interp-2d.h +++ /dev/null @@ -1,146 +0,0 @@ -// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -// macros to abstract access of shared memory and reg. file -#define sT(i, j) sT[(j)*P_ + (i)] -#define sTmp(i, j, ldw) sTmp[(j) * (ldw) + (i)] - -////////////////////////////////////////////////////////////////////////////////////////// -// interp basis action (2D) -template -static __device__ __inline__ void magma_interp_2d_device(const T *sT, magma_trans_t transT, T rU[DIM_U][NCOMP_][rUsize], T rV[DIM_V][NCOMP_][rVsize], - const int tx, T rTmp, T *swork) { - // Assumptions - // 1. 1D threads of size max(P_,Q_) - // 2. input: rU[DIM_U x NCOMP_ x rUsize] in registers (per thread) - // 3. output: rV[DIM_V x NCOMP_ x rVsize] in registers (per thread) - // 4. Two products per component - // 4.1 Batch P_ of (1xP_) matrices times (P_xQ_) matrix => Batch P_ of (1xQ_) matrices - // 4.2 Batch 1 of (Q_xP_) matrix times (P_xQ_) matrix => (Q_xQ_) matrix - // 5. Each thread computes one row of the output of each product - // 6. 
Sync is recommended before and after the call - - for (int icomp = 0; icomp < NCOMP_; icomp++) { - // 1st product -- Batch P_ of (1xP_) matrices [reg] x (P_xQ_) [shmem] => Batch P_ of (1xQ_) matrices - // the batch output P_ x (1xQ_) is written on the fly to shmem - if (tx < P_) { - const int batchid = tx; - const int sld = 1; - T *sTmp = swork + batchid * (1 * Q_); - for (int j = 0; j < Q_; j++) { - rTmp = 0.0; - for (int i = 0; i < P_; i++) { - rTmp += rU[0][icomp][i] * sT(i, j); - } - sTmp(0, j, sld) = rTmp; - } - } // end of: if (tx < P_) - __syncthreads(); - - // 2nd product -- Batch 1 of a (Q_xP_) matrix [shmem] x (P_xQ_) [shmem] => (Q_xQ_) matrix [reg] - if (tx < Q_) { - const int batchid = 0; - const int sld = Q_; - T *sTmp = swork + batchid * (Q_ * P_); - for (int j = 0; j < Q_; j++) { - rTmp = 0.0; - for (int i = 0; i < P_; i++) { - rTmp += sTmp(tx, i, sld) * sT(i, j); - } - rV[0][icomp][j] += rTmp; - } - } - __syncthreads(); - } -} - -////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ, MAGMA_MAXTHREADS_2D)) __global__ - void magma_interpn_2d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV, - const int cstrdV, const int nelem) { - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaNoTrans; - - if (elem_id >= nelem) return; - - CeedScalar rU[1][NCOMP][P] = {0.0}; // for a non fused operator DIM is always 1 - CeedScalar rV[1][NCOMP][Q] = {0.0}; // for a non fused operator DIM is always 1 - CeedScalar rTmp = 0.0; - - // shift global memory pointers by elem stride - dU += elem_id * estrdU; - dV += elem_id * estrdV; - - // assign shared memory pointers - CeedScalar *sT = (CeedScalar *)(shared_data); - CeedScalar *sTmp = sT + P * Q; - sTmp += ty * (P * MAXPQ); - - // read T - if (ty == 0) { - dread_T_gm2sm(tx, transT, dT, sT); - } - - // read U -- there is a sync at the end of this function - readU_2d(dU, cstrdU, rU, sTmp, tx); - - // no sync needed here -- readU_2d already syncs at the end - magma_interp_2d_device(sT, transT, rU, rV, tx, rTmp, sTmp); - __syncthreads(); - - // write V - writeV_2d(dV, cstrdV, rV, tx); -} - -////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ, MAGMA_MAXTHREADS_2D)) __global__ - void magma_interpt_2d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV, - const int cstrdV, const int nelem) { - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaTrans; - - if (elem_id >= nelem) return; - - CeedScalar rU[1][NCOMP][Q] = {0.0}; // for a non fused operator DIM is always 1 - CeedScalar rV[1][NCOMP][P] = {0.0}; // for a non fused operator DIM is always 1 - CeedScalar rTmp = 0.0; - - // shift global memory pointers by elem stride - dU += elem_id * estrdU; - dV += elem_id * estrdV; - - // assign shared memory pointers - CeedScalar *sT = (CeedScalar *)(shared_data); - CeedScalar *sTmp = sT + Q * P; - sTmp += ty * (Q * MAXPQ); - - // read T - if (ty == 0) { - dread_T_gm2sm(tx, transT, dT, sT); - } - - // read V - readV_2d(dV, cstrdV, rV, tx); - - // read U -- there is a 
sync at the end of this function - readU_2d(dU, cstrdU, rU, sTmp, tx); - - // no sync needed here -- readU_2d already syncs at the end - magma_interp_2d_device(sT, transT, rU, rV, tx, rTmp, sTmp); - __syncthreads(); - - // write V - writeV_2d(dV, cstrdV, rV, tx); -} diff --git a/include/ceed/jit-source/magma/interp-3d.h b/include/ceed/jit-source/magma/interp-3d.h deleted file mode 100644 index a886910a3d..0000000000 --- a/include/ceed/jit-source/magma/interp-3d.h +++ /dev/null @@ -1,174 +0,0 @@ -// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -// macros to abstract access of shared memory and reg. file -#define sT(i, j) sT[(j)*P_ + (i)] -#define sTmp(i, j, ldw) sTmp[(j) * (ldw) + (i)] - -////////////////////////////////////////////////////////////////////////////////////////// -// interp basis action (3D) -template -static __device__ __inline__ void magma_interp_3d_device(const T *sT, magma_trans_t transT, T rU[DIM_U][NCOMP_][rUsize], T rV[DIM_V][NCOMP_][rVsize], - const int tx, T rTmp[Q_], T *swork) { - // Assumptions - // 1. 1D threads of size max(P_,Q_)^2 - // 2. input: rU[DIM_U x NCOMP_ x rUsize] in registers (per thread) - // 3. output: rV[DIM_V x NCOMP_ x rVsize] in registers (per thread) - // 4. Three products per component - // 4.1 Batch P_^2 of (1xP_) matrices times (P_xQ_) matrix => Batch P_^2 of (1xQ_) matrices - // 4.2 Batch P_ of (Q_xP_) matrices times (P_xQ_) matrix => Batch P_ of (Q_xQ_) matrices - // 4.3 Batch 1 of (Q_^2xP_) matrix times (P_xQ_) matrix => (Q_^2xQ_) matrix - // 5. Each thread computes one row of the output of each product - // 6. 
Sync is recommended before and after the call - - for (int icomp = 0; icomp < NCOMP_; icomp++) { - // Batch P_^2 of (1xP_) matrices [reg] times (P_xQ_) matrix [shmem] => Batch P_^2 of (1xQ_) matrices [shmem] - if (tx < (P_ * P_)) { - const int batchid = tx; - const int sld = 1; - T *sTmp = swork + batchid * (1 * Q_); - for (int j = 0; j < Q_; j++) { - rTmp[0] = 0.0; - for (int i = 0; i < P_; i++) { - rTmp[0] += rU[0][icomp][i] * sT(i, j); - } - sTmp(0, j, sld) = rTmp[0]; - } - } // end of: if (tx < P_*P_) - __syncthreads(); - - // Batch P_ of (Q_xP_) matrices [shmem] times (P_xQ_) matrix [shmem] => Batch P_ of (Q_xQ_) matrices [reg] - if (tx < (P_ * Q_)) { - const int batchid = tx / Q_; - const int tx_ = tx % Q_; - const int sld = Q_; - T *sTmp = swork + batchid * (Q_ * P_); // sTmp is input - for (int j = 0; j < Q_; j++) { - rTmp[j] = 0.0; - for (int i = 0; i < P_; i++) { - rTmp[j] += sTmp(tx_, i, sld) * sT(i, j); - } - } - } - __syncthreads(); - - // write rTmp[] into shmem as batch P_ of Q_xQ_ matrices - if (tx < (P_ * Q_)) { - const int batchid = tx / Q_; - const int tx_ = tx % Q_; - const int sld = Q_; - T *sTmp = swork + batchid * (Q_ * Q_); - for (int j = 0; j < Q_; j++) { - sTmp(tx_, j, sld) = rTmp[j]; - } - } - __syncthreads(); - - // Batch 1 of (Q_^2xP_) matrices [shmem] times (P_xQ_) matrix [shmem] => Batch 1 of (Q_^2xQ_) matrices [reg] - if (tx < (Q_ * Q_)) { - // No need to declare batchid = (tx / Q_^2) = always zero - // No need to declare tx_ = (tx_ % Q_^2) = always tx - const int sld = Q_ * Q_; - T *sTmp = swork; - for (int j = 0; j < Q_; j++) { - rTmp[0] = 0.0; - for (int i = 0; i < P_; i++) { - rTmp[0] += sTmp(tx, i, sld) * sT(i, j); - } - rV[0][icomp][j] += rTmp[0]; - } - } - __syncthreads(); - } -} - -////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ *MAXPQ, MAGMA_MAXTHREADS_3D)) __global__ - void magma_interpn_3d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV, - const int cstrdV, const int nelem) { - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaNoTrans; - - if (elem_id >= nelem) return; - - CeedScalar rU[1][NCOMP][P] = {0.0}; // for a non fused operator DIM is always 1 - CeedScalar rV[1][NCOMP][Q] = {0.0}; // for a non fused operator DIM is always 1 - CeedScalar rTmp[Q] = {0.0}; - - // shift global memory pointers by elem stride - dU += elem_id * estrdU; - dV += elem_id * estrdV; - - // assign shared memory pointers - CeedScalar *sT = (CeedScalar *)(shared_data); - CeedScalar *sTmp = sT + P * Q; - sTmp += ty * (max(P * P * MAXPQ, P * Q * Q)); - - // read T - if (ty == 0) { - dread_T_gm2sm(tx, transT, dT, sT); - } - - // read U (idim = 0 for dU, iDIM = 0 for rU, u_dimstride is always 0) - readU_3d(dU, cstrdU, rU, sTmp, tx); - // there is a sync at the end of this function - - magma_interp_3d_device(sT, transT, rU, rV, tx, rTmp, sTmp); - __syncthreads(); - - // write V - writeV_3d(dV, cstrdV, rV, tx); -} - -////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ *MAXPQ, MAGMA_MAXTHREADS_3D)) __global__ - void magma_interpt_3d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV, - const int cstrdV, const int 
nelem) { - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaTrans; - - if (elem_id >= nelem) return; - - CeedScalar rU[1][NCOMP][Q] = {0.0}; // for a non fused operator DIM is always 1 - CeedScalar rV[1][NCOMP][P] = {0.0}; // for a non fused operator DIM is always 1 - CeedScalar rTmp[P] = {0.0}; - - // shift global memory pointers by elem stride - dU += elem_id * estrdU; - dV += elem_id * estrdV; - - // assign shared memory pointers - CeedScalar *sT = (CeedScalar *)(shared_data); - CeedScalar *sTmp = sT + Q * P; - sTmp += ty * (max(Q * Q * MAXPQ, Q * P * P)); - - // read T - if (ty == 0) { - dread_T_gm2sm(tx, transT, dT, sT); - } - - // read V - readV_3d(dV, cstrdV, rV, tx); - - // read U (idim = 0 for dU, iDIM = 0 for rU, u_dimstride is always 0) - readU_3d(dU, cstrdU, rU, sTmp, tx); - // there is a sync at the end of this function - - magma_interp_3d_device(sT, transT, rU, rV, tx, rTmp, sTmp); - __syncthreads(); - - // write V - writeV_3d(dV, cstrdV, rV, tx); -} diff --git a/include/ceed/jit-source/magma/interp-nontensor.h b/include/ceed/jit-source/magma/interp-nontensor.h deleted file mode 100644 index e715986a74..0000000000 --- a/include/ceed/jit-source/magma/interp-nontensor.h +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#ifndef CEED_MAGMA_INTERP_NONTENSOR_H -#define CEED_MAGMA_INTERP_NONTENSOR_H - -//////////////////////////////////////////////////////////////////////////////// -extern "C" __global__ __launch_bounds__(Q *MAGMA_NONTENSOR_BASIS_NTCOL(Q)) void magma_interp_nontensor_n( - magma_trans_t transA, magma_trans_t transB, int n, const CeedScalar alpha, CeedScalar const *dA, int ldda, CeedScalar const *dB, int lddb, - const CeedScalar beta, CeedScalar *dC, int lddc) { - MAGMA_DEVICE_SHARED(CeedScalar, shared_data); - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int bx = blockIdx.x; - const int id = bx * blockDim.y + ty; - const int nblocks = MAGMA_CEILDIV(n, NB_INTERP_N); - const int myn = min(NB_INTERP_N, n - id * NB_INTERP_N); - - // const bool irrblock = ( myn != NB_INTERP_N ); - - dB += id * NB_INTERP_N * lddb; - dC += id * NB_INTERP_N * lddc; - - const int slda = P; - const int sldb = P; - CeedScalar *sA = (CeedScalar *)(shared_data); - CeedScalar *sB = sA; - sB += ty * sldb * NB_INTERP_N; - - // read A using all threads - CeedScalar rA[P] = {MAGMA_D_ZERO}; - read_A_trans_g2r_1D_nosync(tx, ty, dA, ldda, sA, slda, rA); - __syncthreads(); - - // terminate threads with no work - if (id >= nblocks) return; - - // init rC - CeedScalar rC[NB_INTERP_N] = {MAGMA_D_ZERO}; - read_C_g2r_1D_nosync(tx, myn, dC, lddc, beta, rC); - - // read B - read_B_g2s_1D_nosync(tx, myn, dB, lddb, sB, sldb); - __syncthreads(); - - mul_rAsBrC_1D_nosync(tx, alpha, rA, sB, sldb, rC); - write_C_r2g_1D_nosync(tx, myn, rC, dC, lddc); -} - -//////////////////////////////////////////////////////////////////////////////// -extern "C" __global__ __launch_bounds__(P *MAGMA_NONTENSOR_BASIS_NTCOL(P)) void magma_interp_nontensor_t( - magma_trans_t transA, magma_trans_t transB, int n, const CeedScalar alpha, CeedScalar const *dA, int ldda, CeedScalar const *dB, int lddb, - const 
CeedScalar beta, CeedScalar *dC, int lddc) { - MAGMA_DEVICE_SHARED(CeedScalar, shared_data); - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int bx = blockIdx.x; - const int id = bx * blockDim.y + ty; - const int nblocks = MAGMA_CEILDIV(n, NB_INTERP_T); - const int myn = min(NB_INTERP_T, n - id * NB_INTERP_T); - if (id >= nblocks) return; - - dB += id * NB_INTERP_T * lddb; - dC += id * NB_INTERP_T * lddc; - - // A is P x Q - const int sldb = Q; - CeedScalar *sB = (CeedScalar *)(shared_data); - sB += ty * sldb * NB_INTERP_T; - - // init rC - CeedScalar rC[NB_INTERP_T] = {MAGMA_D_ZERO}; - if (beta != MAGMA_D_ZERO) { - read_C_g2r_1D_nosync(tx, myn, dC, lddc, beta, rC); - } - - // read A - CeedScalar rA[Q] = {MAGMA_D_ZERO}; - read_A_notrans_g2r_1D_nosync(tx, dA, ldda, NULL, 0, rA); - - // read B - read_B_g2s_1D_nosync(tx, myn, dB, lddb, sB, sldb); - __syncthreads(); - - mul_rAsBrC_1D_nosync(tx, alpha, rA, sB, sldb, rC); - - write_C_r2g_1D_nosync(tx, myn, rC, dC, lddc); -} - -#endif // CEED_MAGMA_INTERP_NONTENSOR_H \ No newline at end of file diff --git a/include/ceed/jit-source/magma/magma-basis-grad-1d.h b/include/ceed/jit-source/magma/magma-basis-grad-1d.h new file mode 100644 index 0000000000..89a112115e --- /dev/null +++ b/include/ceed/jit-source/magma/magma-basis-grad-1d.h @@ -0,0 +1,138 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for MAGMA tensor basis gradient in 1D +#ifndef CEED_MAGMA_BASIS_GRAD_1D_H +#define CEED_MAGMA_BASIS_GRAD_1D_H + +#include "magma-common-tensor.h" + +// macros to abstract access of shared memory and reg. file +#define sT(i, j) sT[(j)*P + (i)] + +////////////////////////////////////////////////////////////////////////////////////////// +// grad basis action (1D) +template +static __device__ __inline__ void magma_grad_1d_device(const T *sT, magma_trans_t transT, T *sU[NUM_COMP], T *sV[NUM_COMP], const int tx) { + // Assumptions + // 1. 1D threads of size max(P,Q) + // 2. sU[i] is 1xP: in shared memory + // 3. sV[i] is 1xQ: in shared memory + // 4. Product per component is one row (1xP) times T matrix (PxQ) => one row (1xQ) + // 5. Each thread computes one entry in sV[i] + // 6. Must sync before and after call + // 7. Note that the layout for U and V is different from 2D/3D problem + + T rv; + if (tx < Q) { + for (int comp = 0; comp < NUM_COMP; comp++) { + rv = (transT == MagmaTrans) ?
sV[comp][tx] : 0.0; + for (int i = 0; i < P; i++) { + rv += sU[comp][i] * sT(i, tx); + } + sV[comp][tx] = rv; + } + } +} + +////////////////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_1D)) __global__ + void magma_gradn_1d_kernel(const CeedScalar *dTinterp, const CeedScalar *dTgrad, const CeedScalar *dU, const int estrdU, const int cstrdU, + const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + magma_trans_t transT = MagmaNoTrans; + + if (elem_id >= nelem) return; + + CeedScalar *sU[BASIS_NUM_COMP]; + CeedScalar *sV[BASIS_NUM_COMP]; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sT = (CeedScalar *)shared_data; + CeedScalar *sW = sT + BASIS_P * BASIS_Q; + sU[0] = sW + ty * BASIS_NUM_COMP * (BASIS_P + BASIS_Q); + sV[0] = sU[0] + (BASIS_NUM_COMP * 1 * BASIS_P); + for (int comp = 1; comp < BASIS_NUM_COMP; comp++) { + sU[comp] = sU[comp - 1] + (1 * BASIS_P); + sV[comp] = sV[comp - 1] + (1 * BASIS_Q); + } + + // read T + if (ty == 0) { + dread_T_gm2sm(tx, transT, dTgrad, sT); + } + + // read U + read_1d(dU, cstrdU, sU, tx); + + __syncthreads(); + magma_grad_1d_device(sT, transT, sU, sV, tx); + __syncthreads(); + + // write V + write_1d(sV, dV, cstrdV, tx); +} + +////////////////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_1D)) __global__ + void magma_gradt_1d_kernel(const CeedScalar *dTinterp, const CeedScalar *dTgrad, const CeedScalar *dU, const int estrdU, const int cstrdU, + const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + magma_trans_t transT = MagmaTrans; + + if (elem_id >= nelem) return; + + CeedScalar *sU[BASIS_NUM_COMP]; + CeedScalar *sV[BASIS_NUM_COMP]; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sT = (CeedScalar *)shared_data; + CeedScalar *sW = sT + BASIS_Q * BASIS_P; + sU[0] = sW + ty * BASIS_NUM_COMP * (BASIS_Q + BASIS_P); + sV[0] = sU[0] + (BASIS_NUM_COMP * 1 * BASIS_Q); + for (int comp = 1; comp < BASIS_NUM_COMP; comp++) { + sU[comp] = sU[comp - 1] + (1 * BASIS_Q); + sV[comp] = sV[comp - 1] + (1 * BASIS_P); + } + + // read T + if (ty == 0) { + dread_T_gm2sm(tx, transT, dTgrad, sT); + } + + // read U + read_1d(dU, cstrdU, sU, tx); + + // read V + read_1d(dV, cstrdV, sV, tx); + + __syncthreads(); + magma_grad_1d_device(sT, transT, sU, sV, tx); + __syncthreads(); + + // write V + write_1d(sV, dV, cstrdV, tx); +} + +#endif // CEED_MAGMA_BASIS_GRAD_1D_H diff --git a/include/ceed/jit-source/magma/magma-basis-grad-2d.h b/include/ceed/jit-source/magma/magma-basis-grad-2d.h new file mode 100644 index 0000000000..042e41b046 --- /dev/null +++ b/include/ceed/jit-source/magma/magma-basis-grad-2d.h @@ -0,0 +1,189 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 
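// For reference, a minimal host-side sketch of the contraction that magma_grad_1d_device
// (above) performs per element; this is an illustration, not backend code. Assumptions:
// double precision in place of the templated scalar type, plain arrays in place of shared
// memory, and a column-major T with leading dimension P so that T(i, q) = T[q * P + i],
// matching the sT(i, j) macro. In the actual transposed kernels, P and Q are additionally
// swapped at instantiation and T is read transposed; only the accumulate is shown here.
// The name ref_grad_1d is illustrative.

// V (1 x Q per component) = U (1 x P per component) * T (P x Q);
// in transposed mode the product accumulates into the existing V values
static void ref_grad_1d(const double *T, const double *U, double *V, int P, int Q, int num_comp, bool transpose) {
  for (int comp = 0; comp < num_comp; comp++) {
    for (int q = 0; q < Q; q++) {
      double rv = transpose ? V[comp * Q + q] : 0.0;
      for (int i = 0; i < P; i++) rv += U[comp * P + i] * T[q * P + i];
      V[comp * Q + q] = rv;
    }
  }
}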
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for MAGMA tensor basis gradient in 2D +#ifndef CEED_MAGMA_BASIS_GRAD_2D_H +#define CEED_MAGMA_BASIS_GRAD_2D_H + +#include "magma-common-tensor.h" + +// macros to abstract access of shared memory and reg. file +#define sT(i, j) sT[(j)*P + (i)] +#define sTmp(i, j, ldw) sTmp[(j) * (ldw) + (i)] + +////////////////////////////////////////////////////////////////////////////////////////// +// grad basis action (2D) +// This function is called two times at a higher level for 2D +// DIM_U -- for the size of rU[DIM_U * NUM_COMP * MAX_P_Q] +// DIM_V -- for the size of rV[DIM_V * NUM_COMP * MAX_P_Q] +// i_DIM -- the index of the outermost loop over dimensions in grad +// i_DIM_U -- which dim index of rU is accessed (always 0 for notrans, 0 or 1 for trans) +// i_DIM_V -- which dim index of rV is accessed (0 or 1 for notrans, always 0 for trans) +// the scalar beta is used to specify whether to accumulate to rV, or overwrite it +template +static __device__ __inline__ void magma_grad_2d_device(const T *sTinterp, const T *sTgrad, T rU[DIM_U][NUM_COMP][rU_SIZE], + T rV[DIM_V][NUM_COMP][rV_SIZE], T beta, const int tx, T rTmp, T *swork) { + // Assumptions + // 0. This device routine applies grad for one dim only (i_DIM), so it should be called twice for 2D + // 1. 1D threads of size max(P,Q) + // 2. input: rU[DIM_U x NUM_COMP x P] in registers (per thread) + // 3. output: rV[DIM_V x NUM_COMP x Q] in registers (per thread) + // 4. Two products for each (dim,component) pair + // 4.1 Batch P of (1xP) matrices times (PxQ) matrix => Batch P of (1xQ) matrices + // 4.2 Batch 1 of (QxP) matrix times (PxQ) matrix => (QxQ) matrix + // 5. Each thread computes one row of the output of each product + // 6. Sync is recommended before and after the call + + for (int comp = 0; comp < NUM_COMP; comp++) { + // 1st product -- Batch P of (1xP) matrices [reg] x (PxQ) [shmem] => Batch P of (1xQ) matrices + // the batch output P x (1xQ) is written on the fly to shmem + if (tx < P) { + const int batchid = tx; + const int sld = 1; + const T *sT = (i_DIM == 0) ? sTgrad : sTinterp; + T *sTmp = swork + batchid * (1 * Q); + for (int j = 0; j < Q; j++) { + rTmp = 0.0; + for (int i = 0; i < P; i++) { + rTmp += rU[i_DIM_U][comp][i] * sT(i, j); + } + sTmp(0, j, sld) = rTmp; + } + } // end of: if (tx < P) + __syncthreads(); + + // 2nd product -- Batch 1 of a (QxP) matrix [shmem] x (PxQ) [shmem] => (QxQ) matrix [reg] + if (tx < Q) { + const int batchid = 0; + const int sld = Q; + const T *sT = (i_DIM == 1) ?
sTgrad : sTinterp; + T *sTmp = swork + batchid * (Q * P); + for (int j = 0; j < Q; j++) { + rTmp = 0.0; + for (int i = 0; i < P; i++) { + rTmp += sTmp(tx, i, sld) * sT(i, j); + } + rV[i_DIM_V][comp][j] *= beta; + rV[i_DIM_V][comp][j] += rTmp; + } + } + __syncthreads(); + } // loop over NUM_COMP +} + +////////////////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_2D)) __global__ + void magma_gradn_2d_kernel(const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, const CeedScalar *dU, const int estrdU, const int cstrdU, + const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + magma_trans_t transT = MagmaNoTrans; + + if (elem_id >= nelem) return; + + CeedScalar rU[1][BASIS_NUM_COMP][BASIS_P] = {0.0}; // here DIM_U = 1, but might be different for a fused operator + CeedScalar rV[1][BASIS_NUM_COMP][BASIS_Q] = {0.0}; // here DIM_V = 1, but might be different for a fused operator + CeedScalar rTmp = 0.0; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sTinterp = (CeedScalar *)shared_data; + CeedScalar *sTgrad = sTinterp + BASIS_P * BASIS_Q; + CeedScalar *sTmp = sTgrad + BASIS_P * BASIS_Q; + sTmp += ty * (BASIS_P * BASIS_MAX_P_Q); + + // read T + if (ty == 0) { + dread_T_gm2sm(tx, transT, dinterp1d, sTinterp); + dread_T_gm2sm(tx, transT, dgrad1d, sTgrad); + } + + // No need to read V ( required only in transposed grad ) + const CeedScalar beta = 0.0; + + /* read U (idim = 0 for dU, i_DIM = 0 for rU) -- + there is a sync at the end of this function */ + readU_2d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); + + /* first call (i_DIM = 0, i_DIM_U = 0, i_DIM_V = 0) -- + output from rV[0][][] into dV (idim = 0) */ + magma_grad_2d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_2d_device */ + writeV_2d(dV + (0 * dstrdV), cstrdV, rV, tx); + + /* second call (i_DIM = 1, i_DIM_U = 0, i_DIM_V = 0) -- + output from rV[0][][] into dV (idim = 1) */ + magma_grad_2d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_2d_device */ + writeV_2d(dV + (1 * dstrdV), cstrdV, rV, tx); +} + +////////////////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_2D)) __global__ + void magma_gradt_2d_kernel(const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, const CeedScalar *dU, const int estrdU, const int cstrdU, + const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + magma_trans_t transT = MagmaTrans; + + if (elem_id >= nelem) return; + + CeedScalar rU[1][BASIS_NUM_COMP][BASIS_Q] = {0.0}; // here DIM_U = 1, but might be different for a fused operator + CeedScalar rV[1][BASIS_NUM_COMP][BASIS_P] = {0.0}; // here DIM_V = 1, but might be different for a fused operator + CeedScalar rTmp = 0.0; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * 
estrdV; + + // assign shared memory pointers + CeedScalar *sTinterp = (CeedScalar *)shared_data; + CeedScalar *sTgrad = sTinterp + BASIS_Q * BASIS_P; + CeedScalar *sTmp = sTgrad + BASIS_Q * BASIS_P; + sTmp += ty * (BASIS_Q * BASIS_MAX_P_Q); + + // read T + if (ty == 0) { + dread_T_gm2sm(tx, transT, dinterp1d, sTinterp); + dread_T_gm2sm(tx, transT, dgrad1d, sTgrad); + } + __syncthreads(); + + /* read V (since this is transposed mode -- + idim = 0 for dV, i_DIM = 0 for rV) */ + const CeedScalar beta = 1.0; + readV_2d(dV + (0 * dstrdV), cstrdV, rV, tx); + + /* read U (idim = 0 for dU, i_DIM = 0 for rU) -- + there is a sync at the end of this function */ + readU_2d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); + /* first call (i_DIM = 0, i_DIM_U = 0, i_DIM_V = 0) */ + magma_grad_2d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_2d_device */ + + /* read U (idim = 1 for dU, i_DIM = 0 for rU) -- + there is a sync at the end of this function */ + readU_2d(dU + (1 * dstrdU), cstrdU, rU, sTmp, tx); + /* second call (i_DIM = 1, i_DIM_U = 0, i_DIM_V = 0) */ + magma_grad_2d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_2d_device */ + + // write V + writeV_2d(dV + (0 * dstrdV), cstrdV, rV, tx); +} + +#endif // CEED_MAGMA_BASIS_GRAD_2D_H diff --git a/include/ceed/jit-source/magma/magma-basis-grad-3d.h b/include/ceed/jit-source/magma/magma-basis-grad-3d.h new file mode 100644 index 0000000000..063ee7bc0d --- /dev/null +++ b/include/ceed/jit-source/magma/magma-basis-grad-3d.h @@ -0,0 +1,224 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for MAGMA tensor basis gradient in 3D +#ifndef CEED_MAGMA_BASIS_GRAD_3D_H +#define CEED_MAGMA_BASIS_GRAD_3D_H + +#include "magma-common-tensor.h" + +// macros to abstract access of shared memory and reg. file +#define sT(i, j) sT[(j)*P + (i)] +#define sTmp(i, j, ldw) sTmp[(j) * (ldw) + (i)] +#define sTmp2(i, j, ldw) sTmp2[(j) * (ldw) + (i)] + +////////////////////////////////////////////////////////////////////////////////////////// +// grad basis action (3D) +// This function is called three times at a higher level for 3D +// DIM_U -- for the size of rU[DIM_U * NUM_COMP * MAX_P_Q] +// DIM_V -- for the size of rV[DIM_V * NUM_COMP * MAX_P_Q] +// i_DIM -- the index of the outermost loop over dimensions in grad +// i_DIM_U -- which dim index of rU is accessed (always 0 for notrans, 0, 1, or 2 for trans) +// i_DIM_V -- which dim index of rV is accessed (0, 1, or 2 for notrans, always 0 for trans) +// the scalar beta is used to specify whether to accumulate to rV, or overwrite it +template +static __device__ __inline__ void magma_grad_3d_device(const T *sTinterp, const T *sTgrad, T rU[DIM_U][NUM_COMP][rU_SIZE], + T rV[DIM_V][NUM_COMP][rV_SIZE], T beta, const int tx, T rTmp, T *swork) { + // Assumptions + // 0. This device routine applies grad for one dim only (i_DIM), so it should be called three times for 3D + // 1. 1D threads of size max(P,Q)^2 + // 2. input: rU[DIM_U x NUM_COMP x rU_SIZE] in registers (per thread) + // 3. output: rV[DIM_V x NUM_COMP x rV_SIZE] in registers (per thread) + // 4.
Three products for each (dim,component) pair + // 4.1 Batch P^2 of (1xP) matrices times (PxQ) matrix => Batch P^2 of (1xQ) matrices + // 4.2 Batch P of (QxP) matrices times (PxQ) matrix => Batch P of (QxQ) matrices + // 4.3 Batch 1 of (Q^2xP) matrix times (PxQ) matrix => (Q^2xQ) matrix + // 5. Each thread computes one row of the output of each product + // 6. Sync is recommended before and after the call + + T *sW1 = swork; + T *sW2 = sW1 + P * P * Q; + for (int comp = 0; comp < NUM_COMP; comp++) { + // Batch P^2 of (1xP) matrices [reg] times (PxQ) matrix [shmem] => Batch P^2 of (1xQ) matrices [shmem] + if (tx < (P * P)) { + const int batchid = tx; + const int sld = 1; + const T *sT = (i_DIM == 0) ? sTgrad : sTinterp; + T *sTmp = sW1 + batchid * (1 * Q); + for (int j = 0; j < Q; j++) { + rTmp = 0.0; + for (int i = 0; i < P; i++) { + rTmp += rU[i_DIM_U][comp][i] * sT(i, j); + } + sTmp(0, j, sld) = rTmp; + } + } // end of: if (tx < P*P) + __syncthreads(); + + // Batch P of (QxP) matrices [shmem] times (PxQ) matrix [shmem] => Batch P of (QxQ) matrices [reg] + if (tx < (P * Q)) { + const int batchid = tx / Q; + const int tx_ = tx % Q; + const int sld = Q; + const T *sT = (i_DIM == 1) ? sTgrad : sTinterp; + T *sTmp = sW1 + batchid * (Q * P); // sTmp is input + T *sTmp2 = sW2 + batchid * (Q * Q); // sTmp2 is output + for (int j = 0; j < Q; j++) { + rTmp = 0.0; + for (int i = 0; i < P; i++) { + rTmp += sTmp(tx_, i, sld) * sT(i, j); + } + sTmp2(tx_, j, sld) = rTmp; + } + } + __syncthreads(); + + // Batch 1 of (Q^2xP) matrices [shmem] times (PxQ) matrix [shmem] => Batch 1 of (Q^2xQ) matrices [reg] + if (tx < (Q * Q)) { + // No need to declare batchid = (tx / Q^2) = always zero + // No need to declare tx_ = (tx_ % Q^2) = always tx + const int sld = Q * Q; + const T *sT = (i_DIM == 2) ?
sTgrad : sTinterp; + T *sTmp = sW2; // sTmp is input + for (int j = 0; j < Q; j++) { + rTmp = 0.0; + for (int i = 0; i < P; i++) { + rTmp += sTmp(tx, i, sld) * sT(i, j); + } + rV[i_DIM_V][comp][j] *= beta; + rV[i_DIM_V][comp][j] += rTmp; + } + } + __syncthreads(); + } // loop over NUM_COMP +} + +////////////////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MAGMA_MAXTHREADS_3D)) __global__ + void magma_gradn_3d_kernel(const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, const CeedScalar *dU, const int estrdU, const int cstrdU, + const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + magma_trans_t transT = MagmaNoTrans; + + if (elem_id >= nelem) return; + + CeedScalar rU[1][BASIS_NUM_COMP][BASIS_P] = {0.0}; // here DIM_U = 1, but might be different for a fused operator + CeedScalar rV[1][BASIS_NUM_COMP][BASIS_Q] = {0.0}; // here DIM_V = 1, but might be different for a fused operator + CeedScalar rTmp = 0.0; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sTinterp = (CeedScalar *)shared_data; + CeedScalar *sTgrad = sTinterp + BASIS_P * BASIS_Q; + CeedScalar *sTmp = sTgrad + BASIS_P * BASIS_Q; + sTmp += ty * (max(BASIS_P * BASIS_P * BASIS_P, (BASIS_P * BASIS_P * BASIS_Q) + (BASIS_P * BASIS_Q * BASIS_Q))); + + // read T + if (ty == 0) { + dread_T_gm2sm(tx, transT, dinterp1d, sTinterp); + dread_T_gm2sm(tx, transT, dgrad1d, sTgrad); + } + __syncthreads(); + + // No need to read V ( required only in transposed grad ) + const CeedScalar beta = 0.0; + + /* read U (idim = 0 for dU, i_DIM = 0 for rU) -- + there is a sync at the end of this function */ + readU_3d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); + + /* first call (i_DIM = 0, i_DIM_U = 0, i_DIM_V = 0) -- + output from rV[0][][] into dV (idim = 0) */ + magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_3d_device */ + writeV_3d(dV + (0 * dstrdV), cstrdV, rV, tx); + + /* second call (i_DIM = 1, i_DIM_U = 0, i_DIM_V = 0) -- + output from rV[0][][] into dV (idim = 1) */ + magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_3d_device */ + writeV_3d(dV + (1 * dstrdV), cstrdV, rV, tx); + + /* third call (i_DIM = 2, i_DIM_U = 0, i_DIM_V = 0) -- + output from rV[0][][] into dV (idim = 2) */ + magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_3d_device */ + writeV_3d(dV + (2 * dstrdV), cstrdV, rV, tx); +} + +////////////////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MAGMA_MAXTHREADS_3D)) __global__ + void magma_gradt_3d_kernel(const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, const CeedScalar *dU, const int estrdU, const int cstrdU, + const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + magma_trans_t transT = 
MagmaTrans; + + if (elem_id >= nelem) return; + + CeedScalar rU[1][BASIS_NUM_COMP][BASIS_Q] = {0.0}; // here DIM_U = 1, but might be different for a fused operator + CeedScalar rV[1][BASIS_NUM_COMP][BASIS_P] = {0.0}; // here DIM_V = 1, but might be different for a fused operator + CeedScalar rTmp = 0.0; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sTinterp = (CeedScalar *)shared_data; + CeedScalar *sTgrad = sTinterp + BASIS_Q * BASIS_P; + CeedScalar *sTmp = sTgrad + BASIS_Q * BASIS_P; + sTmp += ty * (max(BASIS_Q * BASIS_Q * BASIS_Q, (BASIS_Q * BASIS_Q * BASIS_P) + (BASIS_Q * BASIS_P * BASIS_P))); + + // read T + if (ty == 0) { + dread_T_gm2sm(tx, transT, dinterp1d, sTinterp); + dread_T_gm2sm(tx, transT, dgrad1d, sTgrad); + } + __syncthreads(); + + // read V (since this is transposed mode) + const CeedScalar beta = 1.0; + readV_3d(dV + (0 * dstrdV), cstrdV, rV, tx); + + /* read U (idim = 0 for dU, i_DIM = 0 for rU) -- + there is a sync at the end of this function */ + readU_3d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); + /* then first call (i_DIM = 0, i_DIM_U = 0, i_DIM_V = 0) */ + magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_3d_device */ + + /* read U (idim = 1 for dU, i_DIM = 0 for rU) -- + there is a sync at the end of this function */ + readU_3d(dU + (1 * dstrdU), cstrdU, rU, sTmp, tx); + /* then second call (i_DIM = 1, i_DIM_U = 0, i_DIM_V = 0) */ + magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_3d_device */ + + /* read U (idim = 2 for dU, i_DIM = 0 for rU) -- + there is a sync at the end of this function */ + readU_3d(dU + (2 * dstrdU), cstrdU, rU, sTmp, tx); + /* then third call (i_DIM = 2, i_DIM_U = 0, i_DIM_V = 0) */ + magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_3d_device */ + + // write V + writeV_3d(dV + (0 * dstrdV), cstrdV, rV, tx); +} + +#endif // CEED_MAGMA_BASIS_GRAD_3D_H diff --git a/include/ceed/jit-source/magma/magma-basis-grad-nontensor.h b/include/ceed/jit-source/magma/magma-basis-grad-nontensor.h new file mode 100644 index 0000000000..95bf548fc3 --- /dev/null +++ b/include/ceed/jit-source/magma/magma-basis-grad-nontensor.h @@ -0,0 +1,110 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
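// For reference, the three calls to magma_grad_3d_device above compute, per element and
// component, V_d = (A_2 (x) A_1 (x) A_0) U for d = 0, 1, 2, where A_k is the 1D gradient
// matrix G when k == d and the 1D interpolation matrix I otherwise ((x) denotes the
// Kronecker product). A minimal host-side sketch of the same contraction follows; it is
// an illustration under simplifying assumptions (double precision, column-major P x Q
// operators as in the kernels, illustrative names ref_contract / ref_grad_3d), not backend code.
#include <cstddef>
#include <vector>

// contract x (n[0] x n[1] x n[2], index i0 fastest) with B (n[k] x q, column-major)
// along axis k; the result has extent q along axis k
static std::vector<double> ref_contract(const std::vector<double> &x, const int n[3], const std::vector<double> &B, int q, int k) {
  int m[3] = {n[0], n[1], n[2]};
  m[k] = q;
  std::vector<double> y((std::size_t)m[0] * m[1] * m[2], 0.0);
  for (int i2 = 0; i2 < m[2]; i2++) {
    for (int i1 = 0; i1 < m[1]; i1++) {
      for (int i0 = 0; i0 < m[0]; i0++) {
        const int out[3] = {i0, i1, i2};
        double    s      = 0.0;
        for (int j = 0; j < n[k]; j++) {
          int src[3] = {i0, i1, i2};
          src[k] = j;  // sum over the contracted axis
          s += x[src[0] + (std::size_t)n[0] * (src[1] + (std::size_t)n[1] * src[2])] * B[out[k] * n[k] + j];
        }
        y[i0 + (std::size_t)m[0] * (i1 + (std::size_t)m[1] * i2)] = s;
      }
    }
  }
  return y;
}

// d-th partial derivative of one component: apply G along axis d, I along the others
static std::vector<double> ref_grad_3d(std::vector<double> u, const std::vector<double> &I, const std::vector<double> &G, int P, int Q, int d) {
  int n[3] = {P, P, P};
  for (int k = 0; k < 3; k++) {
    u    = ref_contract(u, n, (k == d) ? G : I, Q, k);
    n[k] = Q;  // axis k now has quadrature extent
  }
  return u;  // Q^3 values at quadrature points
}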
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for Magma non-tensor basis gradient +#ifndef CEED_MAGMA_GRAD_NONTENSOR_H +#define CEED_MAGMA_GRAD_NONTENSOR_H + +#include "magma-common-nontensor.h" + +//////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__ + void magma_grad_nontensor_n(int n, CeedScalar const *dA, int ldda, CeedScalar const *dB, int lddb, CeedScalar *dC, int lddc) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data); + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int id = blockIdx.x * blockDim.y + ty; + const int nblocks = MAGMA_CEILDIV(n, BASIS_NB_GRAD_N); + const int myn = min(BASIS_NB_GRAD_N, n - id * BASIS_NB_GRAD_N); + + dB += id * BASIS_NB_GRAD_N * lddb; + dC += id * BASIS_NB_GRAD_N * lddc; + + // A is BASIS_P x BASIS_Q + const int slda = BASIS_P; + const int sldb = BASIS_P; + CeedScalar *sA = (CeedScalar *)shared_data; + CeedScalar *sB = sA + BASIS_Q * BASIS_P; + sB += ty * sldb * BASIS_NB_GRAD_N; + + // read B once for all C's + if (id < nblocks) { + read_B_g2s_1D_nosync(tx, myn, dB, lddb, sB, sldb); + } + + // unrolling this loop yields dramatic performance drop using hipcc, let the compiler decide (no pragma unroll) + for (int d = 0; d < BASIS_DIM; d++) { + // read A (BASIS_P x BASIS_Q) using all threads + CeedScalar rA[BASIS_P] = {MAGMA_D_ZERO}; + __syncthreads(); + read_A_trans_g2r_1D_nosync(tx, ty, dA, ldda, sA, slda, rA); + + // init rC + CeedScalar rC[BASIS_NB_GRAD_N] = {MAGMA_D_ZERO}; + if (id < nblocks) { + mul_rAsBrC_1D_nosync(tx, rA, sB, sldb, rC); + } + + // write C + if (id < nblocks) { + write_C_r2g_1D_nosync(tx, myn, rC, dC, lddc); + } + + dA += BASIS_Q * BASIS_P; + dC += BASIS_Q * n; + } +} + +//////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__ + void magma_grad_nontensor_t(int n, CeedScalar const *dA, int ldda, CeedScalar const *dB, int lddb, CeedScalar *dC, int lddc) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data); + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int id = blockIdx.x * blockDim.y + ty; + const int nblocks = MAGMA_CEILDIV(n, BASIS_NB_GRAD_T); + const int myn = min(BASIS_NB_GRAD_T, n - id * BASIS_NB_GRAD_T); + + // terminate threads with no work + if (id >= nblocks) return; + + dB += id * BASIS_NB_GRAD_T * lddb; + dC += id * BASIS_NB_GRAD_T * lddc; + + // A is BASIS_P x BASIS_Q + const int sldb = BASIS_Q; + CeedScalar *sB = (CeedScalar *)shared_data; + sB += ty * sldb * BASIS_NB_GRAD_T; + + // init rA, rC + CeedScalar rA[BASIS_Q] = {MAGMA_D_ZERO}; + CeedScalar rC[BASIS_NB_GRAD_T] = {MAGMA_D_ZERO}; + + // unrolling this loop yields dramatic performance drop using hipcc, let the compiler decide (no pragma unroll) + for (int d = 0; d < BASIS_DIM; d++) { + // read A + read_A_notrans_g2r_1D_nosync(tx, dA, ldda, NULL, 0, rA); + + // read B + __syncthreads(); + read_B_g2s_1D_nosync(tx, myn, dB, lddb, sB, sldb); + __syncthreads(); + + addmul_rAsBrC_1D_nosync(tx, rA, sB, sldb, rC); + + dA += BASIS_P * BASIS_Q; + dB += BASIS_Q * n; + } + + // write C + write_C_r2g_1D_nosync(tx, myn, rC, dC, lddc); +} + +#endif // CEED_MAGMA_GRAD_NONTENSOR_H diff --git a/include/ceed/jit-source/magma/magma-basis-interp-1d.h b/include/ceed/jit-source/magma/magma-basis-interp-1d.h new file mode 100644 
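// For reference, magma_grad_nontensor_n above computes one GEMM per dimension,
// C_d = A_d^T * B, where A_d = dA + d * Q * P is the P x Q gradient matrix of dimension d,
// B = dB is the P x n input shared by all dimensions, and C_d = dC + d * Q * n; the _t
// variant instead accumulates C = sum_d A_d * B_d over the dimensions. A minimal host-side
// sketch of the non-transposed path follows; it is an illustration assuming double
// precision and leading dimensions equal to the matrix heights (ldda = P, lddb = P,
// lddc = Q), with an illustrative name, not backend code.
#include <cstddef>

static void ref_grad_nontensor_n(int dim, int P, int Q, int n, const double *dA, const double *dB, double *dC) {
  for (int d = 0; d < dim; d++) {
    const double *A = dA + (std::size_t)d * Q * P;  // P x Q, column-major
    double       *C = dC + (std::size_t)d * Q * n;  // Q x n, column-major
    for (int col = 0; col < n; col++) {
      for (int q = 0; q < Q; q++) {
        double s = 0.0;
        for (int p = 0; p < P; p++) s += A[q * P + p] * dB[col * P + p];
        C[col * Q + q] = s;  // C_d(q, col) = sum_p A_d(p, q) * B(p, col)
      }
    }
  }
}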
index 0000000000..074efd94b6 --- /dev/null +++ b/include/ceed/jit-source/magma/magma-basis-interp-1d.h @@ -0,0 +1,138 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for MAGMA tensor basis interpolation in 1D +#ifndef CEED_MAGMA_BASIS_INTERP_1D_H +#define CEED_MAGMA_BASIS_INTERP_1D_H +
#include "magma-common-tensor.h" + +// macros to abstract access of shared memory and reg. file +#define sT(i, j) sT[(j)*P + (i)] + +////////////////////////////////////////////////////////////////////////////////////////// +// interp basis action (1D) +template +static __device__ __inline__ void magma_interp_1d_device(const T *sT, magma_trans_t transT, T *sU[NUM_COMP], T *sV[NUM_COMP], const int tx) { + // Assumptions + // 1. 1D threads of size max(P,Q) + // 2. sU[i] is 1xP: in shared memory + // 3. sV[i] is 1xQ: in shared memory + // 4. Product per component is one row (1xP) times T matrix (PxQ) => one row (1xQ) + // 5. Each thread computes one entry in sV[i] + // 6. Must sync before and after call + // 7. Note that the layout for U and V is different from 2D/3D problem + + T rv; + if (tx < Q) { + for (int comp = 0; comp < NUM_COMP; comp++) { + rv = (transT == MagmaTrans) ? sV[comp][tx] : 0.0; + for (int i = 0; i < P; i++) { + rv += sU[comp][i] * sT(i, tx); // sT[tx * P + i]; + } + sV[comp][tx] = rv; + } + } +} + +////////////////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_1D)) __global__ + void magma_interpn_1d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV, + const int cstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + magma_trans_t transT = MagmaNoTrans; + + if (elem_id >= nelem) return; + + CeedScalar *sU[BASIS_NUM_COMP]; + CeedScalar *sV[BASIS_NUM_COMP]; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sT = (CeedScalar *)shared_data; + CeedScalar *sW = sT + BASIS_P * BASIS_Q; + sU[0] = sW + ty * BASIS_NUM_COMP * (BASIS_P + BASIS_Q); + sV[0] = sU[0] + (BASIS_NUM_COMP * 1 * BASIS_P); + for (int comp = 1; comp < BASIS_NUM_COMP; comp++) { + sU[comp] = sU[comp - 1] + (1 * BASIS_P); + sV[comp] = sV[comp - 1] + (1 * BASIS_Q); + } + + // read T + if (ty == 0) { + dread_T_gm2sm(tx, transT, dT, sT); + } + + // read U + read_1d(dU, cstrdU, sU, tx); + + __syncthreads(); + magma_interp_1d_device(sT, transT, sU, sV, tx); + __syncthreads(); + + // write V + write_1d(sV, dV, cstrdV, tx); +} + +////////////////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_1D)) __global__ + void magma_interpt_1d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV, + const int cstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; +
magma_trans_t transT = MagmaTrans; + + if (elem_id >= nelem) return; + + CeedScalar *sU[BASIS_NUM_COMP]; + CeedScalar *sV[BASIS_NUM_COMP]; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sT = (CeedScalar *)shared_data; + CeedScalar *sW = sT + BASIS_Q * BASIS_P; + sU[0] = sW + ty * BASIS_NUM_COMP * (BASIS_Q + BASIS_P); + sV[0] = sU[0] + (BASIS_NUM_COMP * 1 * BASIS_Q); + for (int comp = 1; comp < BASIS_NUM_COMP; comp++) { + sU[comp] = sU[comp - 1] + (1 * BASIS_Q); + sV[comp] = sV[comp - 1] + (1 * BASIS_P); + } + + // read T + if (ty == 0) { + dread_T_gm2sm(tx, transT, dT, sT); + } + + // read U + read_1d(dU, cstrdU, sU, tx); + + // read V + read_1d(dV, cstrdV, sV, tx); + + __syncthreads(); + magma_interp_1d_device(sT, transT, sU, sV, tx); + __syncthreads(); + + // write V + write_1d(sV, dV, cstrdV, tx); +} + +#endif // CEED_MAGMA_BASIS_INTERP_1D_H diff --git a/include/ceed/jit-source/magma/magma-basis-interp-2d.h b/include/ceed/jit-source/magma/magma-basis-interp-2d.h new file mode 100644 index 0000000000..bb3475df51 --- /dev/null +++ b/include/ceed/jit-source/magma/magma-basis-interp-2d.h @@ -0,0 +1,155 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for MAGMA tensor basis interpolation in 2D +#ifndef CEED_MAGMA_BASIS_INTERP_2D_H +#define CEED_MAGMA_BASIS_INTERP_2D_H + +#include "magma-common-tensor.h" + +// macros to abstract access of shared memory and reg. file +#define sT(i, j) sT[(j)*P + (i)] +#define sTmp(i, j, ldw) sTmp[(j) * (ldw) + (i)] + +////////////////////////////////////////////////////////////////////////////////////////// +// interp basis action (2D) +template +static __device__ __inline__ void magma_interp_2d_device(const T *sT, magma_trans_t transT, T rU[DIM_U][NUM_COMP][rU_SIZE], + T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx, T rTmp, T *swork) { + // Assumptions + // 1. 1D threads of size max(P,Q) + // 2. input: rU[DIM_U x NUM_COMP x rU_SIZE] in registers (per thread) + // 3. output: rV[DIM_V x NUM_COMP x rV_SIZE] in registers (per thread) + // 4. Two products per component + // 4.1 Batch P of (1xP) matrices times (PxQ) matrix => Batch P of (1xQ) matrices + // 4.2 Batch 1 of (QxP) matrix times (PxQ) matrix => (QxQ) matrix + // 5. Each thread computes one row of the output of each product + // 6.
+
+//////////////////////////////////////////////////////////////////////////////////////////
+extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_2D)) __global__
+    void magma_interpn_2d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV,
+                                 const int cstrdV, const int nelem) {
+  MAGMA_DEVICE_SHARED(CeedScalar, shared_data)
+
+  const int     tx      = threadIdx.x;
+  const int     ty      = threadIdx.y;
+  const int     elem_id = (blockIdx.x * blockDim.y) + ty;
+  magma_trans_t transT  = MagmaNoTrans;
+
+  if (elem_id >= nelem) return;
+
+  CeedScalar rU[1][BASIS_NUM_COMP][BASIS_P] = {0.0};  // for a non-fused operator BASIS_DIM is always 1
+  CeedScalar rV[1][BASIS_NUM_COMP][BASIS_Q] = {0.0};  // for a non-fused operator BASIS_DIM is always 1
+  CeedScalar rTmp                           = 0.0;
+
+  // shift global memory pointers by elem stride
+  dU += elem_id * estrdU;
+  dV += elem_id * estrdV;
+
+  // assign shared memory pointers
+  CeedScalar *sT   = (CeedScalar *)shared_data;
+  CeedScalar *sTmp = sT + BASIS_P * BASIS_Q;
+  sTmp += ty * (BASIS_P * BASIS_MAX_P_Q);
+
+  // read T
+  if (ty == 0) {
+    dread_T_gm2sm<BASIS_P, BASIS_Q>(tx, transT, dT, sT);
+  }
+
+  // read U -- there is a sync at the end of this function
+  readU_2d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dU, cstrdU, rU, sTmp, tx);
+
+  // no sync needed here -- readU_2d already syncs at the end
+  magma_interp_2d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_P, BASIS_Q>(sT, transT, rU, rV, tx, rTmp, sTmp);
+  __syncthreads();
+
+  // write V
+  writeV_2d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dV, cstrdV, rV, tx);
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////
+extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_2D)) __global__
+    void magma_interpt_2d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV,
+                                 const int cstrdV, const int nelem) {
+  MAGMA_DEVICE_SHARED(CeedScalar, shared_data)
+
+  const int     tx      = threadIdx.x;
+  const int     ty      = threadIdx.y;
+  const int     elem_id = (blockIdx.x * blockDim.y) + ty;
+  magma_trans_t transT  = MagmaTrans;
+
+  if (elem_id >= nelem) return;
+
+  CeedScalar rU[1][BASIS_NUM_COMP][BASIS_Q] = {0.0};  // for a non-fused operator BASIS_DIM is always 1
+  CeedScalar rV[1][BASIS_NUM_COMP][BASIS_P] = {0.0};  // for a non-fused operator BASIS_DIM is always 1
+  CeedScalar rTmp                           = 0.0;
+
+  // shift global memory pointers by elem stride
+  dU += elem_id * estrdU;
+  dV += elem_id * estrdV;
+
+  // assign shared memory pointers
+  CeedScalar *sT   = (CeedScalar *)shared_data;
+  CeedScalar *sTmp = sT + BASIS_Q * BASIS_P;
+  sTmp += ty * (BASIS_Q * BASIS_MAX_P_Q);
+
+  // read T
+  if (ty == 0) {
+    dread_T_gm2sm<BASIS_Q, BASIS_P>(tx, transT, dT, sT);
+  }
+
+  // read V
+  readV_2d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV, cstrdV, rV, tx);
+
+  // read U -- there is a sync at the end of this function
+  readU_2d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU, cstrdU, rU, sTmp, tx);
+
+  // no sync needed here -- readU_2d already syncs at the end
+  magma_interp_2d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P>(sT, transT, rU, rV, tx, rTmp, sTmp);
+  __syncthreads();
+
+  // write V
+  writeV_2d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV, cstrdV, rV, tx);
+}
+
+#endif  // CEED_MAGMA_BASIS_INTERP_2D_H
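Both 2D kernels batch several elements per thread block through threadIdx.y (elem_id = blockIdx.x * blockDim.y + ty), using the NTCOL macros that this diff adds to magma-common-defs.h further down. A hypothetical host-side launch sketch (launch_interp_2d_n is not backend API; the ceil-div macro is restated here on the assumption it matches the one in magma-common-defs.h):

#include <cuda_runtime.h>

// Assumed definitions, mirroring magma-common-defs.h in this diff
#define MAGMA_CEILDIV(A, B) (((A) + (B)-1) / (B))
#define MAGMA_BASIS_NTCOL(x, maxt) (((maxt) < (x)) ? 1 : ((maxt) / (x)))

static void launch_interp_2d_n(int P, int Q, int nelem, size_t shared_bytes, cudaStream_t stream) {
  const int max_P_Q = (P > Q) ? P : Q;
  const int ntcol   = MAGMA_BASIS_NTCOL(max_P_Q, 128 /* MAGMA_MAXTHREADS_2D */);
  dim3      threads(max_P_Q, ntcol, 1);                // tx covers max(P,Q); ty batches elements
  dim3      grid(MAGMA_CEILDIV(nelem, ntcol), 1, 1);   // elem_id = blockIdx.x * blockDim.y + ty
  // shared_bytes must hold sT (P * Q scalars) plus one sTmp slice per ty slot,
  // since the kernel offsets sTmp by ty * (P * max(P,Q)); kernel arguments elided:
  // magma_interpn_2d_kernel<<<grid, threads, shared_bytes, stream>>>(dT, dU, estrdU, cstrdU, dV, estrdV, cstrdV, nelem);
  (void)shared_bytes;
  (void)stream;
}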
diff --git a/include/ceed/jit-source/magma/magma-basis-interp-3d.h b/include/ceed/jit-source/magma/magma-basis-interp-3d.h
new file mode 100644
index 0000000000..8f2fd3985e
--- /dev/null
+++ b/include/ceed/jit-source/magma/magma-basis-interp-3d.h
@@ -0,0 +1,183 @@
+// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED: http://github.com/ceed
+
+/// @file
+/// Internal header for MAGMA tensor basis interpolation in 3D
+#ifndef CEED_MAGMA_BASIS_INTERP_3D_H
+#define CEED_MAGMA_BASIS_INTERP_3D_H
+
+#include "magma-common-tensor.h"
+
+// macros to abstract access of shared memory and reg. file
+#define sT(i, j) sT[(j)*P + (i)]
+#define sTmp(i, j, ldw) sTmp[(j) * (ldw) + (i)]
+
+//////////////////////////////////////////////////////////////////////////////////////////
+// interp basis action (3D)
+template <typename T, int DIM_U, int DIM_V, int NUM_COMP, int P, int Q, int rU_SIZE, int rV_SIZE>
+static __device__ __inline__ void magma_interp_3d_device(const T *sT, magma_trans_t transT, T rU[DIM_U][NUM_COMP][rU_SIZE],
+                                                         T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx, T rTmp[Q], T *swork) {
+  // Assumptions
+  // 1. 1D threads of size max(P,Q)^2
+  // 2. input: rU[DIM_U x NUM_COMP x rU_SIZE] in registers (per thread)
+  // 3. output: rV[DIM_V x NUM_COMP x rV_SIZE] in registers (per thread)
+  // 4. Three products per component
+  // 4.1 Batch P^2 of (1xP) matrices times (PxQ) matrix => Batch P^2 of (1xQ) matrices
+  // 4.2 Batch P of (QxP) matrices times (PxQ) matrix => Batch P of (QxQ) matrices
+  // 4.3 Batch 1 of (Q^2xP) matrix times (PxQ) matrix => (Q^2xQ) matrix
+  // 5. Each thread computes one row of the output of each product
+  // 6. 
Sync is recommended before and after the call + + for (int comp = 0; comp < NUM_COMP; comp++) { + // Batch P^2 of (1xP) matrices [reg] times (PxQ) matrix [shmem] => Batch P^2 of (1xQ) matrices [shmem] + if (tx < (P * P)) { + const int batchid = tx; + const int sld = 1; + T *sTmp = swork + batchid * (1 * Q); + for (int j = 0; j < Q; j++) { + rTmp[0] = 0.0; + for (int i = 0; i < P; i++) { + rTmp[0] += rU[0][comp][i] * sT(i, j); + } + sTmp(0, j, sld) = rTmp[0]; + } + } // end of: if (tx < P*P) + __syncthreads(); + + // Batch P of (QxP) matrices [shmem] times (PxQ) matrix [shmem] => Batch P of (QxQ) matrices [reg] + if (tx < (P * Q)) { + const int batchid = tx / Q; + const int tx_ = tx % Q; + const int sld = Q; + T *sTmp = swork + batchid * (Q * P); // sTmp is input + for (int j = 0; j < Q; j++) { + rTmp[j] = 0.0; + for (int i = 0; i < P; i++) { + rTmp[j] += sTmp(tx_, i, sld) * sT(i, j); + } + } + } + __syncthreads(); + + // write rTmp[] into shmem as batch P of QxQ matrices + if (tx < (P * Q)) { + const int batchid = tx / Q; + const int tx_ = tx % Q; + const int sld = Q; + T *sTmp = swork + batchid * (Q * Q); + for (int j = 0; j < Q; j++) { + sTmp(tx_, j, sld) = rTmp[j]; + } + } + __syncthreads(); + + // Batch 1 of (Q^2xP_) matrices [shmem] times (PxQ) matrix [shmem] => Batch 1 of (Q^2xQ_) matrices [reg] + if (tx < (Q * Q)) { + // No need to declare batchid = (tx / Q^2) = always zero + // No need to declare tx_ = (tx_ % Q^2) = always tx + const int sld = Q * Q; + T *sTmp = swork; + for (int j = 0; j < Q; j++) { + rTmp[0] = 0.0; + for (int i = 0; i < P; i++) { + rTmp[0] += sTmp(tx, i, sld) * sT(i, j); + } + rV[0][comp][j] += rTmp[0]; + } + } + __syncthreads(); + } +} + +////////////////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MAGMA_MAXTHREADS_3D)) __global__ + void magma_interpn_3d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV, + const int cstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + magma_trans_t transT = MagmaNoTrans; + + if (elem_id >= nelem) return; + + CeedScalar rU[1][BASIS_NUM_COMP][BASIS_P] = {0.0}; // for a non-fused operator BASIS_DIM is always 1 + CeedScalar rV[1][BASIS_NUM_COMP][BASIS_Q] = {0.0}; // for a non-fused operator BASIS_DIM is always 1 + CeedScalar rTmp[BASIS_Q] = {0.0}; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sT = (CeedScalar *)shared_data; + CeedScalar *sTmp = sT + BASIS_P * BASIS_Q; + sTmp += ty * (max(BASIS_P * BASIS_P * BASIS_MAX_P_Q, BASIS_P * BASIS_Q * BASIS_Q)); + + // read T + if (ty == 0) { + dread_T_gm2sm(tx, transT, dT, sT); + } + + // read U (idim = 0 for dU, i_DIM = 0 for rU, u_dimstride is always 0) + readU_3d(dU, cstrdU, rU, sTmp, tx); + // there is a sync at the end of this function + + magma_interp_3d_device(sT, transT, rU, rV, tx, rTmp, sTmp); + __syncthreads(); + + // write V + writeV_3d(dV, cstrdV, rV, tx); +} + +////////////////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MAGMA_MAXTHREADS_3D)) __global__ + void magma_interpt_3d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const 
int cstrdU, CeedScalar *dV, const int estrdV, + const int cstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + magma_trans_t transT = MagmaTrans; + + if (elem_id >= nelem) return; + + CeedScalar rU[1][BASIS_NUM_COMP][BASIS_Q] = {0.0}; // for a non-fused operator BASIS_DIM is always 1 + CeedScalar rV[1][BASIS_NUM_COMP][BASIS_P] = {0.0}; // for a non-fused operator BASIS_DIM is always 1 + CeedScalar rTmp[BASIS_P] = {0.0}; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sT = (CeedScalar *)shared_data; + CeedScalar *sTmp = sT + BASIS_Q * BASIS_P; + sTmp += ty * (max(BASIS_Q * BASIS_Q * BASIS_MAX_P_Q, BASIS_Q * BASIS_P * BASIS_P)); + + // read T + if (ty == 0) { + dread_T_gm2sm(tx, transT, dT, sT); + } + + // read V + readV_3d(dV, cstrdV, rV, tx); + + // read U (idim = 0 for dU, i_DIM = 0 for rU, u_dimstride is always 0) + readU_3d(dU, cstrdU, rU, sTmp, tx); + // there is a sync at the end of this function + + magma_interp_3d_device(sT, transT, rU, rV, tx, rTmp, sTmp); + __syncthreads(); + + // write V + writeV_3d(dV, cstrdV, rV, tx); +} + +#endif // CEED_MAGMA_BASIS_INTERP_3D_H diff --git a/include/ceed/jit-source/magma/magma-basis-interp-nontensor.h b/include/ceed/jit-source/magma/magma-basis-interp-nontensor.h new file mode 100644 index 0000000000..956d69392a --- /dev/null +++ b/include/ceed/jit-source/magma/magma-basis-interp-nontensor.h @@ -0,0 +1,96 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for MAGMA non-tensor basis interpolation +#ifndef CEED_MAGMA_INTERP_NONTENSOR_H +#define CEED_MAGMA_INTERP_NONTENSOR_H + +#include "magma-common-nontensor.h" + +//////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__ + void magma_interp_nontensor_n(int n, CeedScalar const *dA, int ldda, CeedScalar const *dB, int lddb, CeedScalar *dC, int lddc) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data); + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int id = blockIdx.x * blockDim.y + ty; + const int nblocks = MAGMA_CEILDIV(n, BASIS_NB_INTERP_N); + const int myn = min(BASIS_NB_INTERP_N, n - id * BASIS_NB_INTERP_N); + + dB += id * BASIS_NB_INTERP_N * lddb; + dC += id * BASIS_NB_INTERP_N * lddc; + + // A is BASIS_P x BASIS_Q + const int slda = BASIS_P; + const int sldb = BASIS_P; + CeedScalar *sA = (CeedScalar *)shared_data; + CeedScalar *sB = sA; + sB += ty * sldb * BASIS_NB_INTERP_N; + + // read A using all threads + CeedScalar rA[BASIS_P] = {MAGMA_D_ZERO}; + read_A_trans_g2r_1D_nosync(tx, ty, dA, ldda, sA, slda, rA); + __syncthreads(); + + // terminate threads with no work + if (id >= nblocks) return; + + // init rC + CeedScalar rC[BASIS_NB_INTERP_N] = {MAGMA_D_ZERO}; + + // read B + read_B_g2s_1D_nosync(tx, myn, dB, lddb, sB, sldb); + __syncthreads(); + + mul_rAsBrC_1D_nosync(tx, rA, sB, sldb, rC); + + // write C + write_C_r2g_1D_nosync(tx, myn, rC, dC, lddc); +} + +//////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__ + void magma_interp_nontensor_t(int n, CeedScalar const *dA, int ldda, CeedScalar const *dB, int lddb, CeedScalar *dC, int lddc) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data); + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int id = blockIdx.x * blockDim.y + ty; + const int nblocks = MAGMA_CEILDIV(n, BASIS_NB_INTERP_T); + const int myn = min(BASIS_NB_INTERP_T, n - id * BASIS_NB_INTERP_T); + + // terminate threads with no work + if (id >= nblocks) return; + + dB += id * BASIS_NB_INTERP_T * lddb; + dC += id * BASIS_NB_INTERP_T * lddc; + + // A is BASIS_P x BASIS_Q + const int sldb = BASIS_Q; + CeedScalar *sB = (CeedScalar *)shared_data; + sB += ty * sldb * BASIS_NB_INTERP_T; + + // init rC + CeedScalar rC[BASIS_NB_INTERP_T] = {MAGMA_D_ZERO}; + + // read A + CeedScalar rA[BASIS_Q] = {MAGMA_D_ZERO}; + read_A_notrans_g2r_1D_nosync(tx, dA, ldda, NULL, 0, rA); + + // read B + read_B_g2s_1D_nosync(tx, myn, dB, lddb, sB, sldb); + __syncthreads(); + + mul_rAsBrC_1D_nosync(tx, rA, sB, sldb, rC); + + // write C + write_C_r2g_1D_nosync(tx, myn, rC, dC, lddc); +} + +#endif // CEED_MAGMA_INTERP_NONTENSOR_H diff --git a/include/ceed/jit-source/magma/weight-1d.h b/include/ceed/jit-source/magma/magma-basis-weight-1d.h similarity index 61% rename from include/ceed/jit-source/magma/weight-1d.h rename to include/ceed/jit-source/magma/magma-basis-weight-1d.h index e4a7abe18c..9f77dd4aca 100644 --- a/include/ceed/jit-source/magma/weight-1d.h +++ b/include/ceed/jit-source/magma/magma-basis-weight-1d.h @@ -5,20 +5,27 @@ // // This file is part of CEED: http://github.com/ceed +/// @file +/// Internal header for MAGMA tensor basis weight in 1D +#ifndef CEED_MAGMA_BASIS_WEIGHT_1D_H +#define 
CEED_MAGMA_BASIS_WEIGHT_1D_H + +#include "magma-common-tensor.h" + ////////////////////////////////////////////////////////////////////////////////////////// // weight basis action -- 1D -template -__device__ __inline__ void magma_weight_1d_device(const T *sTweight, T *sV, const int tx) { +template +static __device__ __inline__ void magma_weight_1d_device(const T *sTweight, T *sV, const int tx) { // Assumptions - // 1. 1D thread configuration of size Q_ - // 2. The output sV is in shared memory -- size 1xQ_ - if (tx < Q_) { + // 1. 1D thread configuration of size Q + // 2. The output sV is in shared memory -- size 1xQ + if (tx < Q) { sV[tx] = sTweight[tx]; } } ////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(Q, MAGMA_MAXTHREADS_1D)) __global__ +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__ void magma_weight_1d_kernel(const CeedScalar *dqweight1d, CeedScalar *dV, const int v_stride, const int nelem) { MAGMA_DEVICE_SHARED(CeedScalar, shared_data) @@ -33,18 +40,20 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(Q, MAGMA_MAXTHREADS_1D)) __globa // shared memory pointers CeedScalar *sTweight = (CeedScalar *)shared_data; - CeedScalar *sV = sTweight + Q; - sV += ty * Q; + CeedScalar *sV = sTweight + BASIS_Q; + sV += ty * BASIS_Q; // read dqweight_1d - if (ty == 0 && tx < Q) { + if (ty == 0 && tx < BASIS_Q) { sTweight[tx] = dqweight1d[tx]; } __syncthreads(); - magma_weight_1d_device(sTweight, sV, tx); + magma_weight_1d_device(sTweight, sV, tx); __syncthreads(); // write V dV[tx] = sV[tx]; } + +#endif // CEED_MAGMA_BASIS_WEIGHT_1D_H diff --git a/include/ceed/jit-source/magma/weight-2d.h b/include/ceed/jit-source/magma/magma-basis-weight-2d.h similarity index 59% rename from include/ceed/jit-source/magma/weight-2d.h rename to include/ceed/jit-source/magma/magma-basis-weight-2d.h index 7ad62d7315..721b50f953 100644 --- a/include/ceed/jit-source/magma/weight-2d.h +++ b/include/ceed/jit-source/magma/magma-basis-weight-2d.h @@ -5,28 +5,35 @@ // // This file is part of CEED: http://github.com/ceed +/// @file +/// Internal header for MAGMA tensor basis weight in 2D +#ifndef CEED_MAGMA_BASIS_WEIGHT_2D_H +#define CEED_MAGMA_BASIS_WEIGHT_2D_H + +#include "magma-common-tensor.h" + ////////////////////////////////////////////////////////////////////////////////////////// // weight basis action -- 2D -template -__device__ __inline__ void magma_weight_2d_device(const T *sTweight, T rV[DIM_][NCOMP_][Q_], const int tx) { +template +static __device__ __inline__ void magma_weight_2d_device(const T *sTweight, T rV[DIM][NUM_COMP][Q], const int tx) { // Assumptions - // 1. 1D thread configuration of size Q_ + // 1. 1D thread configuration of size Q // 2. rV[][][] matches the storage used in other actions (interp, grad, ... etc) - // 3. iDIM and iCOMP specify which indexes to use in rV, - // since the output per thread is a register array of size Q_ + // 3. i_DIM and i_COMP specify which indexes to use in rV, + // since the output per thread is a register array of size Q // 4. 
Sync is recommended after the call (to make sure sTweight can be overwritten) - if (tx < Q_) { + if (tx < Q) { // x sTweight[j] for first update // x sTweight[tx] for second update - for (int j = 0; j < Q_; j++) { - rV[iDIM][iCOMP][j] = sTweight[j] * sTweight[tx]; + for (int j = 0; j < Q; j++) { + rV[i_DIM][i_COMP][j] = sTweight[j] * sTweight[tx]; } } } ////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(Q, MAGMA_MAXTHREADS_2D)) __global__ +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_2D)) __global__ void magma_weight_2d_kernel(const CeedScalar *dqweight1d, CeedScalar *dV, const int v_stride, const int nelem) { MAGMA_DEVICE_SHARED(CeedScalar, shared_data) @@ -36,7 +43,7 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(Q, MAGMA_MAXTHREADS_2D)) __globa if (elem_id >= nelem) return; - CeedScalar rV[1][1][Q]; // allocate with DIM=NCOMP=1, but sizes may differ for a fused operator + CeedScalar rV[1][1][BASIS_Q]; // allocate with BASIS_DIM=BASIS_NUM_COMP=1, but sizes may differ for a fused operator // global memory pointers dV += elem_id * v_stride; @@ -44,17 +51,19 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(Q, MAGMA_MAXTHREADS_2D)) __globa CeedScalar *sTweight = (CeedScalar *)shared_data; // read dqweight_1d - if (ty == 0 && tx < Q) { + if (ty == 0 && tx < BASIS_Q) { sTweight[tx] = dqweight1d[tx]; } __syncthreads(); - magma_weight_2d_device(sTweight, rV, tx); + magma_weight_2d_device(sTweight, rV, tx); // write V - if (tx < Q) { - for (int j = 0; j < Q; j++) { - dV[j * Q + tx] = rV[0][0][j]; + if (tx < BASIS_Q) { + for (int j = 0; j < BASIS_Q; j++) { + dV[j * BASIS_Q + tx] = rV[0][0][j]; } } } + +#endif // CEED_MAGMA_BASIS_WEIGHT_2D_H diff --git a/include/ceed/jit-source/magma/weight-3d.h b/include/ceed/jit-source/magma/magma-basis-weight-3d.h similarity index 55% rename from include/ceed/jit-source/magma/weight-3d.h rename to include/ceed/jit-source/magma/magma-basis-weight-3d.h index 07fc2286ca..835bca44cd 100644 --- a/include/ceed/jit-source/magma/weight-3d.h +++ b/include/ceed/jit-source/magma/magma-basis-weight-3d.h @@ -5,29 +5,36 @@ // // This file is part of CEED: http://github.com/ceed +/// @file +/// Internal header for MAGMA tensor basis weight in 3D +#ifndef CEED_MAGMA_BASIS_WEIGHT_3D_H +#define CEED_MAGMA_BASIS_WEIGHT_3D_H + +#include "magma-common-tensor.h" + ////////////////////////////////////////////////////////////////////////////////////////// // weight basis action -- 3D -template -__device__ __inline__ void magma_weight_3d_device(const T *sTweight, T rV[DIM_][NCOMP_][Q_], const int tx) { +template +static __device__ __inline__ void magma_weight_3d_device(const T *sTweight, T rV[DIM][NUM_COMP][Q], const int tx) { // Assumptions - // 1. 1D thread configuration of size Q_^2 + // 1. 1D thread configuration of size Q^2 // 2. rV[][][] matches the storage used in other actions (interp, grad, ... etc) - // 3. iDIM and iCOMP specify which indexes to use in rV, - // since the output per thread is a register array of size Q_ + // 3. i_DIM and i_COMP specify which indexes to use in rV, + // since the output per thread is a register array of size Q // 4. 
Sync is recommended after the call (to make sure sTweight can be overwritten) - if (tx < (Q_ * Q_)) { + if (tx < Q * Q) { // x sTweight[j] for first update - // x sTweight[tx%Q_] for second update - // x sTweight[tx/Q_] for third update - for (int j = 0; j < Q_; j++) { - rV[iDIM][iCOMP][j] = sTweight[j] * sTweight[tx % Q_] * sTweight[tx / Q_]; + // x sTweight[tx%Q] for second update + // x sTweight[tx/Q] for third update + for (int j = 0; j < Q; j++) { + rV[i_DIM][i_COMP][j] = sTweight[j] * sTweight[tx % Q] * sTweight[tx / Q]; } } } ////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(Q *Q, MAGMA_MAXTHREADS_3D)) __global__ +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q *BASIS_Q, MAGMA_MAXTHREADS_3D)) __global__ void magma_weight_3d_kernel(const CeedScalar *dqweight1d, CeedScalar *dV, const int v_stride, const int nelem) { MAGMA_DEVICE_SHARED(CeedScalar, shared_data) @@ -37,7 +44,7 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(Q *Q, MAGMA_MAXTHREADS_3D)) __gl if (elem_id >= nelem) return; - CeedScalar rV[1][1][Q]; // allocate with DIM=NCOMP=1, but sizes may differ for a fused operator + CeedScalar rV[1][1][BASIS_Q]; // allocate with BASIS_DIM=BASIS_NUM_COMP=1, but sizes may differ for a fused operator // global memory pointers dV += elem_id * v_stride; @@ -45,17 +52,19 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(Q *Q, MAGMA_MAXTHREADS_3D)) __gl CeedScalar *sTweight = (CeedScalar *)shared_data; // read dqweight_1d - if (tx < Q) { + if (tx < BASIS_Q) { sTweight[tx] = dqweight1d[tx]; } __syncthreads(); - magma_weight_3d_device(sTweight, rV, tx); + magma_weight_3d_device(sTweight, rV, tx); // write V - if (tx < (Q * Q)) { - for (int j = 0; j < Q; j++) { - dV[j * (Q * Q) + tx] = rV[0][0][j]; + if (tx < (BASIS_Q * BASIS_Q)) { + for (int j = 0; j < BASIS_Q; j++) { + dV[j * (BASIS_Q * BASIS_Q) + tx] = rV[0][0][j]; } } } + +#endif // CEED_MAGMA_BASIS_WEIGHT_3D_H diff --git a/include/ceed/jit-source/magma/magma-basis-weight-nontensor.h b/include/ceed/jit-source/magma/magma-basis-weight-nontensor.h new file mode 100644 index 0000000000..f6b51380cf --- /dev/null +++ b/include/ceed/jit-source/magma/magma-basis-weight-nontensor.h @@ -0,0 +1,48 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for MAGMA non-tensor basis weight +#ifndef CEED_MAGMA_BASIS_WEIGHT_NONTENSOR_H +#define CEED_MAGMA_BASIS_WEIGHT_NONTENSOR_H + +#include "magma-common-nontensor.h" + +//////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__ + void magma_weight_nontensor(int n, const CeedScalar *dqweight, CeedScalar *dV, int lddv) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data); + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int id = blockIdx.x * blockDim.y + ty; + + // terminate threads with no work + if (id >= n) return; + + dV += id * lddv; + + // shared memory pointers + CeedScalar *sqweight = (CeedScalar *)shared_data; + CeedScalar *sV = sqweight + BASIS_Q; + sV += ty * BASIS_Q; + + // read qweight + if (ty == 0 && tx < BASIS_Q) { + sqweight[tx] = dqweight[tx]; + } + __syncthreads(); + + if (tx < BASIS_Q) { + sV[tx] = sqweight[tx]; + } + + // write V + dV[tx] = sV[tx]; +} + +#endif // CEED_MAGMA_BASIS_WEIGHT_NONTENSOR_H diff --git a/include/ceed/jit-source/magma/magma_common_defs.h b/include/ceed/jit-source/magma/magma-common-defs.h similarity index 65% rename from include/ceed/jit-source/magma/magma_common_defs.h rename to include/ceed/jit-source/magma/magma-common-defs.h index a0cf1f93f6..24684be85e 100644 --- a/include/ceed/jit-source/magma/magma_common_defs.h +++ b/include/ceed/jit-source/magma/magma-common-defs.h @@ -5,6 +5,8 @@ // // This file is part of CEED: http://github.com/ceed +/// @file +/// Internal header for MAGMA backend common definitions #ifndef CEED_MAGMA_COMMON_DEFS_H #define CEED_MAGMA_COMMON_DEFS_H @@ -23,4 +25,14 @@ typedef enum { MagmaNoTrans = 111, MagmaTrans = 112, MagmaConjTrans = 113, Magma #define MAGMA_ROUNDUP(A, B) MAGMA_CEILDIV((A), (B)) * (B) #define MAGMA_MAX(A, B) ((A) > (B) ? (A) : (B)) +#define MAGMA_MAXTHREADS_1D 128 +#define MAGMA_MAXTHREADS_2D 128 +#define MAGMA_MAXTHREADS_3D 64 + +// Define macro for determining number of threads in y-direction for basis kernels +#define MAGMA_BASIS_NTCOL(x, maxt) (((maxt) < (x)) ? 1 : ((maxt) / (x))) + +// Define macro for computing the total threads in a block for use with __launch_bounds__() +#define MAGMA_BASIS_BOUNDS(x, maxt) (x * MAGMA_BASIS_NTCOL(x, maxt)) + #endif // CEED_MAGMA_COMMON_DEFS_H diff --git a/include/ceed/jit-source/magma/magma-common-nontensor.h b/include/ceed/jit-source/magma/magma-common-nontensor.h new file mode 100644 index 0000000000..0e1bbb007b --- /dev/null +++ b/include/ceed/jit-source/magma/magma-common-nontensor.h @@ -0,0 +1,146 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for MAGMA backend common non-tensor basis definitions +#ifndef CEED_MAGMA_COMMON_NONTENSOR_H +#define CEED_MAGMA_COMMON_NONTENSOR_H + +#include "magma-common-defs.h" + +//////////////////////////////////////////////////////////////////////////////// +// read A (no-trans) from global to reg. +// A is (P x Q) +// 1D thread config. 
with (P x 1) threads +// no sync at the end of the function +template +static __device__ __inline__ void read_A_notrans_g2r_1D_nosync(const int tx, const T *dA, int ldda, T *sA, int slda, T rA[Q]) { +#pragma unroll + for (int j = 0; j < Q; j++) { + rA[j] = dA[j * ldda + tx]; + } +} + +//////////////////////////////////////////////////////////////////////////////// +// read A (trans) from global to reg. +// A is (P x Q) +// 1D thread config. with (P x 1) threads +// no sync at the end of the function +template +static __device__ __inline__ void read_A_trans_g2r_1D_nosync(const int tx, const int ty, const T *dA, int ldda, T *sA, int slda, T rA[Q]) { + const int nTH = MAGMA_BASIS_BOUNDS(P, MAGMA_MAXTHREADS_1D); + const int tid = ty * blockDim.x + tx; + int i; + +#pragma unroll + for (i = 0; i < (Q * P) - nTH; i += nTH) { + sA[i + tid] = dA[i + tid]; + } + if (tid < ((Q * P) - i)) { + sA[i + tid] = dA[i + tid]; + } + __syncthreads(); + +#pragma unroll + for (int j = 0; j < Q; j++) { + rA[j] = sA[tx * slda + j]; + } +} + +//////////////////////////////////////////////////////////////////////////////// +// read B from global to shared +// B is (Q x NB) +// 1D thread config. with (P x 1) threads +// no sync at the end of the function +template +static __device__ __inline__ void read_B_g2s_1D_nosync(const int tx, const int n, const T *dB, int lddb, T *sB, int sldb) { + if (n != NB) { + for (int i = 0; i < (Q * n) - P; i += P) { + sB[i + tx] = dB[i + tx]; + } + } else { +#pragma unroll + for (int i = 0; i < (Q * NB) - P; i += P) { + sB[i + tx] = dB[i + tx]; + } + } + + // cleanup for B + const int stride = MAGMA_ROUNDUP(Q * n - P, P); + if (tx < (Q * n) - stride) { + sB[stride + tx] = dB[stride + tx]; + } +} + +//////////////////////////////////////////////////////////////////////////////// +// write C from reg. to global +// C is (P x NB) +// 1D thread config. 
with (P x 1) threads +// no sync at the end of the function +template +static __device__ __inline__ void write_C_r2g_1D_nosync(const int tx, const int n, T rC[NB], T *dC, int lddc) { + if (n != NB) { +#pragma unroll + for (int j = 0; j < NB; j++) { + if (j < n) { + dC[j * lddc + tx] = rC[j]; + } + } + } else { +#pragma unroll + for (int j = 0; j < NB; j++) { + dC[j * lddc + tx] = rC[j]; + } + } +} + +//////////////////////////////////////////////////////////////////////////////// +// multiply C = A x B using 1D threads in P x 1 config +// A (P x Q) in reg., one row per thread +// B (Q x NB) in shared memory +// C in registers -- one row per thread +// no sync at the end of the function +template +static __device__ __inline__ void mul_rAsBrC_1D_nosync(const int tx, T rA[Q], T *sB, int sldb, T rC[NB]) { + T rB[Q]; +#pragma unroll + for (int i = 0; i < NB; i++) { +#pragma unroll + for (int k = 0; k < Q; k++) { + rB[k] = sB[i * sldb + k]; + } + rC[i] = 0.0; +#pragma unroll + for (int k = 0; k < Q; k++) { + rC[i] += rA[k] * rB[k]; + } + } +} + +//////////////////////////////////////////////////////////////////////////////// +// multiply C += A x B using 1D threads in P x 1 config +// A (P x Q) in reg., one row per thread +// B (Q x NB) in shared memory +// C in registers -- one row per thread +// no sync at the end of the function +template +static __device__ __inline__ void addmul_rAsBrC_1D_nosync(const int tx, T rA[Q], T *sB, int sldb, T rC[NB]) { + T rB[Q]; +#pragma unroll + for (int i = 0; i < NB; i++) { +#pragma unroll + for (int k = 0; k < Q; k++) { + rB[k] = sB[i * sldb + k]; + } +#pragma unroll + for (int k = 0; k < Q; k++) { + rC[i] += rA[k] * rB[k]; + } + } +} + +#endif // CEED_MAGMA_COMMON_NONTENSOR_H diff --git a/include/ceed/jit-source/magma/magma-common-tensor.h b/include/ceed/jit-source/magma/magma-common-tensor.h new file mode 100644 index 0000000000..1ca3f52758 --- /dev/null +++ b/include/ceed/jit-source/magma/magma-common-tensor.h @@ -0,0 +1,207 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for MAGMA backend common tensor basis definitions +#ifndef CEED_MAGMA_COMMON_TENSOR_H +#define CEED_MAGMA_COMMON_TENSOR_H + +#include "magma-common-defs.h" + +////////////////////////////////////////////////////////////////////////////////////////// +// read U or V of a 1D element into shared memory sU[][] or sV[][] -- for all components +// the devptr is assumed to point directly to the element +// must sync after call +template +static __device__ __inline__ void read_1d(const T *devptr, const int compstride, T *sBuffer[NUM_COMP], const int tx) { + if (tx < LENGTH) { + for (int comp = 0; comp < NUM_COMP; comp++) { + sBuffer[comp][tx] = devptr[comp * compstride + tx]; + } + } +} + +////////////////////////////////////////////////////////////////////////////////////////// +// write V of a 1D element into global memory from sV[][] -- for all components +// the devptr is assumed to point directly to the element +template +static __device__ __inline__ void write_1d(T *sBuffer[NUM_COMP], T *devptr, const int compstride, const int tx) { + if (tx < LENGTH) { + for (int comp = 0; comp < NUM_COMP; comp++) { + devptr[comp * compstride + tx] = sBuffer[comp][tx]; + } + } +} + +////////////////////////////////////////////////////////////////////////////////////////// +// read U of a 2D element into registers rU[][][] -- for all components of a single dim +// dU is assumed to be offset by elem-stride and dim-stride +// register is assumed to be rU[DIM_U][NUM_COMP][rU_SIZE] +// i_DIM specifies which dimension is being read into in rU +// rU_SIZE can be different from P (e.g. MAXP_Q) +// sTmp is a shared memory workspace of size P^2 +template +static __device__ __inline__ void readU_2d(const T *dU, const int compstride, T rU[DIM_U][NUM_COMP][rU_SIZE], T *sTmp, const int tx) { + // read U as a batch P of (1 x P_) vectors + // vec 0 : [u0, u1, u2, ... u_(P-1)] -- contiguous in memory + // vec 1 : [u0, u1, u2, ... u_(P-1)] -- contiguous in memory + // ... + // vec P-1: [u0, u1, u2, ... u_(P-1)] -- contiguous in memory + // threads collaboratively read vec0 and then vec1 and so on + // but for the kernel, we want + // thread 0 to hold all of vec0 in registers, and + // thread 1 to hold all of vec1 in registers, and and so on + // so we need to transpose + for (int comp = 0; comp < NUM_COMP; comp++) { + // read from global memory into shared memory + if (tx < P) { + for (int i = 0; i < P; i++) { + sTmp[i * P + tx] = dU[comp * compstride + i * P + tx]; + } + } + __syncthreads(); + + if (tx < P) { + for (int i = 0; i < P; i++) { + rU[i_DIM][comp][i] = sTmp[tx * P + i]; + } + } + __syncthreads(); + } +} + +////////////////////////////////////////////////////////////////////////////////////////// +// read V of a 2D element into registers rV[][][] -- for all components of a single dim +// dV is assumed to be offset by elem-stride and dim-stride +// register is assumed to be rV[DIM_V][NUM_COMP][rV_SIZE] +// i_DIM specifies which dimension is being read into in rV +// rV_SIZE can be different from P (e.g. 
MAXP_Q) +template +static __device__ __inline__ void readV_2d(const T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) { + if (tx < Q) { + for (int comp = 0; comp < NUM_COMP; comp++) { + for (int j = 0; j < Q; j++) { + rV[i_DIM][comp][j] = dV[comp * compstride + j * Q + tx]; + } + } + } +} + +////////////////////////////////////////////////////////////////////////////////////////// +// write V of a 2D element from registers rV[][][] to global memory -- for all components of a single dim +// dV is assumed to be offset by elem-stride and dim-stride +// register is assumed to be rV[DIM_V][NUM_COMP][rV_SIZE] +// i_DIM specifies which dimension is being read from in rV +// idim specifies which dimension is being written to in dV +// rV_SIZE can be different from P (e.g. MAXP_Q) +template +static __device__ __inline__ void writeV_2d(T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) { + if (tx < Q) { + for (int comp = 0; comp < NUM_COMP; comp++) { + for (int j = 0; j < Q; j++) { + dV[comp * compstride + j * Q + tx] = rV[i_DIM][comp][j]; + } + } + } +} + +////////////////////////////////////////////////////////////////////////////////////////// +// read U of a 3D element into registers rU[][][] -- for all components of a single dim +// dU is assumed to be offset by elem-stride and dim-stride +// register is assumed to be rU[DIM_U][NUM_COMP][rU_SIZE] +// i_DIM specifies which dimension is being read into in rU +// rU_SIZE can be different from P (e.g. MAXP_Q) +// sTmp is a shared memory workspace of size P^3 +template +static __device__ __inline__ void readU_3d(const T *dU, const int compstride, T rU[DIM_U][NUM_COMP][rU_SIZE], T *sTmp, const int tx) { + // read U as a batch P^2 of (1 x P_) vectors + // vec 0 : [u0, u1, u2, ... u_(P-1)] -- contiguous in memory + // vec 1 : [u0, u1, u2, ... u_(P-1)] -- contiguous in memory + // ... + // vec P^2-1: [u0, u1, u2, ... u_(P-1)] -- contiguous in memory + // threads collaboratively read vec0 and then vec1 and so on + // but for the kernel, we want + // thread 0 to hold all of vec0 in registers, and + // thread 1 to hold all of vec1 in registers, and and so on + // so we need to transpose + for (int comp = 0; comp < NUM_COMP; comp++) { + // read from global memory into shared memory + if (tx < P * P) { + for (int i = 0; i < P; i++) { + sTmp[i * P * P + tx] = dU[comp * compstride + i * P * P + tx]; + } + } + __syncthreads(); + + if (tx < P * P) { + for (int i = 0; i < P; i++) { + rU[i_DIM][comp][i] = sTmp[tx * P + i]; + } + } + __syncthreads(); + } +} + +////////////////////////////////////////////////////////////////////////////////////////// +// read V of a 3D element into registers rV[][][] -- for all components of a single dim +// dV is assumed to be offset by elem-stride and dim-stride +// register is assumed to be rV[DIM_V][NUM_COMP][rV_SIZE] +// i_DIM specifies which dimension is being read into in rV +// rV_SIZE can be different from P (e.g. 
MAXP_Q) +template +static __device__ __inline__ void readV_3d(const T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) { + if (tx < Q * Q) { + for (int comp = 0; comp < NUM_COMP; comp++) { + for (int j = 0; j < Q; j++) { + rV[i_DIM][comp][j] = dV[comp * compstride + j * (Q * Q) + tx]; + } + } + } +} + +////////////////////////////////////////////////////////////////////////////////////////// +// write V of a 3D element from registers rV[][][] to global memory -- for all components of a single dim +// dV is assumed to point directly to the element (i.e. already offset by elem-stride) +// register is assumed to be rV[DIM_V][NUM_COMP][rV_SIZE] +// i_DIM specifies which dimension is being read from in rV +// idim specifies which dimension is being written to in dV +// rV_SIZE can be different from P (e.g. MAXP_Q) +template +static __device__ __inline__ void writeV_3d(T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) { + if (tx < (Q * Q)) { + for (int comp = 0; comp < NUM_COMP; comp++) { + for (int j = 0; j < Q; j++) { + dV[comp * compstride + j * (Q * Q) + tx] = rV[i_DIM][comp][j]; + } + } + } +} + +////////////////////////////////////////////////////////////////////////////////////////// +// reads T into shared memory +// must sync after call +template +static __device__ __inline__ void dread_T_gm2sm(const int tx, const magma_trans_t transT, const CeedScalar *dT, CeedScalar *sT) { + if (transT == MagmaNoTrans) { + // T is B x J + if (tx < B) { + for (int i = 0; i < J; i++) { + sT[i * B + tx] = dT[i * B + tx]; + } + } + } else { + // T is J x B + if (tx < J) { + for (int i = 0; i < B; i++) { + sT[tx * B + i] = dT[i * J + tx]; + } + } + } + // must sync after call +} + +#endif // CEED_MAGMA_COMMON_TENSOR_H diff --git a/include/ceed/jit-source/magma/magma_common_nontensor.h b/include/ceed/jit-source/magma/magma_common_nontensor.h deleted file mode 100644 index edfd805db8..0000000000 --- a/include/ceed/jit-source/magma/magma_common_nontensor.h +++ /dev/null @@ -1,166 +0,0 @@ -// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#ifndef CEED_MAGMA_COMMON_NONTENSOR_H -#define CEED_MAGMA_COMMON_NONTENSOR_H - -#define NONTENSOR_MAX_THREADS (128) - -#ifndef MAGMA_DEVICE_SHARED -#define MAGMA_DEVICE_SHARED -#ifdef CEED_MAGMA_USE_HIP -#define MAGMA_DEVICE_SHARED(type, name) HIP_DYNAMIC_SHARED(type, name) -#else -#define MAGMA_DEVICE_SHARED(type, name) extern __shared__ type name[]; -#endif // CEED_MAGMA_USE_HIP -#endif // MAGMA_DEVICE_SHARED - -#define MAGMA_NONTENSOR_BASIS_NTCOL(N) (MAGMA_MAX(1, (NONTENSOR_MAX_THREADS / (N)))) - -#define dA(i, j) dA[(j)*ldda + (i)] -#define sA(i, j) sA[(j)*slda + (i)] -#define dB(i, j) dB[(j)*lddb + (i)] -#define sB(i, j) sB[(j)*sldb + (i)] - -//////////////////////////////////////////////////////////////////////////////// -// read C from global to reg. -// C is (P_ x NB_) -// 1D thread config. with (Mx1) threads -// no sync at the end of the function -template -static __device__ __inline__ void read_C_g2r_1D_nosync(const int tx, const int n, T *dC, int lddc, const T &beta, T rC[NB_]) { - if (n != NB_) { -#pragma unroll - for (int j = 0; j < NB_; j++) { - rC[j] = (j < n) ? 
beta * dC[j * lddc + tx] : 0; - } - } else { -#pragma unroll - for (int j = 0; j < NB_; j++) { - rC[j] = beta * dC[j * lddc + tx]; - } - } -} - -//////////////////////////////////////////////////////////////////////////////// -// write C from reg. to global -// C is (P_ x NB_) -// 1D thread config. with (Mx1) threads -// no sync at the end of the function -template -static __device__ __inline__ void write_C_r2g_1D_nosync(const int tx, const int n, T rC[NB_], T *dC, int lddc) { - if (n != NB_) { -#pragma unroll - for (int j = 0; j < NB_; j++) { - if (j < n) { - dC[j * lddc + tx] = rC[j]; - } - } - } else { -#pragma unroll - for (int j = 0; j < NB_; j++) { - dC[j * lddc + tx] = rC[j]; - } - } -} - -//////////////////////////////////////////////////////////////////////////////// -// read A (no-trans) from global to reg. -// A is (P_ x Q_) -// 1D thread config. with (Mx1) threads -// no sync at the end of the function -template -static __device__ __inline__ void read_A_notrans_g2r_1D_nosync(const int tx, const T *dA, int ldda, T *sA, int slda, T rA[Q_]) { -#pragma unroll - for (int j = 0; j < Q_; j++) { - rA[j] = dA(tx, j); - } -} - -//////////////////////////////////////////////////////////////////////////////// -// read A (no-trans) from global to reg. -// A is (P_ x Q_) -// 1D thread config. with (Mx1) threads -// no sync at the end of the function -template -static __device__ __inline__ void read_A_trans_g2r_1D_nosync(const int tx, const int ty, const T *dA, int ldda, T *sA, int slda, T rA[Q_]) { - int ix = 0; - const int nTH = P_ * MAGMA_NONTENSOR_BASIS_NTCOL(P_); - const int tid = ty * blockDim.x + tx; - -#pragma unroll - for (ix = 0; ix < (Q_ * P_) - nTH; ix += nTH) { - sA[ix + tid] = dA[ix + tid]; - } - - if (tid < ((Q_ * P_) - ix)) { - sA[ix + tid] = dA[ix + tid]; - } - __syncthreads(); - -#pragma unroll - for (int j = 0; j < Q_; j++) { - rA[j] = sA[tx * slda + j]; - } -} - -//////////////////////////////////////////////////////////////////////////////// -// read B from global to shared -// B is (Q_ x NB_) -// 1D thread config. 
with (Mx1) threads -// no sync at the end of the function -template -static __device__ __inline__ void read_B_g2s_1D_nosync(const int tx, int n, const T *dB, int lddb, T *sB, int sldb) { - if (n != NB_) { - for (int i = 0; i < (Q_ * n) - P_; i += P_) { - sB[i + tx] = dB[i + tx]; - } - } else { -#pragma unroll - for (int i = 0; i < (Q_ * NB_) - P_; i += P_) { - sB[i + tx] = dB[i + tx]; - } - } - - // cleanup for B - const int stride = MAGMA_ROUNDUP(Q_ * n - P_, P_); - if (tx < (Q_ * n) - stride) { - sB[stride + tx] = dB[stride + tx]; - } -} - -//////////////////////////////////////////////////////////////////////////////// -// multiply C = AxB using 1D threads in Mx1 config -// A (MxK) in reg., one row per thread -// B (KxNB) in shared memory -// C in registers -- one row per thread -// no sync at the end of the function -template -static __device__ __inline__ void mul_rAsBrC_1D_nosync(const int tx, const T &alpha, T rA[Q_], T *sB, int sldb, T rC[NB_]) { - T rB[Q_] = {0}; -#pragma unroll - for (int i = 0; i < NB_; i++) { -#pragma unroll - for (int k = 0; k < Q_; k++) { - rB[k] = sB[i * sldb + k]; - } - - T rTmp = 0; -#pragma unroll - for (int k = 0; k < Q_; k++) { - rTmp += rA[k] * rB[k]; - } - rC[i] += alpha * rTmp; - } -} - -#undef dA -#undef sA -#undef dB -#undef sB - -#endif // CEED_MAGMA_COMMON_NONTENSOR_H diff --git a/include/ceed/jit-source/magma/magma_common_tensor.h b/include/ceed/jit-source/magma/magma_common_tensor.h deleted file mode 100644 index 48ad0fa195..0000000000 --- a/include/ceed/jit-source/magma/magma_common_tensor.h +++ /dev/null @@ -1,284 +0,0 @@ -// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#ifndef CEED_MAGMA_COMMON_TENSOR_H -#define CEED_MAGMA_COMMON_TENSOR_H - -#define MAGMA_MAXTHREADS_1D 128 -#define MAGMA_MAXTHREADS_2D 128 -#define MAGMA_MAXTHREADS_3D 64 -// Define macro for determining number of threads in y-direction -// for basis kernels -#define MAGMA_BASIS_NTCOL(x, maxt) (((maxt) < (x)) ? 
1 : ((maxt) / (x))) -// Define macro for computing the total threads in a block -// for use with __launch_bounds__() -#define MAGMA_BASIS_BOUNDS(x, maxt) (x * MAGMA_BASIS_NTCOL(x, maxt)) - -////////////////////////////////////////////////////////////////////////////////////////// -// read U or V of a 1D element into shared memory sU[][] or sV[][] -- for all components -// the devptr is assumed to point directly to the element -// must sync after call -template -__device__ __inline__ void read_1d(const T *devptr, const int compstride, T *sBuffer[NCOMP_], const int tx) { - if (tx < LENGTH) { - for (int icomp = 0; icomp < NCOMP_; icomp++) { - sBuffer[icomp][tx] = devptr[icomp * compstride + tx]; - } - } -} - -////////////////////////////////////////////////////////////////////////////////////////// -// write V of a 1D element into global memory from sV[][] -- for all components -// the devptr is assumed to point directly to the element -template -__device__ __inline__ void write_1d(T *sBuffer[NCOMP_], T *devptr, const int compstride, const int tx) { - if (tx < LENGTH) { - for (int icomp = 0; icomp < NCOMP_; icomp++) { - devptr[icomp * compstride + tx] = sBuffer[icomp][tx]; - } - } -} - -////////////////////////////////////////////////////////////////////////////////////////// -// read U of a 2D element into registers rU[][][] -- for all components of a single dim -// dU is assumed to be offset by elem-stride and dim-stride -// register is assumed to be rU[DIMU][NCOMP_][rUsize] -// iDIM specifies which dimension is being read into in rU -// rUsize can be different from P_ (e.g. MAXP_Q) -// sTmp is a shared memory workspace of size P_^2 -template -__device__ __inline__ void readU_2d(const T *dU, const int compstride, T rU[DIMU][NCOMP_][rUsize], T *sTmp, const int tx) { - // read U as a batch P_ of (1xP_) vectors - // vec 0 : [u0, u1, u2, ... u_(P_-1)] -- contiguous in memory - // vec 1 : [u0, u1, u2, ... u_(P_-1)] -- contiguous in memory - // ... - // vec P_-1: [u0, u1, u2, ... u_(P_-1)] -- contiguous in memory - // threads collaboratively read vec0 and then vec1 and so on - // but for the kernel, we want - // thread 0 to hold all of vec0 in registers, and - // thread 1 to hold all of vec1 in registers, and and so on - // so we need to transpose - for (int icomp = 0; icomp < NCOMP_; icomp++) { - // read from global memory into shared memory - if (tx < P_) { - for (int i = 0; i < P_; i++) { - sTmp[i * P_ + tx] = dU[icomp * compstride + i * P_ + tx]; - } - } - __syncthreads(); - - if (tx < P_) { - for (int i = 0; i < P_; i++) { - rU[iDIM][icomp][i] = sTmp[tx * P_ + i]; - } - } - __syncthreads(); - } -} - -////////////////////////////////////////////////////////////////////////////////////////// -// read V of a 2D element into registers rV[][][] -- for all components of a single dim -// dV is assumed to be offset by elem-stride and dim-stride -// register is assumed to be rV[DIMV][NCOMP_][rVsize] -// iDIM specifies which dimension is being read into in rV -// rVsize can be different from P_ (e.g. 
MAXP_Q) -template -__device__ __inline__ void readV_2d(const T *dV, const int compstride, T rV[DIMV][NCOMP_][rVsize], const int tx) { - if (tx < Q_) { - for (int icomp = 0; icomp < NCOMP_; icomp++) { - for (int j = 0; j < Q_; j++) { - rV[iDIM][icomp][j] = dV[icomp * compstride + j * Q_ + tx]; - } - } - } -} - -////////////////////////////////////////////////////////////////////////////////////////// -// write V of a 2D element from registers rV[][][] to global memory -- for all components of a single dim -// dV is assumed to be offset by elem-stride and dim-stride -// register is assumed to be rV[DIMV][NCOMP_][rVsize] -// iDIM specifies which dimension is being read from in rV -// idim specifies which dimension is being written to in dV -// rVsize can be different from P_ (e.g. MAXP_Q) -template -__device__ __inline__ void writeV_2d(T *dV, const int compstride, T rV[DIMV][NCOMP_][rVsize], const int tx) { - if (tx < Q_) { - for (int icomp = 0; icomp < NCOMP_; icomp++) { - for (int j = 0; j < Q_; j++) { - dV[icomp * compstride + j * Q_ + tx] = rV[iDIM][icomp][j]; - } - } - } -} - -////////////////////////////////////////////////////////////////////////////////////////// -// read U of a 3D element into registers rU[][][] -- for all components of a single dim -// dU is assumed to be offset by elem-stride and dim-stride -// register is assumed to be rU[DIMU][NCOMP_][rUsize] -// iDIM specifies which dimension is being read into in rU -// rUsize can be different from P_ (e.g. MAXP_Q) -// sTmp is a shared memory workspace of size P_^3 -template -__device__ __inline__ void readU_3d(const T *dU, const int compstride, T rU[DIMU][NCOMP_][rUsize], T *sTmp, const int tx) { - // read U as a batch P_^2 of (1xP_) vectors - // vec 0 : [u0, u1, u2, ... u_(P_-1)] -- contiguous in memory - // vec 1 : [u0, u1, u2, ... u_(P_-1)] -- contiguous in memory - // ... - // vec P_^2-1: [u0, u1, u2, ... u_(P_-1)] -- contiguous in memory - // threads collaboratively read vec0 and then vec1 and so on - // but for the kernel, we want - // thread 0 to hold all of vec0 in registers, and - // thread 1 to hold all of vec1 in registers, and and so on - // so we need to transpose - for (int icomp = 0; icomp < NCOMP_; icomp++) { - // read from global memory into shared memory - if (tx < P_ * P_) { - for (int i = 0; i < P_; i++) { - sTmp[i * P_ * P_ + tx] = dU[icomp * compstride + i * P_ * P_ + tx]; - } - } - __syncthreads(); - - if (tx < P_ * P_) { - for (int i = 0; i < P_; i++) { - rU[iDIM][icomp][i] = sTmp[tx * P_ + i]; - } - } - __syncthreads(); - } -} - -////////////////////////////////////////////////////////////////////////////////////////// -// read V of a 3D element into registers rV[][][] -- for all components of a single dim -// dV is assumed to be offset by elem-stride and dim-stride -// register is assumed to be rV[DIMV][NCOMP_][rVsize] -// iDIM specifies which dimension is being read into in rV -// rVsize can be different from P_ (e.g. MAXP_Q) -template -__device__ __inline__ void readV_3d(const T *dV, const int compstride, T rV[DIMV][NCOMP_][rVsize], const int tx) { - if (tx < Q_ * Q_) { - for (int icomp = 0; icomp < NCOMP_; icomp++) { - for (int j = 0; j < Q_; j++) { - rV[iDIM][icomp][j] = dV[icomp * compstride + j * (Q_ * Q_) + tx]; - } - } - } -} - -////////////////////////////////////////////////////////////////////////////////////////// -// write V of a 3D element from registers rV[][][] to global memory -- for all components of a single dim -// dV is assumed to point directly to the element (i.e. 
already offset by elem-stride) -// register is assumed to be rV[DIMV][NCOMP_][rVsize] -// iDIM specifies which dimension is being read from in rV -// idim specifies which dimension is being written to in dV -// rVsize can be different from P_ (e.g. MAXP_Q) -template -__device__ __inline__ void writeV_3d(T *dV, const int compstride, T rV[DIMV][NCOMP_][rVsize], const int tx) { - if (tx < (Q_ * Q_)) { - for (int icomp = 0; icomp < NCOMP_; icomp++) { - for (int j = 0; j < Q_; j++) { - dV[icomp * compstride + j * (Q_ * Q_) + tx] = rV[iDIM][icomp][j]; - } - } - } -} - -////////////////////////////////////////////////////////////////////////////////////////// -// reads T into shared memory -// must sync after call -template -__device__ __inline__ void dread_T_gm2sm(const int tx, const magma_trans_t transT, const CeedScalar *dT, CeedScalar *sT) { - if (transT == MagmaNoTrans) { - // T is B x J - if (tx < B) { - for (int i = 0; i < J; i++) { - sT[i * B + tx] = dT[i * B + tx]; - } - } - } else { - // T is J x B - if (tx < J) { - for (int i = 0; i < B; i++) { - sT[tx * B + i] = dT[i * J + tx]; - } - } - } - // must sync after call -} - -////////////////////////////////////////////////////////////////////////////////////////// -// reads a slice of U from shared/global memory into registers -// the correct pointer U must be precomputed -template -__device__ __inline__ void dread_U_gsm2reg(const int C, const int tx_, const CeedScalar *U, CeedScalar rU[B]) { - for (int i = 0; i < B; i++) { - rU[i] = U[i * C + tx_]; - } -} - -////////////////////////////////////////////////////////////////////////////////////////// -// reads a slice of V from shared/global memory into registers with scaling -// the correct pointer V must be precomputed -template -__device__ __inline__ void dread_V_gsm2reg(const int C, const int tx_, const CeedScalar *V, CeedScalar rV[J]) { - for (int i = 0; i < J; i++) { - rV[i] = V[i * C + tx_]; - } -} - -////////////////////////////////////////////////////////////////////////////////////////// -// writes a slice of V from reg to shared/global memory -// the correct pointer V must be precomputed -template -__device__ __inline__ void dwrite_V_reg2gsm(const int C, const int tx_, CeedScalar rV[J], CeedScalar *V) { - for (int i = 0; i < J; i++) { - V[i * C + tx_] = rV[i]; - } -} - -////////////////////////////////////////////////////////////////////////////////////////// -// multiply a slice of U times T to produce a slice of V -template -__device__ __inline__ void dgemm_slice(CeedScalar alpha, CeedScalar *sT, CeedScalar rU[B], CeedScalar beta, CeedScalar rV[J]) { - CeedScalar rTmp; - for (int j = 0; j < J; j++) { - rTmp = 0.0; - for (int b = 0; b < B; b++) { - rTmp += rU[b] * sT[j * B + b]; - } - rV[j] *= beta; - rV[j] += alpha * rTmp; - } -} - -////////////////////////////////////////////////////////////////////////////////////////// -template -__device__ __inline__ void dgemm_ceed_device(const int tx, const int A, const int C, magma_trans_t transT, CeedScalar *sT, const CeedScalar alpha, - const CeedScalar beta, const CeedScalar *dU, CeedScalar *dV, CeedScalar rU[B], CeedScalar rV[J]) { - const int tx_ = tx % C; - const int slice_id = tx / C; - - // advance pointers for U and V - dU += slice_id * C * B; - dV += slice_id * C * J; - - // read V if beta is non-zero - if (beta != 0.0) { - dread_V_gsm2reg(C, tx_, (const CeedScalar *)dV, rV); - } - - // read U - dread_U_gsm2reg(C, tx_, dU, rU); - - // multiply - dgemm_slice(alpha, sT, rU, beta, rV); - - // write V back - dwrite_V_reg2gsm(C, 
tx_, rV, dV); -} - -#endif // CEED_MAGMA_COMMON_TENSOR_H
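One observation on the non-tensor cleanup visible in this diff: the deleted helpers carried generic alpha/beta GEMM scaling (read_C_g2r_1D_nosync pre-scaled C by beta, mul_rAsBrC_1D_nosync applied alpha), while the new magma-common-nontensor.h keeps only the two cases the basis kernels use. The sketch below is an editorial reading of that correspondence, not something stated in the code; names rA, sB, rC, sldb follow the headers, with alpha fixed at 1 and beta at 0 (mul_*) or 1 (addmul_*):

// Editorial sketch: the specialized multiply both new helpers implement.
// accumulate == false corresponds to mul_rAsBrC_1D_nosync (rC overwritten),
// accumulate == true to addmul_rAsBrC_1D_nosync (rC pre-loaded and accumulated).
template <int Q, int NB>
static void ref_mul_rAsBrC(const double rA[Q], const double *sB, int sldb, double rC[NB], bool accumulate) {
  for (int i = 0; i < NB; i++) {
    double acc = 0.0;
    for (int k = 0; k < Q; k++) acc += rA[k] * sB[i * sldb + k];
    rC[i] = accumulate ? rC[i] + acc : acc;
  }
}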