diff --git a/Makefile b/Makefile
index fbb9ba415c..7c9174a6cb 100644
--- a/Makefile
+++ b/Makefile
@@ -275,8 +275,6 @@ cuda-gen.cu := $(sort $(wildcard backends/cuda-gen/kernels/*.cu))
 occa.cpp := $(sort $(shell find backends/occa -type f -name *.cpp))
 magma.c := $(sort $(wildcard backends/magma/*.c))
 magma.cpp := $(sort $(wildcard backends/magma/*.cpp))
-magma.cu := $(sort $(wildcard backends/magma/kernels/cuda/*.cu))
-magma.hip := $(sort $(wildcard backends/magma/kernels/hip/*.hip.cpp))
 hip.c := $(sort $(wildcard backends/hip/*.c))
 hip.cpp := $(sort $(wildcard backends/hip/*.cpp))
 hip-ref.c := $(sort $(wildcard backends/hip-ref/*.c))
@@ -491,10 +489,8 @@ ifneq ($(wildcard $(MAGMA_DIR)/lib/libmagma.*),)
   PKG_LIBS += $(magma_link)
   libceed.c += $(magma.c)
   libceed.cpp += $(magma.cpp)
-  libceed.cu += $(magma.cu)
   $(magma.c:%.c=$(OBJDIR)/%.o) $(magma.c:%=%.tidy) : CPPFLAGS += -DADD_ -I$(MAGMA_DIR)/include -I$(CUDA_DIR)/include
   $(magma.cpp:%.cpp=$(OBJDIR)/%.o) $(magma.cpp:%=%.tidy) : CPPFLAGS += -DADD_ -I$(MAGMA_DIR)/include -I$(CUDA_DIR)/include
-  $(magma.cu:%.cu=$(OBJDIR)/%.o) : CPPFLAGS += --compiler-options=-fPIC -DADD_ -I$(MAGMA_DIR)/include -I$(MAGMA_DIR)/magmablas -I$(CUDA_DIR)/include
   MAGMA_BACKENDS = /gpu/cuda/magma /gpu/cuda/magma/det
   endif
 else  # HIP MAGMA
@@ -507,10 +503,8 @@ ifneq ($(wildcard $(MAGMA_DIR)/lib/libmagma.*),)
   PKG_LIBS += $(magma_link)
   libceed.c += $(magma.c)
   libceed.cpp += $(magma.cpp)
-  libceed.hip += $(magma.hip)
   $(magma.c:%.c=$(OBJDIR)/%.o) $(magma.c:%=%.tidy) : CPPFLAGS += $(HIPCONFIG_CPPFLAGS) -I$(MAGMA_DIR)/include -I$(ROCM_DIR)/include -DCEED_MAGMA_USE_HIP -DADD_
   $(magma.cpp:%.cpp=$(OBJDIR)/%.o) $(magma.cpp:%=%.tidy) : CPPFLAGS += $(HIPCONFIG_CPPFLAGS) -I$(MAGMA_DIR)/include -I$(ROCM_DIR)/include -DCEED_MAGMA_USE_HIP -DADD_
-  $(magma.hip:%.hip.cpp=$(OBJDIR)/%.o) : CPPFLAGS += -I$(MAGMA_DIR)/include -I$(MAGMA_DIR)/magmablas -I$(ROCM_DIR)/include -DCEED_MAGMA_USE_HIP -DADD_
   MAGMA_BACKENDS = /gpu/hip/magma /gpu/hip/magma/det
   endif
 endif
diff --git a/backends/magma/ceed-magma-basis.c b/backends/magma/ceed-magma-basis.c
index 3cb651f274..7023544b1b 100644
--- a/backends/magma/ceed-magma-basis.c
+++ b/backends/magma/ceed-magma-basis.c
@@ -20,57 +20,56 @@
 #include "ceed-magma-common.h"
 #include "ceed-magma.h"

-#ifdef __cplusplus
-CEED_INTERN "C"
-#endif
-    int
-    CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector U, CeedVector V) {
+#include "ceed-magma-gemm-nontensor.h"
+#include "ceed-magma-gemm-selector.h"
+
+//------------------------------------------------------------------------------
+// Basis apply - tensor
+//------------------------------------------------------------------------------
+static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u, CeedVector v) {
   Ceed ceed;
   Ceed_Magma *data;
-  CeedInt dim, num_comp, num_dof, P_1d, Q_1d;
-  const CeedScalar *du;
-  CeedScalar *dv;
+  CeedInt dim, num_comp, num_nodes, P_1d, Q_1d, P, Q;
+  const CeedScalar *d_u;
+  CeedScalar *d_v;
   CeedBasis_Magma *impl;

   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
-  CeedCallBackend(CeedBasisGetDimension(basis, &dim));
-  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-  CeedCallBackend(CeedBasisGetNumNodes(basis, &num_dof));
-  CeedCallBackend(CeedGetData(ceed, &data));
-
-  if (U != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du));
-  else CeedCheck(e_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &dv));
-  CeedCallBackend(CeedBasisGetData(basis, &impl));
-
+  CeedCallBackend(CeedBasisGetDimension(basis, &dim));
+  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+  CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes));
   CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
+  P = P_1d;
+  Q = Q_1d;
+  if (t_mode == CEED_TRANSPOSE) {
+    P = Q_1d;
+    Q = P_1d;
+  }

-  CeedDebug256(ceed, 4, "[CeedBasisApply_Magma] vsize=%" CeedInt_FMT ", comp = %" CeedInt_FMT, num_comp * CeedIntPow(P_1d, dim), num_comp);
+  // Read vectors
+  if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
+  else CeedCheck(e_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
+  CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));

+  // Clear v for transpose operation
   if (t_mode == CEED_TRANSPOSE) {
     CeedSize length;

-    CeedCallBackend(CeedVectorGetLength(V, &length));
+    CeedCallBackend(CeedVectorGetLength(v, &length));
     if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
-      magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *)dv, length, data->queue);
+      magmablas_slaset(MagmaFull, length, 1, 0.0, 0.0, (float *)d_v, length, data->queue);
     } else {
-      magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *)dv, length, data->queue);
+      magmablas_dlaset(MagmaFull, length, 1, 0.0, 0.0, (double *)d_v, length, data->queue);
     }
     ceed_magma_queue_sync(data->queue);
   }

+  // Apply basis operation
   switch (e_mode) {
     case CEED_EVAL_INTERP: {
-      CeedInt P = P_1d, Q = Q_1d;
-
-      if (t_mode == CEED_TRANSPOSE) {
-        P = Q_1d;
-        Q = P_1d;
-      }
-
       // Define element sizes for dofs/quad
       CeedInt elem_qpts_size = CeedIntPow(Q_1d, dim);
       CeedInt elem_dofs_size = CeedIntPow(P_1d, dim);
@@ -81,7 +80,7 @@
       //      node                            node

       // --- Define strides for NOTRANSPOSE mode: ---
-      // Input (du) is E-vector, output (dv) is Q-vector
+      // Input (d_u) is E-vector, output (d_v) is Q-vector

       // Element strides
       CeedInt u_elem_stride = elem_dofs_size;
@@ -89,10 +88,8 @@
       // Component strides
       CeedInt u_comp_stride = num_elem * elem_dofs_size;
       CeedInt v_comp_stride = num_elem * elem_qpts_size;
-
-      // --- Swap strides for TRANSPOSE mode: ---
       if (t_mode == CEED_TRANSPOSE) {
-        // Input (du) is Q-vector, output (dv) is E-vector
+        // Input (d_u) is Q-vector, output (d_v) is E-vector
         // Element strides
         v_elem_stride = elem_dofs_size;
         u_elem_stride = elem_qpts_size;
@@ -115,42 +112,37 @@
         case 2:
           num_threads = max_P_Q;
           num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_2D);
-          shared_mem += P * Q * sizeof(CeedScalar);                      // for sT
-          shared_mem += num_t_col * (P * max_P_Q * sizeof(CeedScalar));  // for reforming rU we need PxP, and for the intermediate output we need PxQ
+          shared_mem += P * Q * sizeof(CeedScalar);  // for sT
+          // for reforming rU we need P x P, and for the intermediate output we need P x Q
+          shared_mem += num_t_col * (P * max_P_Q * sizeof(CeedScalar));
           break;
         case 3:
           num_threads = max_P_Q * max_P_Q;
           num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_3D);
           shared_mem += sizeof(CeedScalar) * (P * Q);  // for sT
-          shared_mem += sizeof(CeedScalar) * num_t_col *
-                        (CeedIntMax(P * P * max_P_Q,
-                                    P * Q * Q));  // rU needs P^2xP, the intermediate output needs max(P^2xQ,PQ^2)
+          // rU needs P^2 x P, the intermediate output needs max(P^2 x Q, P x Q^2)
+          shared_mem += sizeof(CeedScalar) * num_t_col * (CeedIntMax(P * P * max_P_Q, P * Q * Q));
+          break;
       }
-      CeedInt grid = (num_elem + num_t_col - 1) / num_t_col;
-      void *args[] = {&impl->d_interp_1d, &du, &u_elem_stride, &u_comp_stride, &dv, &v_elem_stride, &v_comp_stride, &num_elem};
+      CeedInt grid = CeedDivUpInt(num_elem, num_t_col);
+      void *args[] = {&impl->d_interp_1d, &d_u, &u_elem_stride, &u_comp_stride, &d_v, &v_elem_stride, &v_comp_stride, &num_elem};

       if (t_mode == CEED_TRANSPOSE) {
-        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_interp_tr, grid, num_threads, num_t_col, 1, shared_mem, args));
+        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->InterpTranspose, grid, num_threads, num_t_col, 1, shared_mem, args));
       } else {
-        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_interp, grid, num_threads, num_t_col, 1, shared_mem, args));
+        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Interp, grid, num_threads, num_t_col, 1, shared_mem, args));
       }
     } break;
     case CEED_EVAL_GRAD: {
-      CeedInt P = P_1d, Q = Q_1d;
-
-      // In CEED_NOTRANSPOSE mode:
-      //   du is (P^dim x nc), column-major layout (nc = num_comp)
-      //   dv is (Q^dim x nc x dim), column-major layout (nc = num_comp)
-      // In CEED_TRANSPOSE mode, the sizes of du and dv are switched.
-      if (t_mode == CEED_TRANSPOSE) {
-        P = Q_1d;
-        Q = P_1d;
-      }
-
       // Define element sizes for dofs/quad
       CeedInt elem_qpts_size = CeedIntPow(Q_1d, dim);
       CeedInt elem_dofs_size = CeedIntPow(P_1d, dim);

+      // In CEED_NOTRANSPOSE mode:
+      //   d_u is (P^dim x nc), column-major layout (nc = num_comp)
+      //   d_v is (Q^dim x nc x dim), column-major layout (nc = num_comp)
+      // In CEED_TRANSPOSE mode, the sizes of d_u and d_v are switched.
+
       // E-vector ordering -------------- Q-vector ordering
       //                                   dim
       //  component                        component
@@ -158,7 +150,7 @@
       //      node                            node

       // --- Define strides for NOTRANSPOSE mode: ---
-      // Input (du) is E-vector, output (dv) is Q-vector
+      // Input (d_u) is E-vector, output (d_v) is Q-vector

       // Element strides
       CeedInt u_elem_stride = elem_dofs_size;
@@ -169,10 +161,8 @@
       // Dimension strides
       CeedInt u_dim_stride = 0;
       CeedInt v_dim_stride = num_elem * elem_qpts_size * num_comp;
-
-      // --- Swap strides for TRANSPOSE mode: ---
       if (t_mode == CEED_TRANSPOSE) {
-        // Input (du) is Q-vector, output (dv) is E-vector
+        // Input (d_u) is Q-vector, output (d_v) is E-vector
         // Element strides
         v_elem_stride = elem_dofs_size;
         u_elem_stride = elem_qpts_size;
@@ -198,30 +188,30 @@
         case 2:
           num_threads = max_P_Q;
           num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_2D);
-          shared_mem += sizeof(CeedScalar) * 2 * P * Q;                  // for sTinterp and sTgrad
-          shared_mem += sizeof(CeedScalar) * num_t_col * (P * max_P_Q);  // for reforming rU we need PxP, and for the intermediate output we need PxQ
+          shared_mem += sizeof(CeedScalar) * 2 * P * Q;  // for sTinterp and sTgrad
+          // for reforming rU we need P x P, and for the intermediate output we need P x Q
+          shared_mem += sizeof(CeedScalar) * num_t_col * (P * max_P_Q);
           break;
         case 3:
           num_threads = max_P_Q * max_P_Q;
           num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_3D);
           shared_mem += sizeof(CeedScalar) * 2 * P * Q;  // for sTinterp and sTgrad
-          shared_mem += sizeof(CeedScalar) * num_t_col *
-                        CeedIntMax(P * P * P,
-                                   (P * P * Q) + (P * Q * Q));  // rU needs P^2xP, the intermediate outputs need (P^2.Q + P.Q^2)
+          // rU needs P^2 x P, the intermediate outputs need (P^2 x Q + P x Q^2)
+          shared_mem += sizeof(CeedScalar) * num_t_col * CeedIntMax(P * P * P, (P * P * Q) + (P * Q * Q));
+          break;
       }
-      CeedInt grid = (num_elem + num_t_col - 1) / num_t_col;
-      void *args[] = {&impl->d_interp_1d, &impl->d_grad_1d, &du, &u_elem_stride, &u_comp_stride, &u_dim_stride, &dv,
+      CeedInt grid = CeedDivUpInt(num_elem, num_t_col);
+      void *args[] = {&impl->d_interp_1d, &impl->d_grad_1d, &d_u, &u_elem_stride, &u_comp_stride, &u_dim_stride, &d_v,
                       &v_elem_stride, &v_comp_stride, &v_dim_stride, &num_elem};

       if (t_mode == CEED_TRANSPOSE) {
-        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_grad_tr, grid, num_threads, num_t_col, 1, shared_mem, args));
+        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->GradTranspose, grid, num_threads, num_t_col, 1, shared_mem, args));
       } else {
-        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_grad, grid, num_threads, num_t_col, 1, shared_mem, args));
+        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Grad, grid, num_threads, num_t_col, 1, shared_mem, args));
       }
     } break;
     case CEED_EVAL_WEIGHT: {
-      CeedCheck(t_mode != CEED_TRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT inum_compatible with CEED_TRANSPOSE");
-      CeedInt Q = Q_1d;
+      CeedCheck(t_mode != CEED_TRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
       CeedInt elem_dofs_size = CeedIntPow(Q, dim);
       CeedInt num_threads = 1;
       CeedInt num_t_col = 1;
@@ -243,11 +233,12 @@
           num_threads = Q * Q;
           num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_3D);
           shared_mem += sizeof(CeedScalar) * Q;  // for d_q_weight_1d
+          break;
       }
-      CeedInt grid = (num_elem + num_t_col - 1) / num_t_col;
-      void *args[] = {&impl->d_q_weight_1d, &dv, &elem_dofs_size, &num_elem};
+      CeedInt grid = CeedDivUpInt(num_elem, num_t_col);
+      void *args[] = {&impl->d_q_weight_1d, &d_v, &elem_dofs_size, &num_elem};

-      CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_weight, grid, num_threads, num_t_col, 1, shared_mem, args));
+      CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Weight, grid, num_threads, num_t_col, 1, shared_mem, args));
     } break;
     // LCOV_EXCL_START
     case CEED_EVAL_DIV:
@@ -259,400 +250,370 @@
       // LCOV_EXCL_STOP
   }

-  // must sync to ensure completeness
+  // Must sync to ensure completeness
   ceed_magma_queue_sync(data->queue);

+  // Restore vectors
   if (e_mode != CEED_EVAL_WEIGHT) {
-    CeedCallBackend(CeedVectorRestoreArrayRead(U, &du));
+    CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
   }
-  CeedCallBackend(CeedVectorRestoreArray(V, &dv));
+  CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
   return CEED_ERROR_SUCCESS;
 }

-#ifdef __cplusplus
-CEED_INTERN "C"
-#endif
-    int
-    CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector U, CeedVector V) {
+//------------------------------------------------------------------------------
+// Basis apply - non-tensor
+//------------------------------------------------------------------------------
+static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u,
+                                         CeedVector v) {
   Ceed ceed;
   Ceed_Magma *data;
-  CeedInt dim, num_comp, num_dof, num_qpts, NB = 1;
-  const CeedScalar *du;
-  CeedScalar *dv;
+  CeedInt dim, num_comp, num_nodes, num_qpts, P, Q, N;
+  const CeedScalar *d_u;
+  CeedScalar *d_v;
   CeedBasisNonTensor_Magma *impl;
-  CeedMagmaFunction *interp, *grad;

   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
   CeedCallBackend(CeedGetData(ceed, &data));
-  magma_int_t arch = magma_getdevice_arch();
-
+  CeedCallBackend(CeedBasisGetData(basis, &impl));
   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-  CeedCallBackend(CeedBasisGetNumNodes(basis, &num_dof));
+  CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes));
   CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts));
-  CeedInt P = num_dof, Q = num_qpts, N = num_elem * num_comp;
+  P = num_nodes;
+  Q = num_qpts;
+  N = num_elem * num_comp;

-  if (U != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du));
+  // Read vectors
+  if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
   else CeedCheck(e_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &dv));
-
-  CeedCallBackend(CeedBasisGetData(basis, &impl));
-
-  CeedDebug256(ceed, 4, "[CeedBasisApplyNonTensor_Magma] vsize=%" CeedInt_FMT ", comp = %" CeedInt_FMT, num_comp * num_dof, num_comp);
+  CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));

+  // Clear v for transpose operation
   if (t_mode == CEED_TRANSPOSE) {
     CeedSize length;

-    CeedCallBackend(CeedVectorGetLength(V, &length));
+    CeedCallBackend(CeedVectorGetLength(v, &length));
     if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
-      magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *)dv, length, data->queue);
+      magmablas_slaset(MagmaFull, length, 1, 0.0, 0.0, (float *)d_v, length, data->queue);
     } else {
-      magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *)dv, length, data->queue);
+      magmablas_dlaset(MagmaFull, length, 1, 0.0, 0.0, (double *)d_v, length, data->queue);
     }
     ceed_magma_queue_sync(data->queue);
   }

-  CeedInt n_array[MAGMA_NONTENSOR_KERNEL_INSTANCES] = {MAGMA_NONTENSOR_N_VALUES};
-  CeedInt iN = 0;
-  CeedInt diff = abs(n_array[iN] - N);
+  // Apply basis operation
+  if (e_mode != CEED_EVAL_WEIGHT) {
+    if (P < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) {
+      CeedInt n_array[MAGMA_NONTENSOR_KERNEL_INSTANCES] = {MAGMA_NONTENSOR_KERNEL_N_VALUES};
+      CeedInt iN = 0, diff = abs(n_array[iN] - N), idiff;
+      CeedInt M = (t_mode == CEED_TRANSPOSE) ? P : Q, K = (t_mode == CEED_TRANSPOSE) ? Q : P;
+
+      for (CeedInt in = iN + 1; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
+        idiff = abs(n_array[in] - N);
+        if (idiff < diff) {
+          iN = in;
+          diff = idiff;
+        }
+      }

-  for (CeedInt in = iN + 1; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
-    CeedInt idiff = abs(n_array[in] - N);
-    if (idiff < diff) {
-      iN = in;
-      diff = idiff;
-    }
-  }
+      // Compile kernels for N as needed
+      if (!impl->NB_interp[iN]) {
+        Ceed ceed_delegate;
+        char *interp_kernel_path, *grad_kernel_path, *basis_kernel_source;
+        magma_int_t arch = magma_getdevice_arch();
+
+        // Tuning parameters for NB
+        impl->NB_interp[iN] = nontensor_rtc_get_nb(arch, 'n', 1, P, Q, n_array[iN]);
+        impl->NB_interp_t[iN] = nontensor_rtc_get_nb(arch, 't', 1, P, Q, n_array[iN]);
+        impl->NB_grad[iN] = nontensor_rtc_get_nb(arch, 'n', dim, P, Q, n_array[iN]);
+        impl->NB_grad_t[iN] = nontensor_rtc_get_nb(arch, 't', dim, P, Q, n_array[iN]);
+
+        // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data
+        CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate));
+
+        // Compile kernels
+        CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-interp-nontensor.h", &interp_kernel_path));
+        CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
+        CeedCallBackend(CeedLoadSourceToBuffer(ceed, interp_kernel_path, &basis_kernel_source));
+        CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-grad-nontensor.h", &grad_kernel_path));
+        CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, grad_kernel_path, &basis_kernel_source));
+        CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
+        CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module_interp[iN], 7, "BASIS_DIM", dim, "BASIS_P", P, "BASIS_Q",
+                                         Q, "BASIS_NB_INTERP_N", impl->NB_interp[iN], "BASIS_NB_INTERP_T", impl->NB_interp_t[iN], "BASIS_NB_GRAD_N",
+                                         impl->NB_grad[iN], "BASIS_NB_GRAD_T", impl->NB_grad_t[iN]));
+        CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_interp[iN], "magma_interp_nontensor_n", &impl->Interp[iN]));
+        CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_interp[iN], "magma_interp_nontensor_t", &impl->InterpTranspose[iN]));
+        CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_interp[iN], "magma_grad_nontensor_n", &impl->Grad[iN]));
+        CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_interp[iN], "magma_grad_nontensor_t", &impl->GradTranspose[iN]));
+        CeedCallBackend(CeedFree(&interp_kernel_path));
+        CeedCallBackend(CeedFree(&grad_kernel_path));
+        CeedCallBackend(CeedFree(&basis_kernel_source));
+      }

-  NB = nontensor_rtc_get_nb(arch, 'd', e_mode, t_mode, P, n_array[iN], Q);
-  interp = (t_mode == CEED_TRANSPOSE) ? &impl->magma_interp_tr_nontensor[iN] : &impl->magma_interp_nontensor[iN];
-  grad = (t_mode == CEED_TRANSPOSE) ? &impl->magma_grad_tr_nontensor[iN] : &impl->magma_grad_nontensor[iN];
+      // Apply basis operation
+      CeedInt num_t_col = MAGMA_BASIS_NTCOL(M, MAGMA_MAXTHREADS_1D);
+      if (e_mode == CEED_EVAL_INTERP) {
+        CeedInt NB = (t_mode == CEED_TRANSPOSE) ? impl->NB_interp_t[iN] : impl->NB_interp[iN];
+        CeedInt grid = CeedDivUpInt(N, NB * num_t_col);
+        CeedInt shared_mem_A = (t_mode == CEED_TRANSPOSE) ? 0 : K * M * sizeof(CeedScalar);
+        CeedInt shared_mem_B = num_t_col * K * NB * sizeof(CeedScalar);
+        CeedInt shared_mem = (t_mode == CEED_TRANSPOSE) ? (shared_mem_A + shared_mem_B) : CeedIntMax(shared_mem_A, shared_mem_B);
+        void *args[] = {&N, &impl->d_interp, &P, &d_u, &K, &d_v, &M};

-  switch (e_mode) {
-    case CEED_EVAL_INTERP: {
-      CeedInt P = num_dof, Q = num_qpts;
-      if (P < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) {
-        CeedInt M = (t_mode == CEED_TRANSPOSE) ? P : Q;
-        CeedInt K = (t_mode == CEED_TRANSPOSE) ? Q : P;
-        CeedInt num_t_col = MAGMA_NONTENSOR_BASIS_NTCOL(M);
-        CeedInt shared_mem = 0, shared_mem_A = 0, shared_mem_B = 0;
-        shared_mem_B += num_t_col * K * NB * sizeof(CeedScalar);
-        shared_mem_A += (t_mode == CEED_TRANSPOSE) ? 0 : K * M * sizeof(CeedScalar);
-        shared_mem = (t_mode == CEED_TRANSPOSE) ? (shared_mem_A + shared_mem_B) : CeedIntMax(shared_mem_A, shared_mem_B);
-
-        CeedInt grid = MAGMA_CEILDIV(MAGMA_CEILDIV(N, NB), num_t_col);
-        magma_trans_t trans_A = (t_mode == CEED_TRANSPOSE) ? MagmaNoTrans : MagmaTrans;
-        magma_trans_t trans_B = MagmaNoTrans;
-        CeedScalar alpha = 1.0, beta = 0.0;
-
-        void *args[] = {&trans_A, &trans_B, &N, &alpha, &impl->d_interp, &P, &du, &K, &beta, &dv, &M};
-        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, *interp, grid, M, num_t_col, 1, shared_mem, args));
-      } else {
         if (t_mode == CEED_TRANSPOSE) {
-          magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, num_elem * num_comp, Q, 1.0, impl->d_interp, P, du, Q, 0.0, dv, P, data->queue);
+          CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->InterpTranspose[iN], grid, M, num_t_col, 1, shared_mem, args));
         } else {
-          magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, num_elem * num_comp, P, 1.0, impl->d_interp, P, du, P, 0.0, dv, Q, data->queue);
+          CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Interp[iN], grid, M, num_t_col, 1, shared_mem, args));
         }
-      }
-    } break;
+      } else if (e_mode == CEED_EVAL_GRAD) {
+        CeedInt NB = (t_mode == CEED_TRANSPOSE) ? impl->NB_grad_t[iN] : impl->NB_grad[iN];
+        CeedInt grid = CeedDivUpInt(N, NB * num_t_col);
+        CeedInt shared_mem = num_t_col * K * NB * sizeof(CeedScalar) + ((t_mode == CEED_TRANSPOSE) ? 0 : K * M * sizeof(CeedScalar));
+        void *args[] = {&N, &impl->d_grad, &P, &d_u, &K, &d_v, &M};

-    case CEED_EVAL_GRAD: {
-      CeedInt P = num_dof, Q = num_qpts;
-      if (P < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) {
-        CeedInt M = (t_mode == CEED_TRANSPOSE) ? P : Q;
-        CeedInt K = (t_mode == CEED_TRANSPOSE) ? Q : P;
-        CeedInt num_t_col = MAGMA_NONTENSOR_BASIS_NTCOL(M);
-        CeedInt shared_mem = 0, shared_mem_A = 0, shared_mem_B = 0;
-        shared_mem_B += num_t_col * K * NB * sizeof(CeedScalar);
-        shared_mem_A += (t_mode == CEED_TRANSPOSE) ? 0 : K * M * sizeof(CeedScalar);
-        shared_mem = shared_mem_A + shared_mem_B;
-
-        CeedInt grid = MAGMA_CEILDIV(MAGMA_CEILDIV(N, NB), num_t_col);
-        magma_trans_t trans_A = (t_mode == CEED_TRANSPOSE) ? MagmaNoTrans : MagmaTrans;
-        magma_trans_t trans_B = MagmaNoTrans;
-
-        void *args[] = {&trans_A, &trans_B, &N, &impl->d_grad, &P, &du, &K, &dv, &M};
-        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, *grad, grid, M, num_t_col, 1, shared_mem, args));
+        if (t_mode == CEED_TRANSPOSE) {
+          CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->GradTranspose[iN], grid, M, num_t_col, 1, shared_mem, args));
+        } else {
+          CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Grad[iN], grid, M, num_t_col, 1, shared_mem, args));
+        }
       } else {
+        // LCOV_EXCL_START
+        return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV, CEED_EVAL_CURL not supported");
+        // LCOV_EXCL_STOP
+      }
+    } else {
+      if (e_mode == CEED_EVAL_INTERP) {
+        if (t_mode == CEED_TRANSPOSE) {
+          magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, N, Q, 1.0, impl->d_interp, P, d_u, Q, 0.0, d_v, P, data->queue);
+        } else {
+          magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, N, P, 1.0, impl->d_interp, P, d_u, P, 0.0, d_v, Q, data->queue);
+        }
+      } else if (e_mode == CEED_EVAL_GRAD) {
         if (t_mode == CEED_TRANSPOSE) {
-          CeedScalar beta = 0.0;
           for (int d = 0; d < dim; d++) {
-            if (d > 0) beta = 1.0;
-            magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, num_elem * num_comp, Q, 1.0, impl->d_grad + d * P * Q, P,
-                                 du + d * num_elem * num_comp * Q, Q, beta, dv, P, data->queue);
+            const CeedScalar beta = (d > 0) ? 1.0 : 0.0;
+            magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, N, Q, 1.0, impl->d_grad + d * P * Q, P, d_u + d * N * Q, Q, beta, d_v, P,
+                                 data->queue);
           }
         } else {
-          for (int d = 0; d < dim; d++)
-            magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, num_elem * num_comp, P, 1.0, impl->d_grad + d * P * Q, P, du, P, 0.0,
-                                 dv + d * num_elem * num_comp * Q, Q, data->queue);
+          for (int d = 0; d < dim; d++) {
+            magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, N, P, 1.0, impl->d_grad + d * P * Q, P, d_u, P, 0.0, d_v + d * N * Q, Q, data->queue);
+          }
         }
+      } else {
+        // LCOV_EXCL_START
+        return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV, CEED_EVAL_CURL not supported");
+        // LCOV_EXCL_STOP
       }
-    } break;
-
-    case CEED_EVAL_WEIGHT: {
-      CeedCheck(t_mode != CEED_TRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT inum_compatible with CEED_TRANSPOSE");
-
-      int elemsPerBlock = 1;  // basis->Q_1d < 7 ? optElems[basis->Q_1d] : 1;
-      int grid = num_elem / elemsPerBlock + ((num_elem / elemsPerBlock * elemsPerBlock < num_elem) ? 1 : 0);
-
-      magma_weight_nontensor(grid, num_qpts, num_elem, num_qpts, impl->d_q_weight, dv, data->queue);
-    } break;
-
-    // LCOV_EXCL_START
-    case CEED_EVAL_DIV:
-      return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
-    case CEED_EVAL_CURL:
-      return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
-    case CEED_EVAL_NONE:
-      return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context");
-    // LCOV_EXCL_STOP
+    }
+  } else {
+    CeedCheck(t_mode != CEED_TRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
+    CeedInt num_t_col = MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D);
+    CeedInt grid = CeedDivUpInt(num_elem, num_t_col);
+    CeedInt shared_mem = Q * sizeof(CeedScalar) + num_t_col * Q * sizeof(CeedScalar);
+    void *args[] = {&num_elem, &impl->d_q_weight, &d_v, &Q};
+
+    CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Weight, grid, Q, num_t_col, 1, shared_mem, args));
   }

-  // must sync to ensure completeness
+  // Must sync to ensure completeness
   ceed_magma_queue_sync(data->queue);

+  // Restore vectors
   if (e_mode != CEED_EVAL_WEIGHT) {
-    CeedCallBackend(CeedVectorRestoreArrayRead(U, &du));
+    CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
   }
-  CeedCallBackend(CeedVectorRestoreArray(V, &dv));
+  CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
   return CEED_ERROR_SUCCESS;
 }

-#ifdef __cplusplus
-CEED_INTERN "C"
-#endif
-    int
-    CeedBasisDestroy_Magma(CeedBasis basis) {
+//------------------------------------------------------------------------------
+// Destroy tensor basis
+//------------------------------------------------------------------------------
+static int CeedBasisDestroy_Magma(CeedBasis basis) {
   Ceed ceed;
   CeedBasis_Magma *impl;

-  CeedCallBackend(CeedBasisGetData(basis, &impl));
-  CeedCallBackend(magma_free(impl->d_q_ref_1d));
-  CeedCallBackend(magma_free(impl->d_interp_1d));
-  CeedCallBackend(magma_free(impl->d_grad_1d));
-  CeedCallBackend(magma_free(impl->d_q_weight_1d));
   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
+  CeedCallBackend(CeedBasisGetData(basis, &impl));
 #ifdef CEED_MAGMA_USE_HIP
   CeedCallHip(ceed, hipModuleUnload(impl->module));
 #else
   CeedCallCuda(ceed, cuModuleUnload(impl->module));
 #endif
+  CeedCallBackend(magma_free(impl->d_interp_1d));
+  CeedCallBackend(magma_free(impl->d_grad_1d));
+  CeedCallBackend(magma_free(impl->d_q_weight_1d));
   CeedCallBackend(CeedFree(&impl));
   return CEED_ERROR_SUCCESS;
 }

-#ifdef __cplusplus
-CEED_INTERN "C"
-#endif
-    int
-    CeedBasisDestroyNonTensor_Magma(CeedBasis basis) {
+//------------------------------------------------------------------------------
+// Destroy non-tensor basis
+//------------------------------------------------------------------------------
+static int CeedBasisDestroyNonTensor_Magma(CeedBasis basis) {
   Ceed ceed;
   CeedBasisNonTensor_Magma *impl;

-  CeedCallBackend(CeedBasisGetData(basis, &impl));
-  CeedCallBackend(magma_free(impl->d_q_ref));
-  CeedCallBackend(magma_free(impl->d_interp));
-  CeedCallBackend(magma_free(impl->d_grad));
-  CeedCallBackend(magma_free(impl->d_q_weight));
   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
+  CeedCallBackend(CeedBasisGetData(basis, &impl));
 #ifdef CEED_MAGMA_USE_HIP
-  for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
-    CeedCallHip(ceed, hipModuleUnload(impl->module[in]));
-  }
+  CeedCallHip(ceed, hipModuleUnload(impl->module_weight));
 #else
+  CeedCallCuda(ceed, cuModuleUnload(impl->module_weight));
+#endif
   for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
-    CeedCallCuda(ceed, cuModuleUnload(impl->module[in]));
-  }
+    if (impl->module_interp[in]) {
+#ifdef CEED_MAGMA_USE_HIP
+      CeedCallHip(ceed, hipModuleUnload(impl->module_interp[in]));
+#else
+      CeedCallCuda(ceed, cuModuleUnload(impl->module_interp[in]));
 #endif
+    }
+  }
+  CeedCallBackend(magma_free(impl->d_interp));
+  CeedCallBackend(magma_free(impl->d_grad));
+  CeedCallBackend(magma_free(impl->d_q_weight));
   CeedCallBackend(CeedFree(&impl));
   return CEED_ERROR_SUCCESS;
 }

-#ifdef __cplusplus
-CEED_INTERN "C"
-#endif
-    int
-    CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
+//------------------------------------------------------------------------------
+// Create tensor
+//------------------------------------------------------------------------------
+int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
                                   const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) {
   Ceed ceed, ceed_delegate;
   Ceed_Magma *data;
-  char *magma_common_path, *interp_path, *grad_path, *weight_path, *basis_kernel_source;
-  CeedInt num_comp = 0;
+  char *interp_kernel_path, *grad_kernel_path, *weight_kernel_path, *basis_kernel_source;
+  CeedInt num_comp;
   CeedBasis_Magma *impl;

-  CeedCallBackend(CeedCalloc(1, &impl));
   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
-
-  // Check for supported parameters
-  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
   CeedCallBackend(CeedGetData(ceed, &data));
+  CeedCallBackend(CeedCalloc(1, &impl));
+
+  // Copy basis data to GPU
+  CeedCallBackend(magma_malloc((void **)&impl->d_q_weight_1d, Q_1d * sizeof(q_weight_1d[0])));
+  magma_setvector(Q_1d, sizeof(q_weight_1d[0]), q_weight_1d, 1, impl->d_q_weight_1d, 1, data->queue);
+  CeedCallBackend(magma_malloc((void **)&impl->d_interp_1d, Q_1d * P_1d * sizeof(interp_1d[0])));
+  magma_setvector(Q_1d * P_1d, sizeof(interp_1d[0]), interp_1d, 1, impl->d_interp_1d, 1, data->queue);
+  CeedCallBackend(magma_malloc((void **)&impl->d_grad_1d, Q_1d * P_1d * sizeof(grad_1d[0])));
+  magma_setvector(Q_1d * P_1d, sizeof(grad_1d[0]), grad_1d, 1, impl->d_grad_1d, 1, data->queue);
+
+  // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data
+  CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate));

   // Compile kernels
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_defs.h", &magma_common_path));
+  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+  {
+    char *interp_kernel_name_base = "ceed/jit-source/magma/magma-basis-interp";
+    CeedInt interp_kernel_name_len = strlen(interp_kernel_name_base) + 6;
+    char interp_kernel_name[interp_kernel_name_len];
+
+    snprintf(interp_kernel_name, interp_kernel_name_len, "%s-%" CeedInt_FMT "d.h", interp_kernel_name_base, dim);
+    CeedCallBackend(CeedGetJitAbsolutePath(ceed, interp_kernel_name, &interp_kernel_path));
+  }
   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
-  CeedCallBackend(CeedLoadSourceToBuffer(ceed, magma_common_path, &basis_kernel_source));
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_tensor.h", &magma_common_path));
-  CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, magma_common_path, &basis_kernel_source));
-  char *interp_name_base = "ceed/jit-source/magma/interp";
-  CeedInt interp_name_len = strlen(interp_name_base) + 6;
-  char interp_name[interp_name_len];
-
-  snprintf(interp_name, interp_name_len, "%s-%" CeedInt_FMT "d.h", interp_name_base, dim);
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, interp_name, &interp_path));
-  CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, interp_path, &basis_kernel_source));
-  char *grad_name_base = "ceed/jit-source/magma/grad";
-  CeedInt grad_name_len = strlen(grad_name_base) + 6;
-  char grad_name[grad_name_len];
-
-  snprintf(grad_name, grad_name_len, "%s-%" CeedInt_FMT "d.h", grad_name_base, dim);
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, grad_name, &grad_path));
-  CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, grad_path, &basis_kernel_source));
-  char *weight_name_base = "ceed/jit-source/magma/weight";
-  CeedInt weight_name_len = strlen(weight_name_base) + 6;
-  char weight_name[weight_name_len];
-
-  snprintf(weight_name, weight_name_len, "%s-%" CeedInt_FMT "d.h", weight_name_base, dim);
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, weight_name, &weight_path));
-  CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, weight_path, &basis_kernel_source));
+  CeedCallBackend(CeedLoadSourceToBuffer(ceed, interp_kernel_path, &basis_kernel_source));
+  {
+    char *grad_kernel_name_base = "ceed/jit-source/magma/magma-basis-grad";
+    CeedInt grad_kernel_name_len = strlen(grad_kernel_name_base) + 6;
+    char grad_kernel_name[grad_kernel_name_len];
+
+    snprintf(grad_kernel_name, grad_kernel_name_len, "%s-%" CeedInt_FMT "d.h", grad_kernel_name_base, dim);
+    CeedCallBackend(CeedGetJitAbsolutePath(ceed, grad_kernel_name, &grad_kernel_path));
+  }
+  CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, grad_kernel_path, &basis_kernel_source));
+  {
+    char *weight_kernel_name_base = "ceed/jit-source/magma/magma-basis-weight";
+    CeedInt weight_kernel_name_len = strlen(weight_kernel_name_base) + 6;
+    char weight_kernel_name[weight_kernel_name_len];
+
+    snprintf(weight_kernel_name, weight_kernel_name_len, "%s-%" CeedInt_FMT "d.h", weight_kernel_name_base, dim);
+    CeedCallBackend(CeedGetJitAbsolutePath(ceed, weight_kernel_name, &weight_kernel_path));
+  }
+  CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, weight_kernel_path, &basis_kernel_source));
   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
-  // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip
-  // data
-  CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate));
-  CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module, 5, "DIM", dim, "NCOMP", num_comp, "P", P_1d, "Q", Q_1d, "MAXPQ",
-                                   CeedIntMax(P_1d, Q_1d)));
-
-  // Kernel setup
+  CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module, 5, "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_P",
+                                   P_1d, "BASIS_Q", Q_1d, "BASIS_MAX_P_Q", CeedIntMax(P_1d, Q_1d)));
   switch (dim) {
     case 1:
-      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_1d_kernel", &impl->magma_interp));
-      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_1d_kernel", &impl->magma_interp_tr));
-      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_1d_kernel", &impl->magma_grad));
-      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_1d_kernel", &impl->magma_grad_tr));
-      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_1d_kernel", &impl->magma_weight));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_1d_kernel", &impl->Interp));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_1d_kernel", &impl->InterpTranspose));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_1d_kernel", &impl->Grad));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_1d_kernel", &impl->GradTranspose));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_1d_kernel", &impl->Weight));
      break;
     case 2:
-      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_2d_kernel", &impl->magma_interp));
-      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_2d_kernel", &impl->magma_interp_tr));
-      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_2d_kernel", &impl->magma_grad));
-      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_2d_kernel", &impl->magma_grad_tr));
-      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_2d_kernel", &impl->magma_weight));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_2d_kernel", &impl->Interp));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_2d_kernel", &impl->InterpTranspose));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_2d_kernel", &impl->Grad));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_2d_kernel", &impl->GradTranspose));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_2d_kernel", &impl->Weight));
       break;
     case 3:
-      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_3d_kernel", &impl->magma_interp));
-      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_3d_kernel", &impl->magma_interp_tr));
-      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_3d_kernel", &impl->magma_grad));
-      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_3d_kernel", &impl->magma_grad_tr));
-      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_3d_kernel", &impl->magma_weight));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_3d_kernel", &impl->Interp));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_3d_kernel", &impl->InterpTranspose));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_3d_kernel", &impl->Grad));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_3d_kernel", &impl->GradTranspose));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_3d_kernel", &impl->Weight));
+      break;
   }
+  CeedCallBackend(CeedFree(&interp_kernel_path));
+  CeedCallBackend(CeedFree(&grad_kernel_path));
+  CeedCallBackend(CeedFree(&weight_kernel_path));
+  CeedCallBackend(CeedFree(&basis_kernel_source));
+
+  CeedCallBackend(CeedBasisSetData(basis, impl));

   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Magma));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Magma));
-
-  // Copy q_ref_1d to the GPU
-  CeedCallBackend(magma_malloc((void **)&impl->d_q_ref_1d, Q_1d * sizeof(q_ref_1d[0])));
-  magma_setvector(Q_1d, sizeof(q_ref_1d[0]), q_ref_1d, 1, impl->d_q_ref_1d, 1, data->queue);
-
-  // Copy interp_1d to the GPU
-  CeedCallBackend(magma_malloc((void **)&impl->d_interp_1d, Q_1d * P_1d * sizeof(interp_1d[0])));
-  magma_setvector(Q_1d * P_1d, sizeof(interp_1d[0]), interp_1d, 1, impl->d_interp_1d, 1, data->queue);
-
-  // Copy grad_1d to the GPU
-  CeedCallBackend(magma_malloc((void **)&impl->d_grad_1d, Q_1d * P_1d * sizeof(grad_1d[0])));
-  magma_setvector(Q_1d * P_1d, sizeof(grad_1d[0]), grad_1d, 1, impl->d_grad_1d, 1, data->queue);
-
-  // Copy q_weight_1d to the GPU
-  CeedCallBackend(magma_malloc((void **)&impl->d_q_weight_1d, Q_1d * sizeof(q_weight_1d[0])));
-  magma_setvector(Q_1d, sizeof(q_weight_1d[0]), q_weight_1d, 1, impl->d_q_weight_1d, 1, data->queue);
-
-  CeedCallBackend(CeedBasisSetData(basis, impl));
-  CeedCallBackend(CeedFree(&magma_common_path));
-  CeedCallBackend(CeedFree(&interp_path));
-  CeedCallBackend(CeedFree(&grad_path));
-  CeedCallBackend(CeedFree(&weight_path));
-  CeedCallBackend(CeedFree(&basis_kernel_source));
   return CEED_ERROR_SUCCESS;
 }

-#ifdef __cplusplus
-CEED_INTERN "C"
-#endif
-    int
-    CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_dof, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad,
+//------------------------------------------------------------------------------
+// Create non-tensor H^1
+//------------------------------------------------------------------------------
+int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad,
                             const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
   Ceed ceed, ceed_delegate;
   Ceed_Magma *data;
-  char *magma_common_path, *interp_path, *grad_path, *basis_kernel_source;
+  char *weight_kernel_path, *basis_kernel_source;
   CeedBasisNonTensor_Magma *impl;

   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
   CeedCallBackend(CeedGetData(ceed, &data));
-  magma_int_t arch = magma_getdevice_arch();
-
   CeedCallBackend(CeedCalloc(1, &impl));

-  // Compile kernels
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_defs.h", &magma_common_path));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
-  CeedCallBackend(CeedLoadSourceToBuffer(ceed, magma_common_path, &basis_kernel_source));
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_nontensor.h", &magma_common_path));
-  CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, magma_common_path, &basis_kernel_source));
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/interp-nontensor.h", &interp_path));
-  CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, interp_path, &basis_kernel_source));
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/grad-nontensor.h", &grad_path));
-  CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, grad_path, &basis_kernel_source));
-
-  // tuning parameters for nb
-  CeedInt nb_interp_n[MAGMA_NONTENSOR_KERNEL_INSTANCES];
-  CeedInt nb_interp_t[MAGMA_NONTENSOR_KERNEL_INSTANCES];
-  CeedInt nb_grad_n[MAGMA_NONTENSOR_KERNEL_INSTANCES];
-  CeedInt nb_grad_t[MAGMA_NONTENSOR_KERNEL_INSTANCES];
-  CeedInt P = num_dof, Q = num_qpts;
-  CeedInt n_array[MAGMA_NONTENSOR_KERNEL_INSTANCES] = {MAGMA_NONTENSOR_N_VALUES};
-
-  for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
-    nb_interp_n[in] = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_INTERP, CEED_NOTRANSPOSE, P, n_array[in], Q);
-    nb_interp_t[in] = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_INTERP, CEED_TRANSPOSE, P, n_array[in], Q);
-    nb_grad_n[in] = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_GRAD, CEED_NOTRANSPOSE, P, n_array[in], Q);
-    nb_grad_t[in] = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_GRAD, CEED_TRANSPOSE, P, n_array[in], Q);
-  }
+  // Copy basis data to GPU
+  CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0])));
+  magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue);
+  CeedCallBackend(magma_malloc((void **)&impl->d_interp, num_qpts * num_nodes * sizeof(interp[0])));
+  magma_setvector(num_qpts * num_nodes, sizeof(interp[0]), interp, 1, impl->d_interp, 1, data->queue);
+  CeedCallBackend(magma_malloc((void **)&impl->d_grad, num_qpts * num_nodes * dim * sizeof(grad[0])));
+  magma_setvector(num_qpts * num_nodes * dim, sizeof(grad[0]), grad, 1, impl->d_grad, 1, data->queue);

-  // compile
+  // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data
   CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate));
-  for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
-    CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module[in], 7, "DIM", dim, "P", P, "Q", Q, "NB_INTERP_N",
-                                     nb_interp_n[in], "NB_INTERP_T", nb_interp_t[in], "NB_GRAD_N", nb_grad_n[in], "NB_GRAD_T", nb_grad_t[in]));
-  }

-  // get kernels
-  for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) {
-    CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[in], "magma_interp_nontensor_n", &impl->magma_interp_nontensor[in]));
-    CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[in], "magma_interp_nontensor_t", &impl->magma_interp_tr_nontensor[in]));
-    CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[in], "magma_grad_nontensor_n", &impl->magma_grad_nontensor[in]));
-    CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[in], "magma_grad_nontensor_t", &impl->magma_grad_tr_nontensor[in]));
-  }
+  // Compile weight kernel (the remainder of kernel compilation happens at first call to CeedBasisApply)
+  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-weight-nontensor.h", &weight_kernel_path));
+  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
+  CeedCallBackend(CeedLoadSourceToBuffer(ceed, weight_kernel_path, &basis_kernel_source));
+  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
+  CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module_weight, 1, "BASIS_Q", num_qpts));
+  CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_weight, "magma_weight_nontensor", &impl->Weight));
+  CeedCallBackend(CeedFree(&weight_kernel_path));
+  CeedCallBackend(CeedFree(&basis_kernel_source));
+
+  CeedCallBackend(CeedBasisSetData(basis, impl));

+  // Register backend functions
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma));
-
-  // Copy q_ref to the GPU
-  CeedCallBackend(magma_malloc((void **)&impl->d_q_ref, num_qpts * sizeof(q_ref[0])));
-  magma_setvector(num_qpts, sizeof(q_ref[0]), q_ref, 1, impl->d_q_ref, 1, data->queue);
-
-  // Copy interp to the GPU
-  CeedCallBackend(magma_malloc((void **)&impl->d_interp, num_qpts * num_dof * sizeof(interp[0])));
-  magma_setvector(num_qpts * num_dof, sizeof(interp[0]), interp, 1, impl->d_interp, 1, data->queue);
-
-  // Copy grad to the GPU
-  CeedCallBackend(magma_malloc((void **)&impl->d_grad, num_qpts * num_dof * dim * sizeof(grad[0])));
-  magma_setvector(num_qpts * num_dof * dim, sizeof(grad[0]), grad, 1, impl->d_grad, 1, data->queue);
-
-  // Copy q_weight to the GPU
-  CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0])));
-  magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue);
-
-  CeedCallBackend(CeedBasisSetData(basis, impl));
-  CeedCallBackend(CeedFree(&magma_common_path));
-  CeedCallBackend(CeedFree(&interp_path));
-  CeedCallBackend(CeedFree(&grad_path));
-  CeedCallBackend(CeedFree(&basis_kernel_source));
   return CEED_ERROR_SUCCESS;
 }
+
+//------------------------------------------------------------------------------
diff --git a/backends/magma/ceed-magma-common.c b/backends/magma/ceed-magma-common.c
index dcd7598913..e995d00cd3 100644
--- a/backends/magma/ceed-magma-common.c
+++ b/backends/magma/ceed-magma-common.c
@@ -16,10 +16,10 @@
 // Device information backend init
 //------------------------------------------------------------------------------
 int CeedInit_Magma_common(Ceed ceed, const char *resource) {
+  Ceed_Magma *data;
   const char *device_spec = strstr(resource, ":device_id=");
   const int device_id = (device_spec) ? atoi(device_spec + 11) : -1;
   int current_device_id;
-  Ceed_Magma *data;

   CeedCallBackend(magma_init());
@@ -28,6 +28,7 @@ int CeedInit_Magma_common(Ceed ceed, const char *resource) {
     magma_setdevice(device_id);
     current_device_id = device_id;
   }
+  CeedCallBackend(CeedGetData(ceed, &data));
   data->device_id = current_device_id;

 #ifdef CEED_MAGMA_USE_HIP
diff --git a/backends/magma/magma_gemm_nontensor.c b/backends/magma/ceed-magma-gemm-nontensor.cpp
similarity index 76%
rename from backends/magma/magma_gemm_nontensor.c
rename to backends/magma/ceed-magma-gemm-nontensor.cpp
index e3b600ae49..257f23ab1d 100644
--- a/backends/magma/magma_gemm_nontensor.c
+++ b/backends/magma/ceed-magma-gemm-nontensor.cpp
@@ -5,7 +5,8 @@
 //
 // This file is part of CEED: http://github.com/ceed

-#include "ceed-magma.h"
+#include "ceed-magma-gemm-nontensor.h"
+#include "ceed-magma-gemm-selector.h"

 #ifdef CEED_MAGMA_USE_HIP
 #define devblasDgemmStridedBatched hipblasDgemmStridedBatched
@@ -20,9 +21,9 @@
 #endif

 ////////////////////////////////////////////////////////////////////////////////
-static int magmablas_gemm(magma_trans_t trans_A, magma_trans_t trans_B, magma_int_t m, magma_int_t n, magma_int_t k, CeedScalar alpha,
-                          const CeedScalar *d_A, magma_int_t ldda, const CeedScalar *d_B, magma_int_t lddb, CeedScalar beta, CeedScalar *d_C,
-                          magma_int_t lddc, magma_queue_t queue) {
+static inline int magmablas_gemm(magma_trans_t trans_A, magma_trans_t trans_B, magma_int_t m, magma_int_t n, magma_int_t k, CeedScalar alpha,
+                                 const CeedScalar *d_A, magma_int_t ldda, const CeedScalar *d_B, magma_int_t lddb, CeedScalar beta, CeedScalar *d_C,
+                                 magma_int_t lddc, magma_queue_t queue) {
   if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
     magmablas_sgemm(trans_A, trans_B, m, n, k, (float)alpha, (const float *)d_A, ldda, (const float *)d_B, lddb, (float)beta, (float *)d_C, lddc,
                     queue);
@@ -34,10 +35,10 @@
 }

 ////////////////////////////////////////////////////////////////////////////////
-static int magmablas_gemm_batched_strided(magma_trans_t trans_A, magma_trans_t trans_B, magma_int_t m, magma_int_t n, magma_int_t k, CeedScalar alpha,
-                                          const CeedScalar *d_A, magma_int_t ldda, magma_int_t strideA, const CeedScalar *d_B, magma_int_t lddb,
-                                          magma_int_t strideB, CeedScalar beta, CeedScalar *d_C, magma_int_t lddc, magma_int_t strideC,
-                                          magma_int_t batchCount, magma_queue_t queue) {
+static inline int magmablas_gemm_batched_strided(magma_trans_t trans_A, magma_trans_t trans_B, magma_int_t m, magma_int_t n, magma_int_t k,
+                                                 CeedScalar alpha, const CeedScalar *d_A, magma_int_t ldda, magma_int_t strideA,
+                                                 const CeedScalar *d_B, magma_int_t lddb, magma_int_t strideB, CeedScalar beta, CeedScalar *d_C,
+                                                 magma_int_t lddc, magma_int_t strideC, magma_int_t batchCount, magma_queue_t queue) {
   if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
     magmablas_sgemm_batched_strided(trans_A, trans_B, m, n, k, (float)alpha, (const float *)d_A, ldda, strideA, (const float *)d_B, lddb, strideB,
                                     (float)beta, (float *)d_C, lddc, strideC, batchCount, queue);
@@ -49,9 +50,9 @@
 }

 ////////////////////////////////////////////////////////////////////////////////
-static int devblas_gemm(magma_trans_t trans_A, magma_trans_t trans_B, magma_int_t m, magma_int_t n, magma_int_t k, CeedScalar alpha,
-                        const CeedScalar *d_A, magma_int_t ldda, const CeedScalar *d_B, magma_int_t lddb, CeedScalar beta, CeedScalar *d_C,
-                        magma_int_t lddc, magma_queue_t queue) {
+static inline int devblas_gemm(magma_trans_t trans_A, magma_trans_t trans_B, magma_int_t m, magma_int_t n, magma_int_t k, CeedScalar alpha,
+                               const CeedScalar *d_A, magma_int_t ldda, const CeedScalar *d_B, magma_int_t lddb, CeedScalar beta, CeedScalar *d_C,
+                               magma_int_t lddc, magma_queue_t queue) {
   if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
     magma_sgemm(trans_A, trans_B, m, n, k, (float)alpha, (const float *)d_A, ldda, (const float *)d_B, lddb, (float)beta, (float *)d_C, lddc, queue);
   } else {
@@ -62,10 +63,10 @@
 }

 ////////////////////////////////////////////////////////////////////////////////
-static int devblas_gemm_batched_strided(magma_trans_t trans_A, magma_trans_t trans_B, magma_int_t m, magma_int_t n, magma_int_t k, CeedScalar alpha,
-                                        const CeedScalar *d_A, magma_int_t ldda, magma_int_t strideA, const CeedScalar *d_B, magma_int_t lddb,
-                                        magma_int_t strideB, CeedScalar beta, CeedScalar *d_C, magma_int_t lddc, magma_int_t strideC,
-                                        magma_int_t batchCount, magma_queue_t queue) {
+static inline int devblas_gemm_batched_strided(magma_trans_t trans_A, magma_trans_t trans_B, magma_int_t m, magma_int_t n, magma_int_t k,
+                                               CeedScalar alpha, const CeedScalar *d_A, magma_int_t ldda, magma_int_t strideA, const CeedScalar *d_B,
+                                               magma_int_t lddb, magma_int_t strideB, CeedScalar beta, CeedScalar *d_C, magma_int_t lddc,
+                                               magma_int_t strideC, magma_int_t batchCount, magma_queue_t queue) {
   if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
     devblasSgemmStridedBatched(magma_queue_get_devblas_handle(queue), devblas_trans_const(trans_A), devblas_trans_const(trans_B), (int)m, (int)n,
                                (int)k, (const float *)&alpha, (const float *)d_A, (int)ldda, strideA, (const float *)d_B, (int)lddb, strideB,
diff --git a/backends/magma/ceed-magma-gemm-nontensor.h b/backends/magma/ceed-magma-gemm-nontensor.h
new file mode 100644
index 0000000000..d7e83a5fa3
--- /dev/null
+++ b/backends/magma/ceed-magma-gemm-nontensor.h
@@ -0,0 +1,18 @@
+// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED: http://github.com/ceed
+
+#ifndef CEED_MAGMA_GEMM_NONTENSOR_H
+#define CEED_MAGMA_GEMM_NONTENSOR_H
+
+#include "ceed-magma.h"
+
+////////////////////////////////////////////////////////////////////////////////
+CEED_INTERN int magma_gemm_nontensor(magma_trans_t trans_A, magma_trans_t trans_B, magma_int_t m, magma_int_t n, magma_int_t k, CeedScalar alpha,
+                                     const CeedScalar *d_A, magma_int_t ldda, const CeedScalar *d_B, magma_int_t lddb, CeedScalar beta,
+                                     CeedScalar *d_C, magma_int_t lddc, magma_queue_t queue);
+
+#endif  // CEED_MAGMA_GEMM_NONTENSOR_H
diff --git a/backends/magma/ceed-magma-gemm-selector.cpp b/backends/magma/ceed-magma-gemm-selector.cpp
new file mode 100644
index 0000000000..6f631ef987
--- /dev/null
+++ b/backends/magma/ceed-magma-gemm-selector.cpp
@@ -0,0 +1,139 @@
+// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include +#include +#include + +#include "ceed-magma-gemm-selector.h" + +#include "tuning/indices.h" +#ifdef CEED_MAGMA_USE_HIP +#include "tuning/mi100.h" +#include "tuning/mi250x.h" +#include "tuning/mi250x_grad_rtc.h" +#include "tuning/mi250x_interp_rtc.h" +#else +#include "tuning/a100.h" +#include "tuning/a100_grad_rtc.h" +#include "tuning/a100_interp_rtc.h" +#include "tuning/v100.h" +#endif + +//////////////////////////////////////////////////////////////////////////////// +#ifdef CEED_MAGMA_USE_HIP +static inline auto gemm_selector_get_data(int gpu_arch, char precision, char trans_A) -> decltype(dgemm_nn_mi250x) { + if (gpu_arch >= 910) { + // gfx90a or newer + return (precision == 's') ? ((trans_A == 'n') ? sgemm_nn_mi250x : sgemm_tn_mi250x) : ((trans_A == 'n') ? dgemm_nn_mi250x : dgemm_tn_mi250x); + } else { + // gfx908 or older + return (precision == 's') ? ((trans_A == 'n') ? sgemm_nn_mi100 : sgemm_tn_mi100) : ((trans_A == 'n') ? dgemm_nn_mi100 : dgemm_tn_mi100); + } +} +#else +static inline auto gemm_selector_get_data(int gpu_arch, char precision, char trans_A) -> decltype(dgemm_nn_a100) { + if (gpu_arch >= 800) { + // sm80 or newer + return (precision == 's') ? ((trans_A == 'n') ? sgemm_nn_a100 : sgemm_tn_a100) : ((trans_A == 'n') ? dgemm_nn_a100 : dgemm_tn_a100); + } else { + // sm70 or older + return (precision == 's') ? ((trans_A == 'n') ? sgemm_nn_v100 : sgemm_tn_v100) : ((trans_A == 'n') ? dgemm_nn_v100 : dgemm_tn_v100); + } +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +void gemm_selector(int gpu_arch, char precision, char trans_A, int m, int n, int k, int *n_batch, int *use_magma) { + const auto &data = gemm_selector_get_data(gpu_arch, precision, trans_A); + int ir = -1; + double norm = std::numeric_limits::max(); + + for (size_t i = 0; i < data.size(); i++) { + const int &im = data[i][M_INDEX]; + const int &in = data[i][N_INDEX]; + const int &ik = data[i][K_INDEX]; + + double mdiff = (double)(im - m); + double ndiff = (double)(in - n); + double kdiff = (double)(ik - k); + double nrm = mdiff * mdiff + ndiff * ndiff + kdiff * kdiff; + + if (nrm < norm) { + norm = nrm; + ir = i; + } + + if (im == m && in == n && ik == k) { + // The input (m, n, k) exactly matches a record in `data`, no need to search further + break; + } + } + + if (ir >= 0) { + // If the closest match indicates that n = n_batch, that means calling the regular non-batch GEMM. + // So n_batch is set to n instead of the 'n_batch' entry of the matching record. + int n_ = data[ir][N_INDEX]; + int n_batch_ = data[ir][N_BATCH_INDEX]; + *n_batch = (n_ == n_batch_) ? n : n_batch_; + *use_magma = data[ir][USE_MAGMA_INDEX]; + } else { + *n_batch = n; + *use_magma = 0; + } +} + +////////////////////////////////////////////////////////////////////////////// +#ifdef CEED_MAGMA_USE_HIP +static inline auto nontensor_rtc_get_data(int gpu_arch, char trans_A, int q_comp) -> decltype(dinterp_n_mi250x) { + if (q_comp == 1) { + return (trans_A == 'n') ? dinterp_n_mi250x : dinterp_t_mi250x; + } else { + return (trans_A == 'n') ? dgrad_n_mi250x : dgrad_t_mi250x; + } +} +#else +static inline auto nontensor_rtc_get_data(int gpu_arch, char trans_A, int q_comp) -> decltype(dinterp_n_a100) { + if (q_comp == 1) { + return (trans_A == 'n') ? dinterp_n_a100 : dinterp_t_a100; + } else { + return (trans_A == 'n') ? 
+
+//////////////////////////////////////////////////////////////////////////////
+#ifdef CEED_MAGMA_USE_HIP
+static inline auto nontensor_rtc_get_data(int gpu_arch, char trans_A, int q_comp) -> decltype(dinterp_n_mi250x) {
+  if (q_comp == 1) {
+    return (trans_A == 'n') ? dinterp_n_mi250x : dinterp_t_mi250x;
+  } else {
+    return (trans_A == 'n') ? dgrad_n_mi250x : dgrad_t_mi250x;
+  }
+}
+#else
+static inline auto nontensor_rtc_get_data(int gpu_arch, char trans_A, int q_comp) -> decltype(dinterp_n_a100) {
+  if (q_comp == 1) {
+    return (trans_A == 'n') ? dinterp_n_a100 : dinterp_t_a100;
+  } else {
+    return (trans_A == 'n') ? dgrad_n_a100 : dgrad_t_a100;
+  }
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+CeedInt nontensor_rtc_get_nb(int gpu_arch, char trans_A, int q_comp, int P, int Q, int n) {
+  const auto &data = nontensor_rtc_get_data(gpu_arch, trans_A, q_comp);
+  int ir = -1;
+  double norm = std::numeric_limits<double>::max();
+  CeedInt m = (trans_A == 'n') ? Q : P;
+  CeedInt k = (trans_A == 'n') ? P : Q;
+
+  for (size_t i = 0; i < data.size(); i++) {
+    const int &im = data[i][M_INDEX_RTC];
+    const int &in = data[i][N_INDEX_RTC];
+    const int &ik = data[i][K_INDEX_RTC];
+
+    double mdiff = (double)(im - m);
+    double ndiff = (double)(in - n);
+    double kdiff = (double)(ik - k);
+    double nrm = mdiff * mdiff + ndiff * ndiff + kdiff * kdiff;
+
+    if (nrm < norm) {
+      norm = nrm;
+      ir = i;
+    }
+
+    if (im == m && in == n && ik == k) {
+      // The input (m, n, k) exactly matches a record in `data`, no need to search further
+      break;
+    }
+  }
+
+  return (ir >= 0) ? data[ir][NB_INDEX_RTC] : 1;
+}
diff --git a/backends/magma/ceed-magma-gemm-selector.h b/backends/magma/ceed-magma-gemm-selector.h
new file mode 100644
index 0000000000..ce169b051b
--- /dev/null
+++ b/backends/magma/ceed-magma-gemm-selector.h
@@ -0,0 +1,19 @@
+// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED: http://github.com/ceed
+
+#ifndef CEED_MAGMA_GEMM_SELECTOR_H
+#define CEED_MAGMA_GEMM_SELECTOR_H
+
+#include "ceed-magma.h"
+
+////////////////////////////////////////////////////////////////////////////////
+CEED_INTERN void gemm_selector(int gpu_arch, char precision, char trans_A, int m, int n, int k, int *n_batch, int *use_magma);
+
+////////////////////////////////////////////////////////////////////////////////
+CEED_INTERN CeedInt nontensor_rtc_get_nb(int gpu_arch, char trans_A, int q_comp, int P, int Q, int n);
+
+#endif // CEED_MAGMA_GEMM_SELECTOR_H
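Note that both searches now compare squared Euclidean distances: since x -> x*x is monotone for non-negative x, the nearest record under the squared norm is the same as under the old sqrt-based norm, and the sqrt call is saved. With both entry points declared in the header above, a hedged sketch of the intended use (gpu_arch and num_elem are the caller's; the JIT step is paraphrased, not quoted from the patch):

  // Blocking factor for a non-tensor interp kernel (q_comp == 1); a grad kernel
  // would pass its number of quadrature components instead.
  CeedInt nb = nontensor_rtc_get_nb(gpu_arch, 'n', 1, P, Q, num_elem);
  // nb falls back to 1 when no tuning data matches; otherwise it is presumably
  // baked into the RTC-compiled kernel instance for this (P, Q, num_elem) shape.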
diff --git a/backends/magma/ceed-magma.h b/backends/magma/ceed-magma.h
index fcba887fe3..b5495a61bb 100644
--- a/backends/magma/ceed-magma.h
+++ b/backends/magma/ceed-magma.h
@@ -16,24 +16,15 @@ #define MAGMA_MAXTHREADS_1D 128
 #define MAGMA_MAXTHREADS_2D 128
 #define MAGMA_MAXTHREADS_3D 64
-#define MAGMA_NONTENSOR_MAXTHREADS (128)
-// Define macro for determining number of threads in y-direction
-// for basis kernels
+// Define macro for determining number of threads in y-direction for basis kernels
 #define MAGMA_BASIS_NTCOL(x, maxt) (((maxt) < (x)) ? 1 : ((maxt) / (x)))
-#define MAGMA_NONTENSOR_BASIS_NTCOL(N) (CeedIntMax(1, (MAGMA_NONTENSOR_MAXTHREADS / (N))))
-#define MAGMA_CEILDIV(A, B) (((A) + (B)-1) / (B))
-#define MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P (40)
-#define MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q (40)
-
-// Define macro for computing the total threads in a block
-// for use with __launch_bounds__()
-#define MAGMA_BASIS_BOUNDS(x, maxt) (x * MAGMA_BASIS_NTCOL(x, maxt))
-
-// Define macro for non-tensor kernel instances
-#define MAGMA_NONTENSOR_KERNEL_INSTANCES (5)
-#define MAGMA_NONTENSOR_N_VALUES 10240, 51200, 102400, 512000, 1024000
+// Define macros for non-tensor kernel instances
+#define MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P 40
+#define MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q 40
+#define MAGMA_NONTENSOR_KERNEL_INSTANCES 5
+#define MAGMA_NONTENSOR_KERNEL_N_VALUES 10240, 51200, 102400, 512000, 1024000
 
 #ifdef CEED_MAGMA_USE_HIP
 typedef hipModule_t CeedMagmaModule;
@@ -55,48 +46,38 @@ typedef CUfunction CeedMagmaFunction;
 
 typedef struct {
   CeedMagmaModule module;
-  CeedMagmaFunction magma_interp;
-  CeedMagmaFunction magma_interp_tr;
-  CeedMagmaFunction magma_grad;
-  CeedMagmaFunction magma_grad_tr;
-  CeedMagmaFunction magma_weight;
-  CeedScalar *d_q_ref_1d;
+  CeedMagmaFunction Interp;
+  CeedMagmaFunction InterpTranspose;
+  CeedMagmaFunction Grad;
+  CeedMagmaFunction GradTranspose;
+  CeedMagmaFunction Weight;
   CeedScalar *d_interp_1d;
   CeedScalar *d_grad_1d;
   CeedScalar *d_q_weight_1d;
 } CeedBasis_Magma;
 
 typedef struct {
-  CeedMagmaModule module[MAGMA_NONTENSOR_KERNEL_INSTANCES];
-  CeedMagmaFunction magma_interp_nontensor[MAGMA_NONTENSOR_KERNEL_INSTANCES];
-  CeedMagmaFunction magma_interp_tr_nontensor[MAGMA_NONTENSOR_KERNEL_INSTANCES];
-  CeedMagmaFunction magma_grad_nontensor[MAGMA_NONTENSOR_KERNEL_INSTANCES];
-  CeedMagmaFunction magma_grad_tr_nontensor[MAGMA_NONTENSOR_KERNEL_INSTANCES];
-  CeedScalar *d_q_ref;
+  CeedMagmaModule module_weight, module_interp[MAGMA_NONTENSOR_KERNEL_INSTANCES];
+  CeedMagmaFunction Interp[MAGMA_NONTENSOR_KERNEL_INSTANCES];
+  CeedMagmaFunction InterpTranspose[MAGMA_NONTENSOR_KERNEL_INSTANCES];
+  CeedMagmaFunction Grad[MAGMA_NONTENSOR_KERNEL_INSTANCES];
+  CeedMagmaFunction GradTranspose[MAGMA_NONTENSOR_KERNEL_INSTANCES];
+  CeedMagmaFunction Weight;
+  CeedInt NB_interp[MAGMA_NONTENSOR_KERNEL_INSTANCES], NB_interp_t[MAGMA_NONTENSOR_KERNEL_INSTANCES];
+  CeedInt NB_grad[MAGMA_NONTENSOR_KERNEL_INSTANCES], NB_grad_t[MAGMA_NONTENSOR_KERNEL_INSTANCES];
   CeedScalar *d_interp;
   CeedScalar *d_grad;
   CeedScalar *d_q_weight;
 } CeedBasisNonTensor_Magma;
 
-CEED_INTERN void magma_weight_nontensor(magma_int_t grid, magma_int_t threads, magma_int_t num_elem, magma_int_t Q, CeedScalar *d_q_weight,
-                                        CeedScalar *d_v, magma_queue_t queue);
-
-CEED_INTERN int magma_gemm_nontensor(magma_trans_t trans_A, magma_trans_t trans_B, magma_int_t m, magma_int_t n, magma_int_t k, CeedScalar alpha,
-                                     const CeedScalar *d_A, magma_int_t ldda, const CeedScalar *d_B, magma_int_t lddb, CeedScalar beta,
-                                     CeedScalar *d_C, magma_int_t lddc, magma_queue_t queue);
-
-CEED_INTERN void gemm_selector(int gpu_arch, char precision, char trans_A, int m, int n, int k, int *n_batch, int *use_magma);
-
-CEED_INTERN CeedInt nontensor_rtc_get_nb(int gpu_arch, char precision, CeedEvalMode e_mode, CeedTransposeMode t_mode, int P_, int N, int Q_);
-
-CEED_INTERN magma_int_t magma_isdevptr(const void *A);
-
 CEED_INTERN int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
                                              const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis);
-CEED_INTERN int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_dof, CeedInt num_qpts, const CeedScalar *interp,
+CEED_INTERN int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
                                         const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis);
 
+CEED_INTERN magma_int_t magma_isdevptr(const void *);
+
 // Comment the line below to use the default magma_is_devptr function
 #define magma_is_devptr magma_isdevptr
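Before the deleted implementation below, a worked reading of the macros that remain in ceed-magma.h; the numbers are illustrative only:

  // MAGMA_BASIS_NTCOL(x, maxt) = (maxt < x) ? 1 : maxt / x   (integer division)
  //   MAGMA_BASIS_NTCOL(25, 128)  -> 5  thread columns: five elements share one block
  //   MAGMA_BASIS_NTCOL(130, 128) -> 1  thread column: x alone exceeds maxt
  // The five MAGMA_NONTENSOR_KERNEL_N_VALUES (10240 ... 1024000) pair one-to-one with
  // the MAGMA_NONTENSOR_KERNEL_INSTANCES-sized arrays (NB_interp, NB_grad, ...) in
  // CeedBasisNonTensor_Magma: presumably one compiled instance, and one blocking
  // factor, per representative problem size.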
diff --git a/backends/magma/gemm_selector.cpp b/backends/magma/gemm_selector.cpp
deleted file mode 100644
index ee45bb2f5c..0000000000
--- a/backends/magma/gemm_selector.cpp
+++ /dev/null
@@ -1,165 +0,0 @@
-#include
-#include
-
-#include
-#include
-#include
-
-#include "./gemm_tuning/indices.h"
-#include "ceed-magma.h"
-#ifdef CEED_MAGMA_USE_HIP
-#include "./gemm_tuning/mi100.h"
-#include "./gemm_tuning/mi250x.h"
-#include "./gemm_tuning/mi250x_grad_rtc.h"
-#include "./gemm_tuning/mi250x_interp_rtc.h"
-#else
-#include "./gemm_tuning/a100.h"
-#include "./gemm_tuning/a100_grad_rtc.h"
-#include "./gemm_tuning/a100_interp_rtc.h"
-#include "./gemm_tuning/v100.h"
-#endif
-
-////////////////////////////////////////////////////////////////////////////////
-static void *gemm_selector_get_data(int gpu_arch, char precision, char trans_A) {
-// a default
-#ifdef CEED_MAGMA_USE_HIP
-  void *data = (void *)&sgemm_nn_mi250x;
-#else
-  void *data = (void *)&sgemm_nn_a100;
-#endif
-
-#ifdef CEED_MAGMA_USE_HIP
-  if (gpu_arch >= 910) {
-    // gfx90a or newer
-    data = (precision == 's') ? ((trans_A == 'n') ? (void *)&sgemm_nn_mi250x : (void *)&sgemm_tn_mi250x)
-                              : ((trans_A == 'n') ? (void *)&dgemm_nn_mi250x : (void *)&dgemm_tn_mi250x);
-  } else {
-    // gfx908 or older
-    data = (precision == 's') ? ((trans_A == 'n') ? (void *)&sgemm_nn_mi100 : (void *)&sgemm_tn_mi100)
-                              : ((trans_A == 'n') ? (void *)&dgemm_nn_mi100 : (void *)&dgemm_tn_mi100);
-  }
-#else
-  if (gpu_arch >= 800) {
-    // sm80 or newer
-    data = (precision == 's') ? ((trans_A == 'n') ? (void *)&sgemm_nn_a100 : (void *)&sgemm_tn_a100)
-                              : ((trans_A == 'n') ? (void *)&dgemm_nn_a100 : (void *)&dgemm_tn_a100);
-  } else {
-    // sm70 or older
-    data = (precision == 's') ? ((trans_A == 'n') ? (void *)&sgemm_nn_v100 : (void *)&sgemm_tn_v100)
-                              : ((trans_A == 'n') ? (void *)&dgemm_nn_v100 : (void *)&dgemm_tn_v100);
-  }
-#endif
-
-  return data;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-void gemm_selector(int gpu_arch, char precision, char trans_A, int m, int n, int k, int *n_batch, int *use_magma) {
-  // defaults
-  *n_batch = n;
-  *use_magma = 0;
-  std::vector<std::array<int, RECORD_LENGTH> > *data = NULL;
-  data = (std::vector<std::array<int, RECORD_LENGTH> > *)gemm_selector_get_data(gpu_arch, precision, trans_A);
-
-  int ir = -1;
-  double norm = std::numeric_limits<double>::max();
-  for (size_t i = 0; i < data->size(); i++) {
-    int im = (*data)[i][M_INDEX];
-    int in = (*data)[i][N_INDEX];
-    int ik = (*data)[i][K_INDEX];
-
-    double mdiff = (double)(im - m);
-    double ndiff = (double)(in - n);
-    double kdiff = (double)(ik - k);
-
-    double nrm = sqrt(mdiff * mdiff + ndiff * ndiff + kdiff * kdiff);
-
-    if (nrm < norm) {
-      norm = nrm;
-      ir = i;
-    }
-
-    if (nrm == 0) {
-      // the input (m, n, k) exactly matches a record in `data`
-      // no need to search further
-      break;
-    }
-  }
-
-  if (ir >= 0) {
-    *use_magma = (*data)[ir][USE_MAGMA_INDEX];
-    // if the closest match indicates that n = n_batch,
-    // that means calling the regular non-batch gemm.
-    // So n_batch is set to n instead of the 'n_batch'
-    // entry of the matching record
-    int n_ = (*data)[ir][N_INDEX];
-    int n_batch_ = (*data)[ir][N_BATCH_INDEX];
-    *n_batch = (n_ == n_batch_) ? n : n_batch_;
-  }
-}
-
-////////////////////////////////////////////////////////////////////////////////
-static void *nontensor_rtc_get_data(int gpu_arch, char precision, CeedEvalMode e_mode, CeedTransposeMode t_mode) {
-// a default
-#ifdef CEED_MAGMA_USE_HIP
-  void *data = (void *)&dinterp_n_mi250x;
-#else
-  void *data = (void *)&dinterp_n_a100;
-#endif
-
-#ifdef CEED_MAGMA_USE_HIP
-  if (e_mode == CEED_EVAL_INTERP) {
-    data = (t_mode == CEED_TRANSPOSE) ? (void *)&dinterp_t_mi250x : (void *)&dinterp_n_mi250x;
-  } else if (e_mode == CEED_EVAL_GRAD) {
-    data = (t_mode == CEED_TRANSPOSE) ? (void *)&dgrad_t_mi250x : (void *)&dgrad_n_mi250x;
-  }
-#else
-  if (e_mode == CEED_EVAL_INTERP) {
-    data = (t_mode == CEED_TRANSPOSE) ? (void *)&dinterp_t_a100 : (void *)&dinterp_n_a100;
-  } else if (e_mode == CEED_EVAL_GRAD) {
-    data = (t_mode == CEED_TRANSPOSE) ? (void *)&dgrad_t_a100 : (void *)&dgrad_n_a100;
-  }
-#endif
-
-  return data;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-CeedInt nontensor_rtc_get_nb(int gpu_arch, char precision, CeedEvalMode e_mode, CeedTransposeMode t_mode, int P_, int N, int Q_) {
-  CeedInt P = (t_mode == CEED_TRANSPOSE) ? P_ : Q_;
-  CeedInt Q = (t_mode == CEED_TRANSPOSE) ? Q_ : P_;
-  CeedInt NB = 1;
-
-  std::vector<std::array<int, RECORD_LENGTH_RTC> > *data = NULL;
-  data = (std::vector<std::array<int, RECORD_LENGTH_RTC> > *)nontensor_rtc_get_data(gpu_arch, precision, e_mode, t_mode);
-
-  int ir = -1;
-  double norm = std::numeric_limits<double>::max();
-  for (size_t i = 0; i < data->size(); i++) {
-    int ip = (*data)[i][M_INDEX_RTC];
-    int in = (*data)[i][N_INDEX_RTC];
-    int iq = (*data)[i][K_INDEX_RTC];
-
-    double pdiff = (double)(ip - P);
-    double ndiff = (double)(in - N);
-    double qdiff = (double)(iq - Q);
-    double nrm = sqrt(pdiff * pdiff + ndiff * ndiff + qdiff * qdiff);
-
-    if (nrm < norm) {
-      norm = nrm;
-      ir = i;
-    }
-
-    if (nrm == 0) {
-      // the input (m, n, k) exactly matches a record in `data`
-      // no need to search further
-      break;
-    }
-  }
-
-  if (ir >= 0) {
-    NB = (*data)[ir][NB_INDEX_RTC];
-  }
-
-  return NB;
-}
diff --git a/backends/magma/kernels/common/weight.h b/backends/magma/kernels/common/weight.h
deleted file mode 100644
index 6bda3258e3..0000000000
--- a/backends/magma/kernels/common/weight.h
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED: http://github.com/ceed
-
-#ifndef CEED_MAGMA_WEIGHT_H
-#define CEED_MAGMA_WEIGHT_H
-
-#include
-
-#include "magma_v2.h"
-
-//////////////////////////////////////////////////////////////////////////////////////////
-static __global__ void magma_weight_nontensor_kernel(const CeedInt nelem, const CeedInt Q, const CeedScalar *__restrict__ qweight,
-                                                     CeedScalar *__restrict__ d_V) {
-  const int tid = threadIdx.x;
-  // TODO load qweight in shared memory if blockDim.z > 1?
-  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x * blockDim.z) {
-    d_V[elem * Q + tid] = qweight[tid];
-  }
-}
-
-#endif // CEED_MAGMA_WEIGHT_H
diff --git a/backends/magma/kernels/cuda/weight_generic.cu b/backends/magma/kernels/cuda/weight_generic.cu
deleted file mode 100644
index 567e3c3c02..0000000000
--- a/backends/magma/kernels/cuda/weight_generic.cu
+++ /dev/null
@@ -1,17 +0,0 @@
-// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED: http://github.com/ceed
-
-#include "../common/weight.h"
-
-//////////////////////////////////////////////////////////////////////////////////////////
-// NonTensor weight function
-extern "C" void
-magma_weight_nontensor(magma_int_t grid, magma_int_t threads, magma_int_t nelem, magma_int_t Q,
-                       CeedScalar *dqweight, CeedScalar *dv, magma_queue_t queue)
-{
-  magma_weight_nontensor_kernel<<<grid, threads, 0, magma_queue_get_cuda_stream(queue)>>>(nelem, Q, dqweight, dv);
-}
diff --git a/backends/magma/kernels/hip/weight_generic.hip.cpp b/backends/magma/kernels/hip/weight_generic.hip.cpp
deleted file mode 100644
index 24fd090239..0000000000
--- a/backends/magma/kernels/hip/weight_generic.hip.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#include "../common/weight.h" -#include "hip/hip_runtime.h" - -////////////////////////////////////////////////////////////////////////////////////////// -// NonTensor weight function -extern "C" void magma_weight_nontensor(magma_int_t grid, magma_int_t threads, magma_int_t nelem, magma_int_t Q, CeedScalar *dqweight, CeedScalar *dv, - magma_queue_t queue) { - hipLaunchKernelGGL(magma_weight_nontensor_kernel, dim3(grid), dim3(threads), 0, magma_queue_get_hip_stream(queue), nelem, Q, dqweight, dv); -} diff --git a/backends/magma/gemm_tuning/a100.h b/backends/magma/tuning/a100.h similarity index 100% rename from backends/magma/gemm_tuning/a100.h rename to backends/magma/tuning/a100.h diff --git a/backends/magma/gemm_tuning/a100_grad_rtc.h b/backends/magma/tuning/a100_grad_rtc.h similarity index 100% rename from backends/magma/gemm_tuning/a100_grad_rtc.h rename to backends/magma/tuning/a100_grad_rtc.h diff --git a/backends/magma/gemm_tuning/a100_interp_rtc.h b/backends/magma/tuning/a100_interp_rtc.h similarity index 100% rename from backends/magma/gemm_tuning/a100_interp_rtc.h rename to backends/magma/tuning/a100_interp_rtc.h diff --git a/backends/magma/gemm_tuning/indices.h b/backends/magma/tuning/indices.h similarity index 100% rename from backends/magma/gemm_tuning/indices.h rename to backends/magma/tuning/indices.h diff --git a/backends/magma/gemm_tuning/mi100.h b/backends/magma/tuning/mi100.h similarity index 100% rename from backends/magma/gemm_tuning/mi100.h rename to backends/magma/tuning/mi100.h diff --git a/backends/magma/gemm_tuning/mi250x.h b/backends/magma/tuning/mi250x.h similarity index 100% rename from backends/magma/gemm_tuning/mi250x.h rename to backends/magma/tuning/mi250x.h diff --git a/backends/magma/gemm_tuning/mi250x_grad_rtc.h b/backends/magma/tuning/mi250x_grad_rtc.h similarity index 100% rename from backends/magma/gemm_tuning/mi250x_grad_rtc.h rename to backends/magma/tuning/mi250x_grad_rtc.h diff --git a/backends/magma/gemm_tuning/mi250x_interp_rtc.h b/backends/magma/tuning/mi250x_interp_rtc.h similarity index 100% rename from backends/magma/gemm_tuning/mi250x_interp_rtc.h rename to backends/magma/tuning/mi250x_interp_rtc.h diff --git a/backends/magma/gemm_tuning/v100.h b/backends/magma/tuning/v100.h similarity index 100% rename from backends/magma/gemm_tuning/v100.h rename to backends/magma/tuning/v100.h diff --git a/backends/magma/tuning/v100_rtc-b.h b/backends/magma/tuning/v100_rtc-b.h new file mode 100644 index 0000000000..51174bc4f0 --- /dev/null +++ b/backends/magma/tuning/v100_rtc-b.h @@ -0,0 +1,318 @@ +//////////////////////////////////////////////////////////////////////////////// +// auto-generated from data on v100 + +//////////////////////////////////////////////////////////////////////////////// +std::vector > drtc_t_v100 = { + {3, 1, 1024, 1, 1 }, + {3, 1, 1024, 2, 1 }, + {3, 1, 5120, 1, 1 }, + {3, 1, 5120, 2, 1 }, + {3, 1, 10240, 1, 1 }, + {3, 1, 10240, 2, 1 }, + {3, 1, 51200, 1, 1 }, + {3, 1, 51200, 2, 1 }, + {3, 1, 102400, 1, 1 }, + {3, 1, 102400, 2, 1 }, + {3, 1, 512000, 1, 1 }, + {3, 1, 512000, 2, 1 }, + {3, 1, 1024000, 1, 1 }, + {3, 1, 1024000, 2, 1 }, + {6, 3, 1024, 1, 1 }, + {6, 3, 1024, 2, 1 }, + {6, 3, 5120, 1, 1 }, + {6, 3, 5120, 2, 1 }, + {6, 3, 10240, 1, 1 }, + {6, 3, 10240, 2, 1 }, + {6, 3, 51200, 1, 1 }, + {6, 3, 51200, 2, 1 }, + {6, 3, 102400, 1, 1 }, + {6, 3, 102400, 2, 1 }, + {6, 3, 512000, 1, 1 }, + {6, 3, 512000, 2, 1 }, + {6, 
3, 1024000, 1, 1 }, + {6, 3, 1024000, 2, 1 }, + {10, 6, 1024, 1, 1 }, + {10, 6, 1024, 2, 1 }, + {10, 6, 5120, 1, 1 }, + {10, 6, 5120, 2, 1 }, + {10, 6, 10240, 1, 1 }, + {10, 6, 10240, 2, 1 }, + {10, 6, 51200, 1, 1 }, + {10, 6, 51200, 2, 1 }, + {10, 6, 102400, 1, 1 }, + {10, 6, 102400, 2, 1 }, + {10, 6, 512000, 1, 1 }, + {10, 6, 512000, 2, 1 }, + {10, 6, 1024000, 1, 1 }, + {10, 6, 1024000, 2, 2 }, + {15, 12, 1024, 1, 1 }, + {15, 12, 1024, 2, 1 }, + {15, 12, 5120, 1, 1 }, + {15, 12, 5120, 2, 1 }, + {15, 12, 10240, 1, 1 }, + {15, 12, 10240, 2, 1 }, + {15, 12, 51200, 1, 1 }, + {15, 12, 51200, 2, 5 }, + {15, 12, 102400, 1, 1 }, + {15, 12, 102400, 2, 9 }, + {15, 12, 512000, 1, 1 }, + {15, 12, 512000, 2, 9 }, + {15, 12, 1024000, 1, 1 }, + {15, 12, 1024000, 2, 9 }, + {21, 16, 1024, 1, 1 }, + {21, 16, 1024, 2, 1 }, + {21, 16, 5120, 1, 2 }, + {21, 16, 5120, 2, 6 }, + {21, 16, 10240, 1, 5 }, + {21, 16, 10240, 2, 5 }, + {21, 16, 51200, 1, 4 }, + {21, 16, 51200, 2, 4 }, + {21, 16, 102400, 1, 4 }, + {21, 16, 102400, 2, 8 }, + {21, 16, 512000, 1, 4 }, + {21, 16, 512000, 2, 8 }, + {21, 16, 1024000, 1, 4 }, + {21, 16, 1024000, 2, 8 }, + {28, 25, 1024, 1, 1 }, + {28, 25, 1024, 2, 1 }, + {28, 25, 5120, 1, 3 }, + {28, 25, 5120, 2, 4 }, + {28, 25, 10240, 1, 8 }, + {28, 25, 10240, 2, 5 }, + {28, 25, 51200, 1, 3 }, + {28, 25, 51200, 2, 9 }, + {28, 25, 102400, 1, 9 }, + {28, 25, 102400, 2, 9 }, + {28, 25, 512000, 1, 9 }, + {28, 25, 512000, 2, 9 }, + {28, 25, 1024000, 1, 9 }, + {28, 25, 1024000, 2, 9 }, + {36, 33, 1024, 1, 1 }, + {36, 33, 1024, 2, 1 }, + {36, 33, 5120, 1, 6 }, + {36, 33, 5120, 2, 4 }, + {36, 33, 10240, 1, 4 }, + {36, 33, 10240, 2, 4 }, + {36, 33, 51200, 1, 14}, + {36, 33, 51200, 2, 10}, + {36, 33, 102400, 1, 14}, + {36, 33, 102400, 2, 10}, + {36, 33, 512000, 1, 28}, + {36, 33, 512000, 2, 18}, + {36, 33, 1024000, 1, 22}, + {36, 33, 1024000, 2, 20}, + {4, 1, 1024, 1, 1 }, + {4, 1, 1024, 3, 1 }, + {4, 1, 5120, 1, 1 }, + {4, 1, 5120, 3, 1 }, + {4, 1, 10240, 1, 1 }, + {4, 1, 10240, 3, 2 }, + {4, 1, 51200, 1, 2 }, + {4, 1, 51200, 3, 2 }, + {4, 1, 102400, 1, 2 }, + {4, 1, 102400, 3, 2 }, + {4, 1, 512000, 1, 1 }, + {4, 1, 512000, 3, 2 }, + {4, 1, 1024000, 1, 1 }, + {4, 1, 1024000, 3, 2 }, + {10, 4, 1024, 1, 1 }, + {10, 4, 1024, 3, 2 }, + {10, 4, 5120, 1, 1 }, + {10, 4, 5120, 3, 1 }, + {10, 4, 10240, 1, 1 }, + {10, 4, 10240, 3, 1 }, + {10, 4, 51200, 1, 1 }, + {10, 4, 51200, 3, 2 }, + {10, 4, 102400, 1, 1 }, + {10, 4, 102400, 3, 2 }, + {10, 4, 512000, 1, 1 }, + {10, 4, 512000, 3, 2 }, + {10, 4, 1024000, 1, 1 }, + {10, 4, 1024000, 3, 2 }, + {20, 11, 1024, 1, 3 }, + {20, 11, 1024, 3, 3 }, + {20, 11, 5120, 1, 3 }, + {20, 11, 5120, 3, 2 }, + {20, 11, 10240, 1, 3 }, + {20, 11, 10240, 3, 3 }, + {20, 11, 51200, 1, 11}, + {20, 11, 51200, 3, 5 }, + {20, 11, 102400, 1, 2 }, + {20, 11, 102400, 3, 5 }, + {20, 11, 512000, 1, 2 }, + {20, 11, 512000, 3, 5 }, + {20, 11, 1024000, 1, 2 }, + {20, 11, 1024000, 3, 11}, + {35, 24, 1024, 1, 1 }, + {35, 24, 1024, 3, 1 }, + {35, 24, 5120, 1, 4 }, + {35, 24, 5120, 3, 5 }, + {35, 24, 10240, 1, 9 }, + {35, 24, 10240, 3, 5 }, + {35, 24, 51200, 1, 3 }, + {35, 24, 51200, 3, 9 }, + {35, 24, 102400, 1, 3 }, + {35, 24, 102400, 3, 9 }, + {35, 24, 512000, 1, 3 }, + {35, 24, 512000, 3, 9 }, + {35, 24, 1024000, 1, 3 }, + {35, 24, 1024000, 3, 9 } +}; + +//////////////////////////////////////////////////////////////////////////////// +std::vector > drtc_n_v100 = { + {3, 1, 1024, 1, 1 }, + {3, 1, 1024, 2, 1 }, + {3, 1, 5120, 1, 1 }, + {3, 1, 5120, 2, 1 }, + {3, 1, 10240, 1, 1 }, + {3, 1, 10240, 2, 
1 }, + {3, 1, 51200, 1, 1 }, + {3, 1, 51200, 2, 1 }, + {3, 1, 102400, 1, 1 }, + {3, 1, 102400, 2, 1 }, + {3, 1, 512000, 1, 1 }, + {3, 1, 512000, 2, 1 }, + {3, 1, 1024000, 1, 1 }, + {3, 1, 1024000, 2, 1 }, + {6, 3, 1024, 1, 1 }, + {6, 3, 1024, 2, 1 }, + {6, 3, 5120, 1, 1 }, + {6, 3, 5120, 2, 1 }, + {6, 3, 10240, 1, 1 }, + {6, 3, 10240, 2, 1 }, + {6, 3, 51200, 1, 1 }, + {6, 3, 51200, 2, 1 }, + {6, 3, 102400, 1, 1 }, + {6, 3, 102400, 2, 1 }, + {6, 3, 512000, 1, 1 }, + {6, 3, 512000, 2, 1 }, + {6, 3, 1024000, 1, 1 }, + {6, 3, 1024000, 2, 1 }, + {10, 6, 1024, 1, 1 }, + {10, 6, 1024, 2, 1 }, + {10, 6, 5120, 1, 1 }, + {10, 6, 5120, 2, 1 }, + {10, 6, 10240, 1, 1 }, + {10, 6, 10240, 2, 1 }, + {10, 6, 51200, 1, 1 }, + {10, 6, 51200, 2, 2 }, + {10, 6, 102400, 1, 2 }, + {10, 6, 102400, 2, 1 }, + {10, 6, 512000, 1, 1 }, + {10, 6, 512000, 2, 1 }, + {10, 6, 1024000, 1, 1 }, + {10, 6, 1024000, 2, 1 }, + {15, 12, 1024, 1, 1 }, + {15, 12, 1024, 2, 1 }, + {15, 12, 5120, 1, 1 }, + {15, 12, 5120, 2, 2 }, + {15, 12, 10240, 1, 2 }, + {15, 12, 10240, 2, 2 }, + {15, 12, 51200, 1, 3 }, + {15, 12, 51200, 2, 13}, + {15, 12, 102400, 1, 5 }, + {15, 12, 102400, 2, 5 }, + {15, 12, 512000, 1, 3 }, + {15, 12, 512000, 2, 5 }, + {15, 12, 1024000, 1, 3 }, + {15, 12, 1024000, 2, 5 }, + {21, 16, 1024, 1, 1 }, + {21, 16, 1024, 2, 4 }, + {21, 16, 5120, 1, 2 }, + {21, 16, 5120, 2, 2 }, + {21, 16, 10240, 1, 2 }, + {21, 16, 10240, 2, 4 }, + {21, 16, 51200, 1, 5 }, + {21, 16, 51200, 2, 6 }, + {21, 16, 102400, 1, 5 }, + {21, 16, 102400, 2, 5 }, + {21, 16, 512000, 1, 9 }, + {21, 16, 512000, 2, 5 }, + {21, 16, 1024000, 1, 3 }, + {21, 16, 1024000, 2, 11}, + {28, 25, 1024, 1, 6 }, + {28, 25, 1024, 2, 3 }, + {28, 25, 5120, 1, 5 }, + {28, 25, 5120, 2, 5 }, + {28, 25, 10240, 1, 9 }, + {28, 25, 10240, 2, 9 }, + {28, 25, 51200, 1, 6 }, + {28, 25, 51200, 2, 5 }, + {28, 25, 102400, 1, 6 }, + {28, 25, 102400, 2, 5 }, + {28, 25, 512000, 1, 6 }, + {28, 25, 512000, 2, 15}, + {28, 25, 1024000, 1, 6 }, + {28, 25, 1024000, 2, 9 }, + {36, 33, 1024, 1, 5 }, + {36, 33, 1024, 2, 6 }, + {36, 33, 5120, 1, 5 }, + {36, 33, 5120, 2, 11}, + {36, 33, 10240, 1, 9 }, + {36, 33, 10240, 2, 23}, + {36, 33, 51200, 1, 9 }, + {36, 33, 51200, 2, 13}, + {36, 33, 102400, 1, 9 }, + {36, 33, 102400, 2, 13}, + {36, 33, 512000, 1, 9 }, + {36, 33, 512000, 2, 13}, + {36, 33, 1024000, 1, 9 }, + {36, 33, 1024000, 2, 13}, + {4, 1, 1024, 1, 1 }, + {4, 1, 1024, 3, 1 }, + {4, 1, 5120, 1, 1 }, + {4, 1, 5120, 3, 1 }, + {4, 1, 10240, 1, 1 }, + {4, 1, 10240, 3, 1 }, + {4, 1, 51200, 1, 1 }, + {4, 1, 51200, 3, 1 }, + {4, 1, 102400, 1, 1 }, + {4, 1, 102400, 3, 1 }, + {4, 1, 512000, 1, 1 }, + {4, 1, 512000, 3, 1 }, + {4, 1, 1024000, 1, 1 }, + {4, 1, 1024000, 3, 1 }, + {10, 4, 1024, 1, 1 }, + {10, 4, 1024, 3, 1 }, + {10, 4, 5120, 1, 1 }, + {10, 4, 5120, 3, 1 }, + {10, 4, 10240, 1, 1 }, + {10, 4, 10240, 3, 1 }, + {10, 4, 51200, 1, 1 }, + {10, 4, 51200, 3, 5 }, + {10, 4, 102400, 1, 1 }, + {10, 4, 102400, 3, 1 }, + {10, 4, 512000, 1, 1 }, + {10, 4, 512000, 3, 1 }, + {10, 4, 1024000, 1, 1 }, + {10, 4, 1024000, 3, 1 }, + {20, 11, 1024, 1, 3 }, + {20, 11, 1024, 3, 3 }, + {20, 11, 5120, 1, 3 }, + {20, 11, 5120, 3, 6 }, + {20, 11, 10240, 1, 3 }, + {20, 11, 10240, 3, 3 }, + {20, 11, 51200, 1, 6 }, + {20, 11, 51200, 3, 5 }, + {20, 11, 102400, 1, 5 }, + {20, 11, 102400, 3, 5 }, + {20, 11, 512000, 1, 10}, + {20, 11, 512000, 3, 7 }, + {20, 11, 1024000, 1, 6 }, + {20, 11, 1024000, 3, 7 }, + {35, 24, 1024, 1, 1 }, + {35, 24, 1024, 3, 6 }, + {35, 24, 5120, 1, 5 }, + {35, 24, 5120, 3, 5 }, + {35, 24, 10240, 1, 
6 }, + {35, 24, 10240, 3, 9 }, + {35, 24, 51200, 1, 5 }, + {35, 24, 51200, 3, 6 }, + {35, 24, 102400, 1, 5 }, + {35, 24, 102400, 3, 10}, + {35, 24, 512000, 1, 5 }, + {35, 24, 512000, 3, 8 }, + {35, 24, 1024000, 1, 5 }, + {35, 24, 1024000, 3, 8 } +}; diff --git a/backends/magma/tuning/v100_rtc-new.h b/backends/magma/tuning/v100_rtc-new.h new file mode 100644 index 0000000000..f76b8e10ce --- /dev/null +++ b/backends/magma/tuning/v100_rtc-new.h @@ -0,0 +1,626 @@ +//////////////////////////////////////////////////////////////////////////////// +// auto-generated from data on v100 + +//////////////////////////////////////////////////////////////////////////////// +std::vector > drtc_t_v100 = { + {3, 1, 1024, 1, 1 }, + {3, 1, 1024, 2, 1 }, + {3, 1, 5120, 1, 1 }, + {3, 1, 5120, 2, 1 }, + {3, 1, 10240, 1, 1 }, + {3, 1, 10240, 2, 1 }, + {3, 1, 51200, 1, 1 }, + {3, 1, 51200, 2, 1 }, + {3, 1, 102400, 1, 1 }, + {3, 1, 102400, 2, 1 }, + {3, 1, 512000, 1, 1 }, + {3, 1, 512000, 2, 1 }, + {3, 1, 1024000, 1, 1 }, + {3, 1, 1024000, 2, 1 }, + {6, 3, 1024, 1, 1 }, + {6, 3, 1024, 2, 1 }, + {6, 3, 5120, 1, 1 }, + {6, 3, 5120, 2, 1 }, + {6, 3, 10240, 1, 1 }, + {6, 3, 10240, 2, 1 }, + {6, 3, 51200, 1, 1 }, + {6, 3, 51200, 2, 1 }, + {6, 3, 102400, 1, 1 }, + {6, 3, 102400, 2, 1 }, + {6, 3, 512000, 1, 1 }, + {6, 3, 512000, 2, 1 }, + {6, 3, 1024000, 1, 1 }, + {6, 3, 1024000, 2, 1 }, + {10, 6, 1024, 1, 1 }, + {10, 6, 1024, 2, 1 }, + {10, 6, 5120, 1, 1 }, + {10, 6, 5120, 2, 1 }, + {10, 6, 10240, 1, 1 }, + {10, 6, 10240, 2, 1 }, + {10, 6, 51200, 1, 1 }, + {10, 6, 51200, 2, 1 }, + {10, 6, 102400, 1, 1 }, + {10, 6, 102400, 2, 1 }, + {10, 6, 512000, 1, 1 }, + {10, 6, 512000, 2, 2 }, + {10, 6, 1024000, 1, 1 }, + {10, 6, 1024000, 2, 2 }, + {15, 12, 1024, 1, 1 }, + {15, 12, 1024, 2, 1 }, + {15, 12, 5120, 1, 1 }, + {15, 12, 5120, 2, 1 }, + {15, 12, 10240, 1, 1 }, + {15, 12, 10240, 2, 1 }, + {15, 12, 51200, 1, 1 }, + {15, 12, 51200, 2, 5 }, + {15, 12, 102400, 1, 1 }, + {15, 12, 102400, 2, 9 }, + {15, 12, 512000, 1, 1 }, + {15, 12, 512000, 2, 9 }, + {15, 12, 1024000, 1, 1 }, + {15, 12, 1024000, 2, 9 }, + {21, 16, 1024, 1, 1 }, + {21, 16, 1024, 2, 1 }, + {21, 16, 5120, 1, 1 }, + {21, 16, 5120, 2, 3 }, + {21, 16, 10240, 1, 2 }, + {21, 16, 10240, 2, 5 }, + {21, 16, 51200, 1, 4 }, + {21, 16, 51200, 2, 4 }, + {21, 16, 102400, 1, 4 }, + {21, 16, 102400, 2, 8 }, + {21, 16, 512000, 1, 4 }, + {21, 16, 512000, 2, 8 }, + {21, 16, 1024000, 1, 4 }, + {21, 16, 1024000, 2, 4 }, + {28, 25, 1024, 1, 1 }, + {28, 25, 1024, 2, 1 }, + {28, 25, 5120, 1, 3 }, + {28, 25, 5120, 2, 4 }, + {28, 25, 10240, 1, 7 }, + {28, 25, 10240, 2, 8 }, + {28, 25, 51200, 1, 3 }, + {28, 25, 51200, 2, 4 }, + {28, 25, 102400, 1, 9 }, + {28, 25, 102400, 2, 9 }, + {28, 25, 512000, 1, 9 }, + {28, 25, 512000, 2, 9 }, + {28, 25, 1024000, 1, 9 }, + {28, 25, 1024000, 2, 9 }, + {36, 33, 1024, 1, 1 }, + {36, 33, 1024, 2, 1 }, + {36, 33, 5120, 1, 6 }, + {36, 33, 5120, 2, 6 }, + {36, 33, 10240, 1, 4 }, + {36, 33, 10240, 2, 4 }, + {36, 33, 51200, 1, 7 }, + {36, 33, 51200, 2, 18}, + {36, 33, 102400, 1, 20}, + {36, 33, 102400, 2, 18}, + {36, 33, 512000, 1, 22}, + {36, 33, 512000, 2, 17}, + {36, 33, 1024000, 1, 28}, + {36, 33, 1024000, 2, 17}, + {3, 3, 1024, 1, 1 }, + {3, 3, 1024, 2, 1 }, + {3, 3, 5120, 1, 1 }, + {3, 3, 5120, 2, 1 }, + {3, 3, 10240, 1, 1 }, + {3, 3, 10240, 2, 1 }, + {3, 3, 51200, 1, 1 }, + {3, 3, 51200, 2, 1 }, + {3, 3, 102400, 1, 1 }, + {3, 3, 102400, 2, 1 }, + {3, 3, 512000, 1, 1 }, + {3, 3, 512000, 2, 1 }, + {3, 3, 1024000, 1, 1 }, + {3, 3, 1024000, 2, 1 }, + {6, 6, 
1024, 1, 1 }, + {6, 6, 1024, 2, 1 }, + {6, 6, 5120, 1, 1 }, + {6, 6, 5120, 2, 1 }, + {6, 6, 10240, 1, 1 }, + {6, 6, 10240, 2, 1 }, + {6, 6, 51200, 1, 1 }, + {6, 6, 51200, 2, 1 }, + {6, 6, 102400, 1, 1 }, + {6, 6, 102400, 2, 1 }, + {6, 6, 512000, 1, 1 }, + {6, 6, 512000, 2, 1 }, + {6, 6, 1024000, 1, 1 }, + {6, 6, 1024000, 2, 1 }, + {10, 12, 1024, 1, 3 }, + {10, 12, 1024, 2, 5 }, + {10, 12, 5120, 1, 1 }, + {10, 12, 5120, 2, 3 }, + {10, 12, 10240, 1, 3 }, + {10, 12, 10240, 2, 3 }, + {10, 12, 51200, 1, 1 }, + {10, 12, 51200, 2, 6 }, + {10, 12, 102400, 1, 1 }, + {10, 12, 102400, 2, 6 }, + {10, 12, 512000, 1, 1 }, + {10, 12, 512000, 2, 6 }, + {10, 12, 1024000, 1, 1 }, + {10, 12, 1024000, 2, 6 }, + {15, 16, 1024, 1, 3 }, + {15, 16, 1024, 2, 3 }, + {15, 16, 5120, 1, 3 }, + {15, 16, 5120, 2, 3 }, + {15, 16, 10240, 1, 3 }, + {15, 16, 10240, 2, 3 }, + {15, 16, 51200, 1, 3 }, + {15, 16, 51200, 2, 3 }, + {15, 16, 102400, 1, 3 }, + {15, 16, 102400, 2, 3 }, + {15, 16, 512000, 1, 3 }, + {15, 16, 512000, 2, 16}, + {15, 16, 1024000, 1, 3 }, + {15, 16, 1024000, 2, 16}, + {21, 25, 1024, 1, 1 }, + {21, 25, 1024, 2, 1 }, + {21, 25, 5120, 1, 3 }, + {21, 25, 5120, 2, 3 }, + {21, 25, 10240, 1, 5 }, + {21, 25, 10240, 2, 6 }, + {21, 25, 51200, 1, 6 }, + {21, 25, 51200, 2, 6 }, + {21, 25, 102400, 1, 6 }, + {21, 25, 102400, 2, 6 }, + {21, 25, 512000, 1, 6 }, + {21, 25, 512000, 2, 6 }, + {21, 25, 1024000, 1, 6 }, + {21, 25, 1024000, 2, 6 }, + {28, 33, 1024, 1, 1 }, + {28, 33, 1024, 2, 1 }, + {28, 33, 5120, 1, 6 }, + {28, 33, 5120, 2, 4 }, + {28, 33, 10240, 1, 4 }, + {28, 33, 10240, 2, 4 }, + {28, 33, 51200, 1, 6 }, + {28, 33, 51200, 2, 6 }, + {28, 33, 102400, 1, 6 }, + {28, 33, 102400, 2, 6 }, + {28, 33, 512000, 1, 6 }, + {28, 33, 512000, 2, 6 }, + {28, 33, 1024000, 1, 6 }, + {28, 33, 1024000, 2, 6 }, + {4, 1, 1024, 1, 1 }, + {4, 1, 1024, 3, 1 }, + {4, 1, 5120, 1, 1 }, + {4, 1, 5120, 3, 1 }, + {4, 1, 10240, 1, 1 }, + {4, 1, 10240, 3, 1 }, + {4, 1, 51200, 1, 4 }, + {4, 1, 51200, 3, 3 }, + {4, 1, 102400, 1, 5 }, + {4, 1, 102400, 3, 3 }, + {4, 1, 512000, 1, 1 }, + {4, 1, 512000, 3, 4 }, + {4, 1, 1024000, 1, 1 }, + {4, 1, 1024000, 3, 2 }, + {10, 4, 1024, 1, 1 }, + {10, 4, 1024, 3, 1 }, + {10, 4, 5120, 1, 1 }, + {10, 4, 5120, 3, 1 }, + {10, 4, 10240, 1, 1 }, + {10, 4, 10240, 3, 1 }, + {10, 4, 51200, 1, 1 }, + {10, 4, 51200, 3, 2 }, + {10, 4, 102400, 1, 1 }, + {10, 4, 102400, 3, 2 }, + {10, 4, 512000, 1, 1 }, + {10, 4, 512000, 3, 2 }, + {10, 4, 1024000, 1, 1 }, + {10, 4, 1024000, 3, 2 }, + {20, 11, 1024, 1, 1 }, + {20, 11, 1024, 3, 1 }, + {20, 11, 5120, 1, 1 }, + {20, 11, 5120, 3, 2 }, + {20, 11, 10240, 1, 3 }, + {20, 11, 10240, 3, 3 }, + {20, 11, 51200, 1, 7 }, + {20, 11, 51200, 3, 5 }, + {20, 11, 102400, 1, 2 }, + {20, 11, 102400, 3, 5 }, + {20, 11, 512000, 1, 2 }, + {20, 11, 512000, 3, 11}, + {20, 11, 1024000, 1, 2 }, + {20, 11, 1024000, 3, 11}, + {20, 14, 1024, 1, 1 }, + {20, 14, 1024, 3, 1 }, + {20, 14, 5120, 1, 1 }, + {20, 14, 5120, 3, 6 }, + {20, 14, 10240, 1, 3 }, + {20, 14, 10240, 3, 6 }, + {20, 14, 51200, 1, 3 }, + {20, 14, 51200, 3, 3 }, + {20, 14, 102400, 1, 3 }, + {20, 14, 102400, 3, 3 }, + {20, 14, 512000, 1, 3 }, + {20, 14, 512000, 3, 13}, + {20, 14, 1024000, 1, 3 }, + {20, 14, 1024000, 3, 13}, + {35, 24, 1024, 1, 1 }, + {35, 24, 1024, 3, 1 }, + {35, 24, 5120, 1, 3 }, + {35, 24, 5120, 3, 9 }, + {35, 24, 10240, 1, 9 }, + {35, 24, 10240, 3, 15}, + {35, 24, 51200, 1, 3 }, + {35, 24, 51200, 3, 9 }, + {35, 24, 102400, 1, 3 }, + {35, 24, 102400, 3, 9 }, + {35, 24, 512000, 1, 3 }, + {35, 24, 512000, 3, 22}, + {35, 
24, 1024000, 1, 3 }, + {35, 24, 1024000, 3, 22}, + {4, 4, 1024, 1, 1 }, + {4, 4, 1024, 3, 1 }, + {4, 4, 5120, 1, 1 }, + {4, 4, 5120, 3, 1 }, + {4, 4, 10240, 1, 1 }, + {4, 4, 10240, 3, 1 }, + {4, 4, 51200, 1, 1 }, + {4, 4, 51200, 3, 1 }, + {4, 4, 102400, 1, 1 }, + {4, 4, 102400, 3, 1 }, + {4, 4, 512000, 1, 1 }, + {4, 4, 512000, 3, 1 }, + {4, 4, 1024000, 1, 1 }, + {4, 4, 1024000, 3, 1 }, + {10, 11, 1024, 1, 1 }, + {10, 11, 1024, 3, 1 }, + {10, 11, 5120, 1, 1 }, + {10, 11, 5120, 3, 1 }, + {10, 11, 10240, 1, 1 }, + {10, 11, 10240, 3, 3 }, + {10, 11, 51200, 1, 1 }, + {10, 11, 51200, 3, 2 }, + {10, 11, 102400, 1, 1 }, + {10, 11, 102400, 3, 2 }, + {10, 11, 512000, 1, 1 }, + {10, 11, 512000, 3, 2 }, + {10, 11, 1024000, 1, 1 }, + {10, 11, 1024000, 3, 2 }, + {10, 14, 1024, 1, 1 }, + {10, 14, 1024, 3, 1 }, + {10, 14, 5120, 1, 1 }, + {10, 14, 5120, 3, 1 }, + {10, 14, 10240, 1, 1 }, + {10, 14, 10240, 3, 2 }, + {10, 14, 51200, 1, 3 }, + {10, 14, 51200, 3, 11}, + {10, 14, 102400, 1, 3 }, + {10, 14, 102400, 3, 3 }, + {10, 14, 512000, 1, 1 }, + {10, 14, 512000, 3, 3 }, + {10, 14, 1024000, 1, 1 }, + {10, 14, 1024000, 3, 3 }, + {20, 24, 1024, 1, 1 }, + {20, 24, 1024, 3, 1 }, + {20, 24, 5120, 1, 2 }, + {20, 24, 5120, 3, 3 }, + {20, 24, 10240, 1, 5 }, + {20, 24, 10240, 3, 5 }, + {20, 24, 51200, 1, 3 }, + {20, 24, 51200, 3, 19}, + {20, 24, 102400, 1, 15}, + {20, 24, 102400, 3, 19}, + {20, 24, 512000, 1, 9 }, + {20, 24, 512000, 3, 19}, + {20, 24, 1024000, 1, 9 }, + {20, 24, 1024000, 3, 19} +}; + +//////////////////////////////////////////////////////////////////////////////// +std::vector > drtc_n_v100 = { + {3, 1, 1024, 1, 1 }, + {3, 1, 1024, 2, 1 }, + {3, 1, 5120, 1, 1 }, + {3, 1, 5120, 2, 1 }, + {3, 1, 10240, 1, 1 }, + {3, 1, 10240, 2, 1 }, + {3, 1, 51200, 1, 1 }, + {3, 1, 51200, 2, 1 }, + {3, 1, 102400, 1, 1 }, + {3, 1, 102400, 2, 1 }, + {3, 1, 512000, 1, 1 }, + {3, 1, 512000, 2, 1 }, + {3, 1, 1024000, 1, 1 }, + {3, 1, 1024000, 2, 1 }, + {6, 3, 1024, 1, 1 }, + {6, 3, 1024, 2, 1 }, + {6, 3, 5120, 1, 1 }, + {6, 3, 5120, 2, 1 }, + {6, 3, 10240, 1, 1 }, + {6, 3, 10240, 2, 1 }, + {6, 3, 51200, 1, 1 }, + {6, 3, 51200, 2, 1 }, + {6, 3, 102400, 1, 1 }, + {6, 3, 102400, 2, 1 }, + {6, 3, 512000, 1, 1 }, + {6, 3, 512000, 2, 1 }, + {6, 3, 1024000, 1, 1 }, + {6, 3, 1024000, 2, 1 }, + {10, 6, 1024, 1, 1 }, + {10, 6, 1024, 2, 1 }, + {10, 6, 5120, 1, 1 }, + {10, 6, 5120, 2, 1 }, + {10, 6, 10240, 1, 1 }, + {10, 6, 10240, 2, 1 }, + {10, 6, 51200, 1, 1 }, + {10, 6, 51200, 2, 1 }, + {10, 6, 102400, 1, 2 }, + {10, 6, 102400, 2, 1 }, + {10, 6, 512000, 1, 1 }, + {10, 6, 512000, 2, 1 }, + {10, 6, 1024000, 1, 1 }, + {10, 6, 1024000, 2, 1 }, + {15, 12, 1024, 1, 1 }, + {15, 12, 1024, 2, 2 }, + {15, 12, 5120, 1, 3 }, + {15, 12, 5120, 2, 1 }, + {15, 12, 10240, 1, 4 }, + {15, 12, 10240, 2, 2 }, + {15, 12, 51200, 1, 3 }, + {15, 12, 51200, 2, 11}, + {15, 12, 102400, 1, 3 }, + {15, 12, 102400, 2, 3 }, + {15, 12, 512000, 1, 9 }, + {15, 12, 512000, 2, 5 }, + {15, 12, 1024000, 1, 9 }, + {15, 12, 1024000, 2, 5 }, + {21, 16, 1024, 1, 1 }, + {21, 16, 1024, 2, 1 }, + {21, 16, 5120, 1, 5 }, + {21, 16, 5120, 2, 2 }, + {21, 16, 10240, 1, 4 }, + {21, 16, 10240, 2, 4 }, + {21, 16, 51200, 1, 5 }, + {21, 16, 51200, 2, 5 }, + {21, 16, 102400, 1, 9 }, + {21, 16, 102400, 2, 5 }, + {21, 16, 512000, 1, 9 }, + {21, 16, 512000, 2, 7 }, + {21, 16, 1024000, 1, 5 }, + {21, 16, 1024000, 2, 7 }, + {28, 25, 1024, 1, 1 }, + {28, 25, 1024, 2, 5 }, + {28, 25, 5120, 1, 5 }, + {28, 25, 5120, 2, 5 }, + {28, 25, 10240, 1, 7 }, + {28, 25, 10240, 2, 5 }, + {28, 25, 51200, 1, 
6 }, + {28, 25, 51200, 2, 5 }, + {28, 25, 102400, 1, 6 }, + {28, 25, 102400, 2, 10}, + {28, 25, 512000, 1, 6 }, + {28, 25, 512000, 2, 14}, + {28, 25, 1024000, 1, 16}, + {28, 25, 1024000, 2, 9 }, + {36, 33, 1024, 1, 6 }, + {36, 33, 1024, 2, 5 }, + {36, 33, 5120, 1, 11}, + {36, 33, 5120, 2, 8 }, + {36, 33, 10240, 1, 6 }, + {36, 33, 10240, 2, 9 }, + {36, 33, 51200, 1, 9 }, + {36, 33, 51200, 2, 9 }, + {36, 33, 102400, 1, 9 }, + {36, 33, 102400, 2, 10}, + {36, 33, 512000, 1, 9 }, + {36, 33, 512000, 2, 13}, + {36, 33, 1024000, 1, 9 }, + {36, 33, 1024000, 2, 13}, + {3, 3, 1024, 1, 1 }, + {3, 3, 1024, 2, 1 }, + {3, 3, 5120, 1, 1 }, + {3, 3, 5120, 2, 1 }, + {3, 3, 10240, 1, 1 }, + {3, 3, 10240, 2, 1 }, + {3, 3, 51200, 1, 1 }, + {3, 3, 51200, 2, 1 }, + {3, 3, 102400, 1, 1 }, + {3, 3, 102400, 2, 1 }, + {3, 3, 512000, 1, 1 }, + {3, 3, 512000, 2, 1 }, + {3, 3, 1024000, 1, 1 }, + {3, 3, 1024000, 2, 1 }, + {6, 6, 1024, 1, 1 }, + {6, 6, 1024, 2, 1 }, + {6, 6, 5120, 1, 1 }, + {6, 6, 5120, 2, 1 }, + {6, 6, 10240, 1, 1 }, + {6, 6, 10240, 2, 1 }, + {6, 6, 51200, 1, 1 }, + {6, 6, 51200, 2, 1 }, + {6, 6, 102400, 1, 1 }, + {6, 6, 102400, 2, 1 }, + {6, 6, 512000, 1, 1 }, + {6, 6, 512000, 2, 1 }, + {6, 6, 1024000, 1, 1 }, + {6, 6, 1024000, 2, 1 }, + {10, 12, 1024, 1, 3 }, + {10, 12, 1024, 2, 3 }, + {10, 12, 5120, 1, 3 }, + {10, 12, 5120, 2, 3 }, + {10, 12, 10240, 1, 5 }, + {10, 12, 10240, 2, 3 }, + {10, 12, 51200, 1, 5 }, + {10, 12, 51200, 2, 3 }, + {10, 12, 102400, 1, 5 }, + {10, 12, 102400, 2, 3 }, + {10, 12, 512000, 1, 5 }, + {10, 12, 512000, 2, 2 }, + {10, 12, 1024000, 1, 5 }, + {10, 12, 1024000, 2, 2 }, + {15, 16, 1024, 1, 3 }, + {15, 16, 1024, 2, 3 }, + {15, 16, 5120, 1, 3 }, + {15, 16, 5120, 2, 3 }, + {15, 16, 10240, 1, 4 }, + {15, 16, 10240, 2, 6 }, + {15, 16, 51200, 1, 7 }, + {15, 16, 51200, 2, 3 }, + {15, 16, 102400, 1, 5 }, + {15, 16, 102400, 2, 3 }, + {15, 16, 512000, 1, 5 }, + {15, 16, 512000, 2, 3 }, + {15, 16, 1024000, 1, 5 }, + {15, 16, 1024000, 2, 4 }, + {21, 25, 1024, 1, 3 }, + {21, 25, 1024, 2, 1 }, + {21, 25, 5120, 1, 5 }, + {21, 25, 5120, 2, 13}, + {21, 25, 10240, 1, 7 }, + {21, 25, 10240, 2, 7 }, + {21, 25, 51200, 1, 5 }, + {21, 25, 51200, 2, 4 }, + {21, 25, 102400, 1, 6 }, + {21, 25, 102400, 2, 4 }, + {21, 25, 512000, 1, 6 }, + {21, 25, 512000, 2, 4 }, + {21, 25, 1024000, 1, 6 }, + {21, 25, 1024000, 2, 4 }, + {28, 33, 1024, 1, 2 }, + {28, 33, 1024, 2, 5 }, + {28, 33, 5120, 1, 11}, + {28, 33, 5120, 2, 8 }, + {28, 33, 10240, 1, 9 }, + {28, 33, 10240, 2, 9 }, + {28, 33, 51200, 1, 6 }, + {28, 33, 51200, 2, 9 }, + {28, 33, 102400, 1, 6 }, + {28, 33, 102400, 2, 9 }, + {28, 33, 512000, 1, 6 }, + {28, 33, 512000, 2, 15}, + {28, 33, 1024000, 1, 6 }, + {28, 33, 1024000, 2, 15}, + {4, 1, 1024, 1, 1 }, + {4, 1, 1024, 3, 1 }, + {4, 1, 5120, 1, 1 }, + {4, 1, 5120, 3, 1 }, + {4, 1, 10240, 1, 1 }, + {4, 1, 10240, 3, 1 }, + {4, 1, 51200, 1, 1 }, + {4, 1, 51200, 3, 1 }, + {4, 1, 102400, 1, 1 }, + {4, 1, 102400, 3, 1 }, + {4, 1, 512000, 1, 1 }, + {4, 1, 512000, 3, 1 }, + {4, 1, 1024000, 1, 1 }, + {4, 1, 1024000, 3, 1 }, + {10, 4, 1024, 1, 1 }, + {10, 4, 1024, 3, 1 }, + {10, 4, 5120, 1, 1 }, + {10, 4, 5120, 3, 1 }, + {10, 4, 10240, 1, 1 }, + {10, 4, 10240, 3, 1 }, + {10, 4, 51200, 1, 1 }, + {10, 4, 51200, 3, 5 }, + {10, 4, 102400, 1, 1 }, + {10, 4, 102400, 3, 1 }, + {10, 4, 512000, 1, 1 }, + {10, 4, 512000, 3, 1 }, + {10, 4, 1024000, 1, 1 }, + {10, 4, 1024000, 3, 1 }, + {20, 11, 1024, 1, 1 }, + {20, 11, 1024, 3, 1 }, + {20, 11, 5120, 1, 6 }, + {20, 11, 5120, 3, 2 }, + {20, 11, 10240, 1, 6 }, + {20, 11, 10240, 
3, 3 }, + {20, 11, 51200, 1, 6 }, + {20, 11, 51200, 3, 5 }, + {20, 11, 102400, 1, 6 }, + {20, 11, 102400, 3, 5 }, + {20, 11, 512000, 1, 5 }, + {20, 11, 512000, 3, 6 }, + {20, 11, 1024000, 1, 6 }, + {20, 11, 1024000, 3, 6 }, + {20, 14, 1024, 1, 1 }, + {20, 14, 1024, 3, 2 }, + {20, 14, 5120, 1, 8 }, + {20, 14, 5120, 3, 2 }, + {20, 14, 10240, 1, 5 }, + {20, 14, 10240, 3, 5 }, + {20, 14, 51200, 1, 5 }, + {20, 14, 51200, 3, 5 }, + {20, 14, 102400, 1, 5 }, + {20, 14, 102400, 3, 5 }, + {20, 14, 512000, 1, 5 }, + {20, 14, 512000, 3, 5 }, + {20, 14, 1024000, 1, 5 }, + {20, 14, 1024000, 3, 5 }, + {35, 24, 1024, 1, 1 }, + {35, 24, 1024, 3, 2 }, + {35, 24, 5120, 1, 5 }, + {35, 24, 5120, 3, 5 }, + {35, 24, 10240, 1, 6 }, + {35, 24, 10240, 3, 9 }, + {35, 24, 51200, 1, 5 }, + {35, 24, 51200, 3, 6 }, + {35, 24, 102400, 1, 5 }, + {35, 24, 102400, 3, 14}, + {35, 24, 512000, 1, 18}, + {35, 24, 512000, 3, 9 }, + {35, 24, 1024000, 1, 18}, + {35, 24, 1024000, 3, 9 }, + {4, 4, 1024, 1, 1 }, + {4, 4, 1024, 3, 1 }, + {4, 4, 5120, 1, 1 }, + {4, 4, 5120, 3, 1 }, + {4, 4, 10240, 1, 1 }, + {4, 4, 10240, 3, 1 }, + {4, 4, 51200, 1, 1 }, + {4, 4, 51200, 3, 1 }, + {4, 4, 102400, 1, 5 }, + {4, 4, 102400, 3, 3 }, + {4, 4, 512000, 1, 1 }, + {4, 4, 512000, 3, 1 }, + {4, 4, 1024000, 1, 1 }, + {4, 4, 1024000, 3, 1 }, + {10, 11, 1024, 1, 1 }, + {10, 11, 1024, 3, 1 }, + {10, 11, 5120, 1, 1 }, + {10, 11, 5120, 3, 1 }, + {10, 11, 10240, 1, 2 }, + {10, 11, 10240, 3, 2 }, + {10, 11, 51200, 1, 1 }, + {10, 11, 51200, 3, 1 }, + {10, 11, 102400, 1, 1 }, + {10, 11, 102400, 3, 1 }, + {10, 11, 512000, 1, 1 }, + {10, 11, 512000, 3, 1 }, + {10, 11, 1024000, 1, 1 }, + {10, 11, 1024000, 3, 1 }, + {10, 14, 1024, 1, 1 }, + {10, 14, 1024, 3, 1 }, + {10, 14, 5120, 1, 1 }, + {10, 14, 5120, 3, 1 }, + {10, 14, 10240, 1, 2 }, + {10, 14, 10240, 3, 7 }, + {10, 14, 51200, 1, 1 }, + {10, 14, 51200, 3, 1 }, + {10, 14, 102400, 1, 1 }, + {10, 14, 102400, 3, 1 }, + {10, 14, 512000, 1, 1 }, + {10, 14, 512000, 3, 1 }, + {10, 14, 1024000, 1, 1 }, + {10, 14, 1024000, 3, 1 }, + {20, 24, 1024, 1, 6 }, + {20, 24, 1024, 3, 3 }, + {20, 24, 5120, 1, 3 }, + {20, 24, 5120, 3, 5 }, + {20, 24, 10240, 1, 7 }, + {20, 24, 10240, 3, 9 }, + {20, 24, 51200, 1, 11}, + {20, 24, 51200, 3, 19}, + {20, 24, 102400, 1, 10}, + {20, 24, 102400, 3, 5 }, + {20, 24, 512000, 1, 14}, + {20, 24, 512000, 3, 6 }, + {20, 24, 1024000, 1, 13}, + {20, 24, 1024000, 3, 5 } +}; diff --git a/include/ceed/jit-source/magma/grad-1d.h b/include/ceed/jit-source/magma/grad-1d.h deleted file mode 100644 index 5eea0ee2f9..0000000000 --- a/include/ceed/jit-source/magma/grad-1d.h +++ /dev/null @@ -1,129 +0,0 @@ -// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -// macros to abstract access of shared memory and reg. file -#define sT(i, j) sT[(j)*P_ + (i)] - -////////////////////////////////////////////////////////////////////////////////////////// -// grad basis action (1D) -template -static __device__ __inline__ void magma_grad_1d_device(const T *sT, magma_trans_t transT, T *sU[NCOMP_], T *sV[NCOMP_], const int tx) { - // Assumptions - // 1. 1D threads of size max(P_,Q_) - // 2. sU[i] is 1xP_: in shared memory - // 3. sV[i] is 1xQ_: in shared memory - // 4. P_roduct per component is one row (1xP_) times T matrix (P_xQ_) => one row (1xQ_) - // 5. 
Each thread computes one entry in sV[i] - // 6. Must sync before and after call - // 7. Note that the layout for U and V is different from 2D/3D problem - - T rv; - if (tx < Q_) { - for (int icomp = 0; icomp < NCOMP_; icomp++) { - rv = (transT == MagmaTrans) ? sV[icomp][tx] : 0.0; - for (int i = 0; i < P_; i++) { - rv += sU[icomp][i] * sT(i, tx); - } - sV[icomp][tx] = rv; - } - } -} - -////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ, MAGMA_MAXTHREADS_1D)) __global__ - void magma_gradn_1d_kernel(const CeedScalar *dTinterp, const CeedScalar *dTgrad, const CeedScalar *dU, const int estrdU, const int cstrdU, - const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaNoTrans; - - if (elem_id >= nelem) return; - - CeedScalar *sU[NCOMP]; - CeedScalar *sV[NCOMP]; - - // shift global memory pointers by elem stride - dU += elem_id * estrdU; - dV += elem_id * estrdV; - - // assign shared memory pointers - CeedScalar *sT = (CeedScalar *)(shared_data); - CeedScalar *sW = sT + P * Q; - sU[0] = sW + ty * NCOMP * (P + Q); - sV[0] = sU[0] + (NCOMP * 1 * P); - for (int icomp = 1; icomp < NCOMP; icomp++) { - sU[icomp] = sU[icomp - 1] + (1 * P); - sV[icomp] = sV[icomp - 1] + (1 * Q); - } - - // read T - if (ty == 0) { - dread_T_gm2sm(tx, transT, dTgrad, sT); - } - - // read U - read_1d(dU, cstrdU, sU, tx); - - __syncthreads(); - magma_grad_1d_device(sT, transT, sU, sV, tx); - __syncthreads(); - - // write V - write_1d(sV, dV, cstrdV, tx); -} - -////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ, MAGMA_MAXTHREADS_1D)) __global__ - void magma_gradt_1d_kernel(const CeedScalar *dTinterp, const CeedScalar *dTgrad, const CeedScalar *dU, const int estrdU, const int cstrdU, - const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaTrans; - - if (elem_id >= nelem) return; - - CeedScalar *sU[NCOMP]; - CeedScalar *sV[NCOMP]; - - // shift global memory pointers by elem stride - dU += elem_id * estrdU; - dV += elem_id * estrdV; - - // assign shared memory pointers - CeedScalar *sT = (CeedScalar *)(shared_data); - CeedScalar *sW = sT + Q * P; - sU[0] = sW + ty * NCOMP * (Q + P); - sV[0] = sU[0] + (NCOMP * 1 * Q); - for (int icomp = 1; icomp < NCOMP; icomp++) { - sU[icomp] = sU[icomp - 1] + (1 * Q); - sV[icomp] = sV[icomp - 1] + (1 * P); - } - - // read T - if (ty == 0) { - dread_T_gm2sm(tx, transT, dTgrad, sT); - } - - // read U - read_1d(dU, cstrdU, sU, tx); - - // read V - read_1d(dV, cstrdV, sV, tx); - - __syncthreads(); - magma_grad_1d_device(sT, transT, sU, sV, tx); - __syncthreads(); - - // write V - write_1d(sV, dV, cstrdV, tx); -} diff --git a/include/ceed/jit-source/magma/grad-2d.h b/include/ceed/jit-source/magma/grad-2d.h deleted file mode 100644 index 1f2763ac9f..0000000000 --- a/include/ceed/jit-source/magma/grad-2d.h +++ /dev/null @@ -1,180 +0,0 @@ -// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED 
contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -// macros to abstract access of shared memory and reg. file -#define sT(i, j) sT[(j)*P_ + (i)] -#define sTmp(i, j, ldw) sTmp[(j) * (ldw) + (i)] - -////////////////////////////////////////////////////////////////////////////////////////// -// grad basis action (2D) -// This function is called two times at a higher level for 2D -// DIM_U -- for the size of rU[DIM_U * NCOMP_ * MAXP_Q_] -// DIM_V -- for the size of rV[DIM_V * NCOMP_ * MAXP_Q_] -// iDIM_ -- the index of the outermost loop over dimensions in grad -// iDIM_U -- which dim index of rU is accessed (always 0 for notrans, 0 or 1 for trans) -// iDIM_V -- which dim index of rV is accessed (0 or 1 for notrans, always 0 for trans) -// the scalar beta is used to specify whether to accumulate to rV, or overwrite it -template -static __device__ __inline__ void magma_grad_2d_device(const T *sTinterp, const T *sTgrad, T rU[DIM_U][NCOMP_][rUsize], T rV[DIM_V][NCOMP_][rVsize], - T beta, const int tx, T rTmp, T *swork) { - // Assumptions - // 0. This device routine applies grad for one dim only (iDIM_), so it should be called twice for 2D - // 1. 1D threads of size max(P_,Q_) - // 2. input: rU[DIM_U x NCOMP_ x P_] in registers (per thread) - // 3. output: rV[DIM_V x NCOMP_ x Q_] in registers (per thread) - // 4. Two products per each (dim,component) pair - // 4.1 Batch P_ of (1xP_) matrices times (P_xQ_) matrix => Batch P_ of (1xQ_) matrices - // 4.2 Batch 1 of (Q_xP_) matrix times (P_xQ_) matrix => (Q_xQ_) matrix - // 6. Each thread computes one row of the output of each product - // 7. Sync is recommended before and after the call - - for (int icomp = 0; icomp < NCOMP_; icomp++) { - // 1st product -- Batch P_ of (1xP_) matrices [reg] x (P_xQ_) [shmem] => Batch P_ of (1xQ_) matrices - // the batch output P_ x (1xQ_) is written on the fly to shmem - if (tx < P_) { - const int batchid = tx; - const int sld = 1; - const T *sT = (iDIM_ == 0) ? sTgrad : sTinterp; - T *sTmp = swork + batchid * (1 * Q_); - for (int j = 0; j < Q_; j++) { - rTmp = 0.0; - for (int i = 0; i < P_; i++) { - rTmp += rU[iDIM_U][icomp][i] * sT(i, j); - } - sTmp(0, j, sld) = rTmp; - } - } // end of: if (tx < P_) - __syncthreads(); - - // 2nd product -- Batch 1 of a (Q_xP_) matrix [shmem] x (P_xQ_) [shmem] => (Q_xQ_) matrix [reg] - if (tx < Q_) { - const int batchid = 0; - const int sld = Q_; - const T *sT = (iDIM_ == 1) ? 
sTgrad : sTinterp; - T *sTmp = swork + batchid * (Q_ * P_); - for (int j = 0; j < Q_; j++) { - rTmp = 0.0; - for (int i = 0; i < P_; i++) { - rTmp += sTmp(tx, i, sld) * sT(i, j); - } - rV[iDIM_V][icomp][j] *= beta; - rV[iDIM_V][icomp][j] += rTmp; - } - } - __syncthreads(); - } // loop over NCOMP_ -} - -////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ, MAGMA_MAXTHREADS_2D)) __global__ - void magma_gradn_2d_kernel(const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, const CeedScalar *dU, const int estrdU, const int cstrdU, - const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaNoTrans; - - if (elem_id >= nelem) return; - - CeedScalar rU[1][NCOMP][P] = {0.0}; // here DIMU = 1, but might be different for a fused operator - CeedScalar rV[1][NCOMP][Q] = {0.0}; // here DIMV = 1, but might be different for a fused operator - CeedScalar rTmp = 0.0; - - // shift global memory pointers by elem stride - dU += elem_id * estrdU; - dV += elem_id * estrdV; - - // assign shared memory pointers - CeedScalar *sTinterp = (CeedScalar *)(shared_data); - CeedScalar *sTgrad = sTinterp + P * Q; - CeedScalar *sTmp = sTgrad + P * Q; - sTmp += ty * (P * MAXPQ); - - // read T - if (ty == 0) { - dread_T_gm2sm(tx, transT, dinterp1d, sTinterp); - dread_T_gm2sm(tx, transT, dgrad1d, sTgrad); - } - - // No need to read V ( required only in transposed grad ) - const CeedScalar beta = 0.0; - - /* read U (idim = 0 for dU, iDIM = 0 for rU) -- - there is a sync at the end of this function */ - readU_2d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); - - /* first call (iDIM = 0, iDIMU = 0, iDIMV = 0) -- - output from rV[0][][] into dV (idim = 0) */ - magma_grad_2d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); - /* there is a sync at the end of magma_grad_2d_device */ - writeV_2d(dV + (0 * dstrdV), cstrdV, rV, tx); - - /* second call (iDIM = 1, iDIMU = 0, iDIMV = 0) -- - output from rV[0][][] into dV (idim = 1) */ - magma_grad_2d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); - /* there is a sync at the end of magma_grad_2d_device */ - writeV_2d(dV + (1 * dstrdV), cstrdV, rV, tx); -} - -////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ, MAGMA_MAXTHREADS_2D)) __global__ - void magma_gradt_2d_kernel(const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, const CeedScalar *dU, const int estrdU, const int cstrdU, - const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaTrans; - - if (elem_id >= nelem) return; - - CeedScalar rU[1][NCOMP][Q] = {0.0}; // here DIMU = 1, but might be different for a fused operator - CeedScalar rV[1][NCOMP][P] = {0.0}; // here DIMV = 1, but might be different for a fused operator - CeedScalar rTmp = 0.0; - - // shift global memory pointers by elem stride - dU += elem_id * estrdU; - dV += elem_id * estrdV; - - // assign shared memory pointers - CeedScalar *sTinterp = (CeedScalar *)(shared_data); - CeedScalar *sTgrad = 
sTinterp + Q * P; - CeedScalar *sTmp = sTgrad + Q * P; - sTmp += ty * (Q * MAXPQ); - - // read T - if (ty == 0) { - dread_T_gm2sm(tx, transT, dinterp1d, sTinterp); - dread_T_gm2sm(tx, transT, dgrad1d, sTgrad); - } - __syncthreads(); - - /* read V (since this is transposed mode -- - idim = 0 for dV, iDIM = 0 for rV) */ - const CeedScalar beta = 1.0; - readV_2d(dV + (0 * dstrdV), cstrdV, rV, tx); - - /* read U (idim = 0 for dU, iDIM = 0 for rU) -- - there is a sync at the end of this function */ - readU_2d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); - /* first call (iDIM = 0, iDIMU = 0, iDIMV = 0) */ - magma_grad_2d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); - /* there is a sync at the end of magma_grad_2d_device */ - - /* read U (idim = 1 for dU, iDIM = 0 for rU) -- - there is a sync at the end of this function */ - readU_2d(dU + (1 * dstrdU), cstrdU, rU, sTmp, tx); - /* second call (iDIM = 1, iDIMU = 0, iDIMV = 0) */ - magma_grad_2d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); - /* there is a sync at the end of magma_grad_2d_device */ - - // write V - writeV_2d(dV + (0 * dstrdV), cstrdV, rV, tx); -} diff --git a/include/ceed/jit-source/magma/grad-3d.h b/include/ceed/jit-source/magma/grad-3d.h deleted file mode 100644 index 072c1da2f8..0000000000 --- a/include/ceed/jit-source/magma/grad-3d.h +++ /dev/null @@ -1,215 +0,0 @@ -// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -// macros to abstract access of shared memory and reg. file -#define sT(i, j) sT[(j)*P_ + (i)] -#define sTmp(i, j, ldw) sTmp[(j) * (ldw) + (i)] -#define sTmp2(i, j, ldw) sTmp2[(j) * (ldw) + (i)] - -////////////////////////////////////////////////////////////////////////////////////////// -// grad basis action (3D) -// This function is called three times at a higher level for 3D -// DIM_U -- for the size of rU[DIM_U * NCOMP_ * MAXP_Q_] -// DIM_V -- for the size of rV[DIM_V * NCOMP_ * MAXP_Q_] -// iDIM_ -- the index of the outermost loop over dimensions in grad -// iDIM_U -- which dim index of rU is accessed (always 0 for notrans, 0, 1, or 2 for trans) -// iDIM_V -- which dim index of rV is accessed (0, 1, or 2 for notrans, always 0 for trans) -// the scalar beta is used to specify whether to accumulate to rV, or overwrite it -template -static __device__ __inline__ void magma_grad_3d_device(const T *sTinterp, const T *sTgrad, T rU[DIM_U][NCOMP_][rUsize], T rV[DIM_V][NCOMP_][rVsize], - T beta, const int tx, T rTmp, T *swork) { - // Assumptions - // 0. This device routine applies grad for one dim only (iDIM_), so it should be thrice for 3D - // 1. 1D threads of size max(P_,Q_)^2 - // 2. input: rU[DIM_U x NCOMP_ x rUsize] in registers (per thread) - // 3. output: rV[DIM_V x NCOMP_ x rVsize] in registers (per thread) - // 4. Three products per each (dim,component) pair - // 4.1 Batch P_^2 of (1xP_) matrices times (P_xQ_) matrix => Batch P_^2 of (1xQ_) matrices - // 4.2 Batch P_ of (Q_xP_) matrices times (P_xQ_) matrix => Batch P_ of (Q_xQ_) matrices - // 4.3 Batch 1 of (Q_^2xP_) matrix times (P_xQ_) matrix => (Q_^2xQ_) matrix - // 6. Each thread computes one row of the output of each product - // 7. 
Sync is recommended before and after the call - - T *sW1 = swork; - T *sW2 = sW1 + P_ * P_ * Q_; - for (int icomp = 0; icomp < NCOMP_; icomp++) { - // Batch P_^2 of (1xP_) matrices [reg] times (P_xQ_) matrix [shmem] => Batch P_^2 of (1xQ_) matrices [shmem] - if (tx < (P_ * P_)) { - const int batchid = tx; - const int sld = 1; - const T *sT = (iDIM_ == 0) ? sTgrad : sTinterp; - T *sTmp = sW1 + batchid * (1 * Q_); - for (int j = 0; j < Q_; j++) { - rTmp = 0.0; - for (int i = 0; i < P_; i++) { - rTmp += rU[iDIM_U][icomp][i] * sT(i, j); - } - sTmp(0, j, sld) = rTmp; - } - } // end of: if (tx < P_*P_) - __syncthreads(); - - // Batch P_ of (Q_xP_) matrices [shmem] times (P_xQ_) matrix [shmem] => Batch P_ of (Q_xQ_) matrices [reg] - if (tx < (P_ * Q_)) { - const int batchid = tx / Q_; - const int tx_ = tx % Q_; - const int sld = Q_; - const T *sT = (iDIM_ == 1) ? sTgrad : sTinterp; - T *sTmp = sW1 + batchid * (Q_ * P_); // sTmp is input - T *sTmp2 = sW2 + batchid * (Q_ * Q_); // sTmp2 is output - for (int j = 0; j < Q_; j++) { - rTmp = 0.0; - for (int i = 0; i < P_; i++) { - rTmp += sTmp(tx_, i, sld) * sT(i, j); - } - sTmp2(tx_, j, sld) = rTmp; - } - } - __syncthreads(); - - // Batch 1 of (Q_^2xP_) matrices [shmem] times (P_xQ_) matrix [shmem] => Batch 1 of (Q_^2xQ_) matrices [reg] - if (tx < (Q_ * Q_)) { - // No need to declare batchid = (tx / Q_^2) = always zero - // No need to declare tx_ = (tx_ % Q_^2) = always tx - const int sld = Q_ * Q_; - const T *sT = (iDIM_ == 2) ? sTgrad : sTinterp; - T *sTmp = sW2; // sTmp is input - for (int j = 0; j < Q_; j++) { - rTmp = 0.0; - for (int i = 0; i < P_; i++) { - rTmp += sTmp(tx, i, sld) * sT(i, j); - } - rV[iDIM_V][icomp][j] *= beta; - rV[iDIM_V][icomp][j] += rTmp; - } - } - __syncthreads(); - } // loop over NCOMP_ -} - -////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ *MAXPQ, MAGMA_MAXTHREADS_3D)) __global__ - void magma_gradn_3d_kernel(const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, const CeedScalar *dU, const int estrdU, const int cstrdU, - const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaNoTrans; - - if (elem_id >= nelem) return; - - CeedScalar rU[1][NCOMP][P] = {0.0}; // here DIMU = 1, but might be different for a fused operator - CeedScalar rV[1][NCOMP][Q] = {0.0}; // here DIMV = 1, but might be different for a fused operator - CeedScalar rTmp = 0.0; - - // shift global memory pointers by elem stride - dU += elem_id * estrdU; - dV += elem_id * estrdV; - - // assign shared memory pointers - CeedScalar *sTinterp = (CeedScalar *)(shared_data); - CeedScalar *sTgrad = sTinterp + P * Q; - CeedScalar *sTmp = sTgrad + P * Q; - sTmp += ty * (max(P * P * P, (P * P * Q) + (P * Q * Q))); - - // read T - if (ty == 0) { - dread_T_gm2sm(tx, transT, dinterp1d, sTinterp); - dread_T_gm2sm(tx, transT, dgrad1d, sTgrad); - } - __syncthreads(); - - // No need to read V ( required only in transposed grad ) - const CeedScalar beta = 0.0; - - /* read U (idim = 0 for dU, iDIM = 0 for rU) -- - there is a sync at the end of this function */ - readU_3d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); - - /* first call (iDIM = 0, iDIMU = 0, iDIMV = 0) -- - output from rV[0][][] into dV (idim = 0) */ - magma_grad_3d_device(sTinterp, 
sTgrad, rU, rV, beta, tx, rTmp, sTmp); - /* there is a sync at the end of magma_grad_3d_device */ - writeV_3d(dV + (0 * dstrdV), cstrdV, rV, tx); - - /* second call (iDIM = 1, iDIMU = 0, iDIMV = 0) -- - output from rV[0][][] into dV (idim = 1) */ - magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); - /* there is a sync at the end of magma_grad_3d_device */ - writeV_3d(dV + (1 * dstrdV), cstrdV, rV, tx); - - /* third call (iDIM = 2, iDIMU = 0, iDIMV = 0) -- - output from rV[0][][] into dV (idim = 2) */ - magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); - /* there is a sync at the end of magma_grad_3d_device */ - writeV_3d(dV + (2 * dstrdV), cstrdV, rV, tx); -} - -////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ *MAXPQ, MAGMA_MAXTHREADS_3D)) __global__ - void magma_gradt_3d_kernel(const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, const CeedScalar *dU, const int estrdU, const int cstrdU, - const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaTrans; - - if (elem_id >= nelem) return; - - CeedScalar rU[1][NCOMP][Q] = {0.0}; // here DIMU = 1, but might be different for a fused operator - CeedScalar rV[1][NCOMP][P] = {0.0}; // here DIMV = 1, but might be different for a fused operator - CeedScalar rTmp = 0.0; - - // shift global memory pointers by elem stride - dU += elem_id * estrdU; - dV += elem_id * estrdV; - - // assign shared memory pointers - CeedScalar *sTinterp = (CeedScalar *)(shared_data); - CeedScalar *sTgrad = sTinterp + Q * P; - CeedScalar *sTmp = sTgrad + Q * P; - sTmp += ty * (max(Q * Q * Q, (Q * Q * P) + (Q * P * P))); - - // read T - if (ty == 0) { - dread_T_gm2sm(tx, transT, dinterp1d, sTinterp); - dread_T_gm2sm(tx, transT, dgrad1d, sTgrad); - } - __syncthreads(); - - // read V (since this is transposed mode) - const CeedScalar beta = 1.0; - readV_3d(dV + (0 * dstrdV), cstrdV, rV, tx); - - /* read U (idim = 0 for dU, iDIM = 0 for rU) -- - there is a sync at the end of this function */ - readU_3d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); - /* then first call (iDIM = 0, iDIMU = 0, iDIMV = 0) */ - magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); - /* there is a sync at the end of magma_grad_3d_device */ - - /* read U (idim = 1 for dU, iDIM = 0 for rU) -- - there is a sync at the end of this function */ - readU_3d(dU + (1 * dstrdU), cstrdU, rU, sTmp, tx); - /* then second call (iDIM = 1, iDIMU = 0, iDIMV = 0) */ - magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); - /* there is a sync at the end of magma_grad_3d_device */ - - /* read U (idim = 2 for dU, iDIM = 0 for rU) -- - there is a sync at the end of this function */ - readU_3d(dU + (2 * dstrdU), cstrdU, rU, sTmp, tx); - /* then third call (iDIM = 2, iDIMU = 0, iDIMV = 0) */ - magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); - /* there is a sync at the end of magma_grad_3d_device */ - - // write V - writeV_3d(dV + (0 * dstrdV), cstrdV, rV, tx); -} diff --git a/include/ceed/jit-source/magma/grad-nontensor.h b/include/ceed/jit-source/magma/grad-nontensor.h deleted file mode 100644 index 164f2c75a5..0000000000 --- a/include/ceed/jit-source/magma/grad-nontensor.h +++ /dev/null @@ -1,120 +0,0 
@@ -// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#ifndef CEED_MAGMA_GRAD_NONTENSOR_H -#define CEED_MAGMA_GRAD_NONTENSOR_H - -//////////////////////////////////////////////////////////////////////////////// -// Different A's and C's, same B -extern "C" __global__ __launch_bounds__(Q *MAGMA_NONTENSOR_BASIS_NTCOL(Q)) void magma_grad_nontensor_n(magma_trans_t transA, magma_trans_t transB, - int n, CeedScalar const *dA, int ldda, - CeedScalar const *dB, int lddb, CeedScalar *dC, - int lddc) { - MAGMA_DEVICE_SHARED(CeedScalar, shared_data); - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int bx = blockIdx.x; - const int id = bx * blockDim.y + ty; - const int nblocks = MAGMA_CEILDIV(n, NB_GRAD_N); - const int myn = min(NB_GRAD_N, n - id * NB_GRAD_N); - - const double alpha = MAGMA_D_ONE; - - dB += id * NB_GRAD_N * lddb; - dC += id * NB_GRAD_N * lddc; - - // A is P x Q - const int slda = P; - const int sldb = P; - CeedScalar *sA = (CeedScalar *)(shared_data); - CeedScalar *sB = sA + Q * P; - sB += ty * sldb * NB_GRAD_N; - - // read B once for all C's - if (id < nblocks) { - read_B_g2s_1D_nosync(tx, myn, dB, lddb, sB, sldb); - } - __syncthreads(); - - // unrolling this loop yields dramatic performance drop using hipcc - // let the compiler decide (no pragma unroll) - for (int idim = 0; idim < DIM; idim++) { - // read A (P x Q) using all threads - CeedScalar rA[P] = {MAGMA_D_ZERO}; - read_A_trans_g2r_1D_nosync(tx, ty, dA, ldda, sA, slda, rA); - - __syncthreads(); - - // init rC - CeedScalar rC[NB_GRAD_N] = {MAGMA_D_ZERO}; - if (id < nblocks) { - mul_rAsBrC_1D_nosync(tx, alpha, rA, sB, sldb, rC); - } - __syncthreads(); - - if (id < nblocks) { - write_C_r2g_1D_nosync(tx, myn, rC, dC, lddc); - } - - dA += Q * P; - dC += Q * n; - } -} - -//////////////////////////////////////////////////////////////////////////////// -// Different A's and B's, same C -extern "C" __global__ __launch_bounds__(P *MAGMA_NONTENSOR_BASIS_NTCOL(P)) void magma_grad_nontensor_t(magma_trans_t transA, magma_trans_t transB, - int n, CeedScalar const *dA, int ldda, - CeedScalar const *dB, int lddb, CeedScalar *dC, - int lddc) { - MAGMA_DEVICE_SHARED(CeedScalar, shared_data); - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int bx = blockIdx.x; - const int id = bx * blockDim.y + ty; - const int nblocks = MAGMA_CEILDIV(n, NB_GRAD_T); - const int myn = min(NB_GRAD_T, n - id * NB_GRAD_T); - if (id >= nblocks) return; - - dB += id * NB_GRAD_T * lddb; - dC += id * NB_GRAD_T * lddc; - - const double alpha = MAGMA_D_ONE; - - // A is P x Q - const int sldb = Q; - CeedScalar *sB = (CeedScalar *)(shared_data); - sB += ty * sldb * NB_GRAD_T; - - // init rC - CeedScalar rC[NB_GRAD_T] = {MAGMA_D_ZERO}; - - CeedScalar rA[Q] = {MAGMA_D_ZERO}; - - // unrolling this loop yields dramatic performance drop using hipcc - // let the compiler decide (no pragma unroll) - for (int idim = 0; idim < DIM; idim++) { - __syncthreads(); - // read A - read_A_notrans_g2r_1D_nosync(tx, dA, ldda, NULL, 0, rA); - - // read B - read_B_g2s_1D_nosync(tx, myn, dB, lddb, sB, sldb); - __syncthreads(); - - mul_rAsBrC_1D_nosync(tx, alpha, rA, sB, sldb, rC); - - // advance A and B - dA += P * Q; - dB += Q * n; - } - write_C_r2g_1D_nosync(tx, myn, rC, dC, lddc); -} - -#endif // 
CEED_MAGMA_GRAD_NONTENSOR_H diff --git a/include/ceed/jit-source/magma/interp-1d.h b/include/ceed/jit-source/magma/interp-1d.h deleted file mode 100644 index 3ca89e3c92..0000000000 --- a/include/ceed/jit-source/magma/interp-1d.h +++ /dev/null @@ -1,129 +0,0 @@ -// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -// macros to abstract access of shared memory and reg. file -#define sT(i, j) sT[(j)*P_ + (i)] - -////////////////////////////////////////////////////////////////////////////////////////// -// interp basis action (1D) -template -static __device__ __inline__ void magma_interp_1d_device(const T *sT, magma_trans_t transT, T *sU[NCOMP_], T *sV[NCOMP_], const int tx) { - // Assumptions - // 1. 1D threads of size max(P_,Q_) - // 2. sU[i] is 1xP_: in shared memory - // 3. sV[i] is 1xQ_: in shared memory - // 4. P_roduct per component is one row (1xP_) times T matrix (P_xQ_) => one row (1xQ_) - // 5. Each thread computes one entry in sV[i] - // 6. Must sync before and after call - // 7. Note that the layout for U and V is different from 2D/3D problem - - T rv; - if (tx < Q_) { - for (int icomp = 0; icomp < NCOMP_; icomp++) { - rv = (transT == MagmaTrans) ? sV[icomp][tx] : 0.0; - for (int i = 0; i < P_; i++) { - rv += sU[icomp][i] * sT(i, tx); // sT[tx * P_ + i]; - } - sV[icomp][tx] = rv; - } - } -} - -////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ, MAGMA_MAXTHREADS_1D)) __global__ - void magma_interpn_1d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV, - const int cstrdV, const int nelem) { - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaNoTrans; - - if (elem_id >= nelem) return; - - CeedScalar *sU[NCOMP]; - CeedScalar *sV[NCOMP]; - - // shift global memory pointers by elem stride - dU += elem_id * estrdU; - dV += elem_id * estrdV; - - // assign shared memory pointers - CeedScalar *sT = (CeedScalar *)(shared_data); - CeedScalar *sW = sT + P * Q; - sU[0] = sW + ty * NCOMP * (P + Q); - sV[0] = sU[0] + (NCOMP * 1 * P); - for (int icomp = 1; icomp < NCOMP; icomp++) { - sU[icomp] = sU[icomp - 1] + (1 * P); - sV[icomp] = sV[icomp - 1] + (1 * Q); - } - - // read T - if (ty == 0) { - dread_T_gm2sm(tx, transT, dT, sT); - } - - // read U - read_1d(dU, cstrdU, sU, tx); - - __syncthreads(); - magma_interp_1d_device(sT, transT, sU, sV, tx); - __syncthreads(); - - // write V - write_1d(sV, dV, cstrdV, tx); -} - -////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ, MAGMA_MAXTHREADS_1D)) __global__ - void magma_interpt_1d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV, - const int cstrdV, const int nelem) { - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaTrans; - - if (elem_id >= nelem) return; - - CeedScalar *sU[NCOMP]; - CeedScalar *sV[NCOMP]; - - // shift 
global memory pointers by elem stride - dU += elem_id * estrdU; - dV += elem_id * estrdV; - - // assign shared memory pointers - CeedScalar *sT = (CeedScalar *)(shared_data); - CeedScalar *sW = sT + Q * P; - sU[0] = sW + ty * NCOMP * (Q + P); - sV[0] = sU[0] + (NCOMP * 1 * Q); - for (int icomp = 1; icomp < NCOMP; icomp++) { - sU[icomp] = sU[icomp - 1] + (1 * Q); - sV[icomp] = sV[icomp - 1] + (1 * P); - } - - // read T - if (ty == 0) { - dread_T_gm2sm(tx, transT, dT, sT); - } - - // read U - read_1d(dU, cstrdU, sU, tx); - - // read V - read_1d(dV, cstrdV, sV, tx); - - __syncthreads(); - magma_interp_1d_device(sT, transT, sU, sV, tx); - __syncthreads(); - - // write V - write_1d(sV, dV, cstrdV, tx); -} diff --git a/include/ceed/jit-source/magma/interp-2d.h b/include/ceed/jit-source/magma/interp-2d.h deleted file mode 100644 index 901128baa2..0000000000 --- a/include/ceed/jit-source/magma/interp-2d.h +++ /dev/null @@ -1,146 +0,0 @@ -// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -// macros to abstract access of shared memory and reg. file -#define sT(i, j) sT[(j)*P_ + (i)] -#define sTmp(i, j, ldw) sTmp[(j) * (ldw) + (i)] - -////////////////////////////////////////////////////////////////////////////////////////// -// interp basis action (2D) -template -static __device__ __inline__ void magma_interp_2d_device(const T *sT, magma_trans_t transT, T rU[DIM_U][NCOMP_][rUsize], T rV[DIM_V][NCOMP_][rVsize], - const int tx, T rTmp, T *swork) { - // Assumptions - // 1. 1D threads of size max(P_,Q_) - // 2. input: rU[DIM_U x NCOMP_ x rUsize] in registers (per thread) - // 3. output: rV[DIM_V x NCOMP_ x rVsize] in registers (per thread) - // 4. Two products per component - // 4.1 Batch P_ of (1xP_) matrices times (P_xQ_) matrix => Batch P_ of (1xQ_) matrices - // 4.2 Batch 1 of (Q_xP_) matrix times (P_xQ_) matrix => (Q_xQ_) matrix - // 5. Each thread computes one row of the output of each product - // 6. 
Sync is recommended before and after the call - - for (int icomp = 0; icomp < NCOMP_; icomp++) { - // 1st product -- Batch P_ of (1xP_) matrices [reg] x (P_xQ_) [shmem] => Batch P_ of (1xQ_) matrices - // the batch output P_ x (1xQ_) is written on the fly to shmem - if (tx < P_) { - const int batchid = tx; - const int sld = 1; - T *sTmp = swork + batchid * (1 * Q_); - for (int j = 0; j < Q_; j++) { - rTmp = 0.0; - for (int i = 0; i < P_; i++) { - rTmp += rU[0][icomp][i] * sT(i, j); - } - sTmp(0, j, sld) = rTmp; - } - } // end of: if (tx < P_) - __syncthreads(); - - // 2nd product -- Batch 1 of a (Q_xP_) matrix [shmem] x (P_xQ_) [shmem] => (Q_xQ_) matrix [reg] - if (tx < Q_) { - const int batchid = 0; - const int sld = Q_; - T *sTmp = swork + batchid * (Q_ * P_); - for (int j = 0; j < Q_; j++) { - rTmp = 0.0; - for (int i = 0; i < P_; i++) { - rTmp += sTmp(tx, i, sld) * sT(i, j); - } - rV[0][icomp][j] += rTmp; - } - } - __syncthreads(); - } -} - -////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ, MAGMA_MAXTHREADS_2D)) __global__ - void magma_interpn_2d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV, - const int cstrdV, const int nelem) { - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaNoTrans; - - if (elem_id >= nelem) return; - - CeedScalar rU[1][NCOMP][P] = {0.0}; // for a non fused operator DIM is always 1 - CeedScalar rV[1][NCOMP][Q] = {0.0}; // for a non fused operator DIM is always 1 - CeedScalar rTmp = 0.0; - - // shift global memory pointers by elem stride - dU += elem_id * estrdU; - dV += elem_id * estrdV; - - // assign shared memory pointers - CeedScalar *sT = (CeedScalar *)(shared_data); - CeedScalar *sTmp = sT + P * Q; - sTmp += ty * (P * MAXPQ); - - // read T - if (ty == 0) { - dread_T_gm2sm(tx, transT, dT, sT); - } - - // read U -- there is a sync at the end of this function - readU_2d(dU, cstrdU, rU, sTmp, tx); - - // no sync needed here -- readU_2d already syncs at the end - magma_interp_2d_device(sT, transT, rU, rV, tx, rTmp, sTmp); - __syncthreads(); - - // write V - writeV_2d(dV, cstrdV, rV, tx); -} - -////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ, MAGMA_MAXTHREADS_2D)) __global__ - void magma_interpt_2d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV, - const int cstrdV, const int nelem) { - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaTrans; - - if (elem_id >= nelem) return; - - CeedScalar rU[1][NCOMP][Q] = {0.0}; // for a non fused operator DIM is always 1 - CeedScalar rV[1][NCOMP][P] = {0.0}; // for a non fused operator DIM is always 1 - CeedScalar rTmp = 0.0; - - // shift global memory pointers by elem stride - dU += elem_id * estrdU; - dV += elem_id * estrdV; - - // assign shared memory pointers - CeedScalar *sT = (CeedScalar *)(shared_data); - CeedScalar *sTmp = sT + Q * P; - sTmp += ty * (Q * MAXPQ); - - // read T - if (ty == 0) { - dread_T_gm2sm(tx, transT, dT, sT); - } - - // read V - readV_2d(dV, cstrdV, rV, tx); - - // read U -- there is a 
sync at the end of this function - readU_2d(dU, cstrdU, rU, sTmp, tx); - - // no sync needed here -- readU_2d already syncs at the end - magma_interp_2d_device(sT, transT, rU, rV, tx, rTmp, sTmp); - __syncthreads(); - - // write V - writeV_2d(dV, cstrdV, rV, tx); -} diff --git a/include/ceed/jit-source/magma/interp-3d.h b/include/ceed/jit-source/magma/interp-3d.h deleted file mode 100644 index a886910a3d..0000000000 --- a/include/ceed/jit-source/magma/interp-3d.h +++ /dev/null @@ -1,174 +0,0 @@ -// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -// macros to abstract access of shared memory and reg. file -#define sT(i, j) sT[(j)*P_ + (i)] -#define sTmp(i, j, ldw) sTmp[(j) * (ldw) + (i)] - -////////////////////////////////////////////////////////////////////////////////////////// -// interp basis action (3D) -template -static __device__ __inline__ void magma_interp_3d_device(const T *sT, magma_trans_t transT, T rU[DIM_U][NCOMP_][rUsize], T rV[DIM_V][NCOMP_][rVsize], - const int tx, T rTmp[Q_], T *swork) { - // Assumptions - // 1. 1D threads of size max(P_,Q_)^2 - // 2. input: rU[DIM_U x NCOMP_ x rUsize] in registers (per thread) - // 3. output: rV[DIM_V x NCOMP_ x rVsize] in registers (per thread) - // 4. Three products per component - // 4.1 Batch P_^2 of (1xP_) matrices times (P_xQ_) matrix => Batch P_^2 of (1xQ_) matrices - // 4.2 Batch P_ of (Q_xP_) matrices times (P_xQ_) matrix => Batch P_ of (Q_xQ_) matrices - // 4.3 Batch 1 of (Q_^2xP_) matrix times (P_xQ_) matrix => (Q_^2xQ_) matrix - // 5. Each thread computes one row of the output of each product - // 6. 
Sync is recommended before and after the call - - for (int icomp = 0; icomp < NCOMP_; icomp++) { - // Batch P_^2 of (1xP_) matrices [reg] times (P_xQ_) matrix [shmem] => Batch P_^2 of (1xQ_) matrices [shmem] - if (tx < (P_ * P_)) { - const int batchid = tx; - const int sld = 1; - T *sTmp = swork + batchid * (1 * Q_); - for (int j = 0; j < Q_; j++) { - rTmp[0] = 0.0; - for (int i = 0; i < P_; i++) { - rTmp[0] += rU[0][icomp][i] * sT(i, j); - } - sTmp(0, j, sld) = rTmp[0]; - } - } // end of: if (tx < P_*P_) - __syncthreads(); - - // Batch P_ of (Q_xP_) matrices [shmem] times (P_xQ_) matrix [shmem] => Batch P_ of (Q_xQ_) matrices [reg] - if (tx < (P_ * Q_)) { - const int batchid = tx / Q_; - const int tx_ = tx % Q_; - const int sld = Q_; - T *sTmp = swork + batchid * (Q_ * P_); // sTmp is input - for (int j = 0; j < Q_; j++) { - rTmp[j] = 0.0; - for (int i = 0; i < P_; i++) { - rTmp[j] += sTmp(tx_, i, sld) * sT(i, j); - } - } - } - __syncthreads(); - - // write rTmp[] into shmem as batch P_ of Q_xQ_ matrices - if (tx < (P_ * Q_)) { - const int batchid = tx / Q_; - const int tx_ = tx % Q_; - const int sld = Q_; - T *sTmp = swork + batchid * (Q_ * Q_); - for (int j = 0; j < Q_; j++) { - sTmp(tx_, j, sld) = rTmp[j]; - } - } - __syncthreads(); - - // Batch 1 of (Q_^2xP_) matrices [shmem] times (P_xQ_) matrix [shmem] => Batch 1 of (Q_^2xQ_) matrices [reg] - if (tx < (Q_ * Q_)) { - // No need to declare batchid = (tx / Q_^2) = always zero - // No need to declare tx_ = (tx_ % Q_^2) = always tx - const int sld = Q_ * Q_; - T *sTmp = swork; - for (int j = 0; j < Q_; j++) { - rTmp[0] = 0.0; - for (int i = 0; i < P_; i++) { - rTmp[0] += sTmp(tx, i, sld) * sT(i, j); - } - rV[0][icomp][j] += rTmp[0]; - } - } - __syncthreads(); - } -} - -////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ *MAXPQ, MAGMA_MAXTHREADS_3D)) __global__ - void magma_interpn_3d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV, - const int cstrdV, const int nelem) { - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaNoTrans; - - if (elem_id >= nelem) return; - - CeedScalar rU[1][NCOMP][P] = {0.0}; // for a non fused operator DIM is always 1 - CeedScalar rV[1][NCOMP][Q] = {0.0}; // for a non fused operator DIM is always 1 - CeedScalar rTmp[Q] = {0.0}; - - // shift global memory pointers by elem stride - dU += elem_id * estrdU; - dV += elem_id * estrdV; - - // assign shared memory pointers - CeedScalar *sT = (CeedScalar *)(shared_data); - CeedScalar *sTmp = sT + P * Q; - sTmp += ty * (max(P * P * MAXPQ, P * Q * Q)); - - // read T - if (ty == 0) { - dread_T_gm2sm(tx, transT, dT, sT); - } - - // read U (idim = 0 for dU, iDIM = 0 for rU, u_dimstride is always 0) - readU_3d(dU, cstrdU, rU, sTmp, tx); - // there is a sync at the end of this function - - magma_interp_3d_device(sT, transT, rU, rV, tx, rTmp, sTmp); - __syncthreads(); - - // write V - writeV_3d(dV, cstrdV, rV, tx); -} - -////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ *MAXPQ, MAGMA_MAXTHREADS_3D)) __global__ - void magma_interpt_3d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV, - const int cstrdV, const int 
nelem) { - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaTrans; - - if (elem_id >= nelem) return; - - CeedScalar rU[1][NCOMP][Q] = {0.0}; // for a non fused operator DIM is always 1 - CeedScalar rV[1][NCOMP][P] = {0.0}; // for a non fused operator DIM is always 1 - CeedScalar rTmp[P] = {0.0}; - - // shift global memory pointers by elem stride - dU += elem_id * estrdU; - dV += elem_id * estrdV; - - // assign shared memory pointers - CeedScalar *sT = (CeedScalar *)(shared_data); - CeedScalar *sTmp = sT + Q * P; - sTmp += ty * (max(Q * Q * MAXPQ, Q * P * P)); - - // read T - if (ty == 0) { - dread_T_gm2sm(tx, transT, dT, sT); - } - - // read V - readV_3d(dV, cstrdV, rV, tx); - - // read U (idim = 0 for dU, iDIM = 0 for rU, u_dimstride is always 0) - readU_3d(dU, cstrdU, rU, sTmp, tx); - // there is a sync at the end of this function - - magma_interp_3d_device(sT, transT, rU, rV, tx, rTmp, sTmp); - __syncthreads(); - - // write V - writeV_3d(dV, cstrdV, rV, tx); -} diff --git a/include/ceed/jit-source/magma/interp-nontensor.h b/include/ceed/jit-source/magma/interp-nontensor.h deleted file mode 100644 index e715986a74..0000000000 --- a/include/ceed/jit-source/magma/interp-nontensor.h +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#ifndef CEED_MAGMA_INTERP_NONTENSOR_H -#define CEED_MAGMA_INTERP_NONTENSOR_H - -//////////////////////////////////////////////////////////////////////////////// -extern "C" __global__ __launch_bounds__(Q *MAGMA_NONTENSOR_BASIS_NTCOL(Q)) void magma_interp_nontensor_n( - magma_trans_t transA, magma_trans_t transB, int n, const CeedScalar alpha, CeedScalar const *dA, int ldda, CeedScalar const *dB, int lddb, - const CeedScalar beta, CeedScalar *dC, int lddc) { - MAGMA_DEVICE_SHARED(CeedScalar, shared_data); - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int bx = blockIdx.x; - const int id = bx * blockDim.y + ty; - const int nblocks = MAGMA_CEILDIV(n, NB_INTERP_N); - const int myn = min(NB_INTERP_N, n - id * NB_INTERP_N); - - // const bool irrblock = ( myn != NB_INTERP_N ); - - dB += id * NB_INTERP_N * lddb; - dC += id * NB_INTERP_N * lddc; - - const int slda = P; - const int sldb = P; - CeedScalar *sA = (CeedScalar *)(shared_data); - CeedScalar *sB = sA; - sB += ty * sldb * NB_INTERP_N; - - // read A using all threads - CeedScalar rA[P] = {MAGMA_D_ZERO}; - read_A_trans_g2r_1D_nosync(tx, ty, dA, ldda, sA, slda, rA); - __syncthreads(); - - // terminate threads with no work - if (id >= nblocks) return; - - // init rC - CeedScalar rC[NB_INTERP_N] = {MAGMA_D_ZERO}; - read_C_g2r_1D_nosync(tx, myn, dC, lddc, beta, rC); - - // read B - read_B_g2s_1D_nosync(tx, myn, dB, lddb, sB, sldb); - __syncthreads(); - - mul_rAsBrC_1D_nosync(tx, alpha, rA, sB, sldb, rC); - write_C_r2g_1D_nosync(tx, myn, rC, dC, lddc); -} - -//////////////////////////////////////////////////////////////////////////////// -extern "C" __global__ __launch_bounds__(P *MAGMA_NONTENSOR_BASIS_NTCOL(P)) void magma_interp_nontensor_t( - magma_trans_t transA, magma_trans_t transB, int n, const CeedScalar alpha, CeedScalar const *dA, int ldda, CeedScalar const *dB, int lddb, - const 
CeedScalar beta, CeedScalar *dC, int lddc) { - MAGMA_DEVICE_SHARED(CeedScalar, shared_data); - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int bx = blockIdx.x; - const int id = bx * blockDim.y + ty; - const int nblocks = MAGMA_CEILDIV(n, NB_INTERP_T); - const int myn = min(NB_INTERP_T, n - id * NB_INTERP_T); - if (id >= nblocks) return; - - dB += id * NB_INTERP_T * lddb; - dC += id * NB_INTERP_T * lddc; - - // A is P x Q - const int sldb = Q; - CeedScalar *sB = (CeedScalar *)(shared_data); - sB += ty * sldb * NB_INTERP_T; - - // init rC - CeedScalar rC[NB_INTERP_T] = {MAGMA_D_ZERO}; - if (beta != MAGMA_D_ZERO) { - read_C_g2r_1D_nosync(tx, myn, dC, lddc, beta, rC); - } - - // read A - CeedScalar rA[Q] = {MAGMA_D_ZERO}; - read_A_notrans_g2r_1D_nosync(tx, dA, ldda, NULL, 0, rA); - - // read B - read_B_g2s_1D_nosync(tx, myn, dB, lddb, sB, sldb); - __syncthreads(); - - mul_rAsBrC_1D_nosync(tx, alpha, rA, sB, sldb, rC); - - write_C_r2g_1D_nosync(tx, myn, rC, dC, lddc); -} - -#endif // CEED_MAGMA_INTERP_NONTENSOR_H \ No newline at end of file diff --git a/include/ceed/jit-source/magma/magma-basis-grad-1d.h b/include/ceed/jit-source/magma/magma-basis-grad-1d.h new file mode 100644 index 0000000000..89a112115e --- /dev/null +++ b/include/ceed/jit-source/magma/magma-basis-grad-1d.h @@ -0,0 +1,138 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for MAGMA tensor basis gradient in 1D +#ifndef CEED_MAGMA_BASIS_GRAD_1D_H +#define CEED_MAGMA_BASIS_GRAD_1D_H + +#include "magma-common-tensor.h" + +// macros to abstract access of shared memory and reg. file +#define sT(i, j) sT[(j)*P + (i)] + +////////////////////////////////////////////////////////////////////////////////////////// +// grad basis action (1D) +template +static __device__ __inline__ void magma_grad_1d_device(const T *sT, magma_trans_t transT, T *sU[NUM_COMP], T *sV[NUM_COMP], const int tx) { + // Assumptions + // 1. 1D threads of size max(P,Q) + // 2. sU[i] is 1xP: in shared memory + // 3. sV[i] is 1xQ: in shared memory + // 4. Product per component is one row (1xP) times T matrix (PxQ) => one row (1xQ) + // 5. Each thread computes one entry in sV[i] + // 6. Must sync before and after call + // 7. Note that the layout for U and V is different from 2D/3D problem + + T rv; + if (tx < Q) { + for (int comp = 0; comp < NUM_COMP; comp++) { + rv = (transT == MagmaTrans) ?
sV[comp][tx] : 0.0; + for (int i = 0; i < P; i++) { + rv += sU[comp][i] * sT(i, tx); + } + sV[comp][tx] = rv; + } + } +} + +////////////////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_1D)) __global__ + void magma_gradn_1d_kernel(const CeedScalar *dTinterp, const CeedScalar *dTgrad, const CeedScalar *dU, const int estrdU, const int cstrdU, + const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + magma_trans_t transT = MagmaNoTrans; + + if (elem_id >= nelem) return; + + CeedScalar *sU[BASIS_NUM_COMP]; + CeedScalar *sV[BASIS_NUM_COMP]; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sT = (CeedScalar *)shared_data; + CeedScalar *sW = sT + BASIS_P * BASIS_Q; + sU[0] = sW + ty * BASIS_NUM_COMP * (BASIS_P + BASIS_Q); + sV[0] = sU[0] + (BASIS_NUM_COMP * 1 * BASIS_P); + for (int comp = 1; comp < BASIS_NUM_COMP; comp++) { + sU[comp] = sU[comp - 1] + (1 * BASIS_P); + sV[comp] = sV[comp - 1] + (1 * BASIS_Q); + } + + // read T + if (ty == 0) { + dread_T_gm2sm(tx, transT, dTgrad, sT); + } + + // read U + read_1d(dU, cstrdU, sU, tx); + + __syncthreads(); + magma_grad_1d_device(sT, transT, sU, sV, tx); + __syncthreads(); + + // write V + write_1d(sV, dV, cstrdV, tx); +} + +////////////////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_1D)) __global__ + void magma_gradt_1d_kernel(const CeedScalar *dTinterp, const CeedScalar *dTgrad, const CeedScalar *dU, const int estrdU, const int cstrdU, + const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + magma_trans_t transT = MagmaTrans; + + if (elem_id >= nelem) return; + + CeedScalar *sU[BASIS_NUM_COMP]; + CeedScalar *sV[BASIS_NUM_COMP]; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sT = (CeedScalar *)shared_data; + CeedScalar *sW = sT + BASIS_Q * BASIS_P; + sU[0] = sW + ty * BASIS_NUM_COMP * (BASIS_Q + BASIS_P); + sV[0] = sU[0] + (BASIS_NUM_COMP * 1 * BASIS_Q); + for (int comp = 1; comp < BASIS_NUM_COMP; comp++) { + sU[comp] = sU[comp - 1] + (1 * BASIS_Q); + sV[comp] = sV[comp - 1] + (1 * BASIS_P); + } + + // read T + if (ty == 0) { + dread_T_gm2sm(tx, transT, dTgrad, sT); + } + + // read U + read_1d(dU, cstrdU, sU, tx); + + // read V + read_1d(dV, cstrdV, sV, tx); + + __syncthreads(); + magma_grad_1d_device(sT, transT, sU, sV, tx); + __syncthreads(); + + // write V + write_1d(sV, dV, cstrdV, tx); +} + +#endif // CEED_MAGMA_BASIS_GRAD_1D_H diff --git a/include/ceed/jit-source/magma/magma-basis-grad-2d.h b/include/ceed/jit-source/magma/magma-basis-grad-2d.h new file mode 100644 index 0000000000..042e41b046 --- /dev/null +++ b/include/ceed/jit-source/magma/magma-basis-grad-2d.h @@ -0,0 +1,189 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 
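// For reference, a minimal host-side sketch of the contraction that magma_grad_1d_device
// (above) performs per element; this is an illustration, not backend code. Assumptions:
// double precision in place of the templated scalar type, plain arrays in place of shared
// memory, and a column-major T with leading dimension P so that T(i, q) = T[q * P + i],
// matching the sT(i, j) macro. In the actual transposed kernels, P and Q are additionally
// swapped at instantiation and T is read transposed; only the accumulate is shown here.
// The name ref_grad_1d is illustrative.

// V (1 x Q per component) = U (1 x P per component) * T (P x Q);
// in transposed mode the product accumulates into the existing V values
static void ref_grad_1d(const double *T, const double *U, double *V, int P, int Q, int num_comp, bool transpose) {
  for (int comp = 0; comp < num_comp; comp++) {
    for (int q = 0; q < Q; q++) {
      double rv = transpose ? V[comp * Q + q] : 0.0;
      for (int i = 0; i < P; i++) rv += U[comp * P + i] * T[q * P + i];
      V[comp * Q + q] = rv;
    }
  }
}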
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for MAGMA tensor basis gradient in 2D +#ifndef CEED_MAGMA_BASIS_GRAD_2D_H +#define CEED_MAGMA_BASIS_GRAD_2D_H + +#include "magma-common-tensor.h" + +// macros to abstract access of shared memory and reg. file +#define sT(i, j) sT[(j)*P + (i)] +#define sTmp(i, j, ldw) sTmp[(j) * (ldw) + (i)] + +////////////////////////////////////////////////////////////////////////////////////////// +// grad basis action (2D) +// This function is called two times at a higher level for 2D +// DIM_U -- for the size of rU[DIM_U * NUM_COMP * MAX_P_Q] +// DIM_V -- for the size of rV[DIM_V * NUM_COMP * MAX_P_Q] +// i_DIM -- the index of the outermost loop over dimensions in grad +// i_DIM_U -- which dim index of rU is accessed (always 0 for notrans, 0 or 1 for trans) +// i_DIM_V -- which dim index of rV is accessed (0 or 1 for notrans, always 0 for trans) +// the scalar beta is used to specify whether to accumulate to rV, or overwrite it +template +static __device__ __inline__ void magma_grad_2d_device(const T *sTinterp, const T *sTgrad, T rU[DIM_U][NUM_COMP][rU_SIZE], + T rV[DIM_V][NUM_COMP][rV_SIZE], T beta, const int tx, T rTmp, T *swork) { + // Assumptions + // 0. This device routine applies grad for one dim only (i_DIM), so it should be called twice for 2D + // 1. 1D threads of size max(P,Q) + // 2. input: rU[DIM_U x NUM_COMP x P] in registers (per thread) + // 3. output: rV[DIM_V x NUM_COMP x Q] in registers (per thread) + // 4. Two products for each (dim,component) pair + // 4.1 Batch P of (1xP) matrices times (PxQ) matrix => Batch P of (1xQ) matrices + // 4.2 Batch 1 of (QxP) matrix times (PxQ) matrix => (QxQ) matrix + // 5. Each thread computes one row of the output of each product + // 6. Sync is recommended before and after the call + + for (int comp = 0; comp < NUM_COMP; comp++) { + // 1st product -- Batch P of (1xP) matrices [reg] x (PxQ) [shmem] => Batch P of (1xQ) matrices + // the batch output P x (1xQ) is written on the fly to shmem + if (tx < P) { + const int batchid = tx; + const int sld = 1; + const T *sT = (i_DIM == 0) ? sTgrad : sTinterp; + T *sTmp = swork + batchid * (1 * Q); + for (int j = 0; j < Q; j++) { + rTmp = 0.0; + for (int i = 0; i < P; i++) { + rTmp += rU[i_DIM_U][comp][i] * sT(i, j); + } + sTmp(0, j, sld) = rTmp; + } + } // end of: if (tx < P) + __syncthreads(); + + // 2nd product -- Batch 1 of a (QxP) matrix [shmem] x (PxQ) [shmem] => (QxQ) matrix [reg] + if (tx < Q) { + const int batchid = 0; + const int sld = Q; + const T *sT = (i_DIM == 1) ?
sTgrad : sTinterp; + T *sTmp = swork + batchid * (Q * P); + for (int j = 0; j < Q; j++) { + rTmp = 0.0; + for (int i = 0; i < P; i++) { + rTmp += sTmp(tx, i, sld) * sT(i, j); + } + rV[i_DIM_V][comp][j] *= beta; + rV[i_DIM_V][comp][j] += rTmp; + } + } + __syncthreads(); + } // loop over NUM_COMP +} + +////////////////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_2D)) __global__ + void magma_gradn_2d_kernel(const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, const CeedScalar *dU, const int estrdU, const int cstrdU, + const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + magma_trans_t transT = MagmaNoTrans; + + if (elem_id >= nelem) return; + + CeedScalar rU[1][BASIS_NUM_COMP][BASIS_P] = {0.0}; // here DIM_U = 1, but might be different for a fused operator + CeedScalar rV[1][BASIS_NUM_COMP][BASIS_Q] = {0.0}; // here DIM_V = 1, but might be different for a fused operator + CeedScalar rTmp = 0.0; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sTinterp = (CeedScalar *)shared_data; + CeedScalar *sTgrad = sTinterp + BASIS_P * BASIS_Q; + CeedScalar *sTmp = sTgrad + BASIS_P * BASIS_Q; + sTmp += ty * (BASIS_P * BASIS_MAX_P_Q); + + // read T + if (ty == 0) { + dread_T_gm2sm(tx, transT, dinterp1d, sTinterp); + dread_T_gm2sm(tx, transT, dgrad1d, sTgrad); + } + + // No need to read V ( required only in transposed grad ) + const CeedScalar beta = 0.0; + + /* read U (idim = 0 for dU, i_DIM = 0 for rU) -- + there is a sync at the end of this function */ + readU_2d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); + + /* first call (i_DIM = 0, i_DIM_U = 0, i_DIM_V = 0) -- + output from rV[0][][] into dV (idim = 0) */ + magma_grad_2d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_2d_device */ + writeV_2d(dV + (0 * dstrdV), cstrdV, rV, tx); + + /* second call (i_DIM = 1, i_DIM_U = 0, i_DIM_V = 0) -- + output from rV[0][][] into dV (idim = 1) */ + magma_grad_2d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_2d_device */ + writeV_2d(dV + (1 * dstrdV), cstrdV, rV, tx); +} + +////////////////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_2D)) __global__ + void magma_gradt_2d_kernel(const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, const CeedScalar *dU, const int estrdU, const int cstrdU, + const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + magma_trans_t transT = MagmaTrans; + + if (elem_id >= nelem) return; + + CeedScalar rU[1][BASIS_NUM_COMP][BASIS_Q] = {0.0}; // here DIM_U = 1, but might be different for a fused operator + CeedScalar rV[1][BASIS_NUM_COMP][BASIS_P] = {0.0}; // here DIM_V = 1, but might be different for a fused operator + CeedScalar rTmp = 0.0; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * 
estrdV; + + // assign shared memory pointers + CeedScalar *sTinterp = (CeedScalar *)shared_data; + CeedScalar *sTgrad = sTinterp + BASIS_Q * BASIS_P; + CeedScalar *sTmp = sTgrad + BASIS_Q * BASIS_P; + sTmp += ty * (BASIS_Q * BASIS_MAX_P_Q); + + // read T + if (ty == 0) { + dread_T_gm2sm(tx, transT, dinterp1d, sTinterp); + dread_T_gm2sm(tx, transT, dgrad1d, sTgrad); + } + __syncthreads(); + + /* read V (since this is transposed mode -- + idim = 0 for dV, i_DIM = 0 for rV) */ + const CeedScalar beta = 1.0; + readV_2d(dV + (0 * dstrdV), cstrdV, rV, tx); + + /* read U (idim = 0 for dU, i_DIM = 0 for rU) -- + there is a sync at the end of this function */ + readU_2d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); + /* first call (i_DIM = 0, i_DIM_U = 0, i_DIM_V = 0) */ + magma_grad_2d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_2d_device */ + + /* read U (idim = 1 for dU, i_DIM = 0 for rU) -- + there is a sync at the end of this function */ + readU_2d(dU + (1 * dstrdU), cstrdU, rU, sTmp, tx); + /* second call (i_DIM = 1, i_DIM_U = 0, i_DIM_V = 0) */ + magma_grad_2d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_2d_device */ + + // write V + writeV_2d(dV + (0 * dstrdV), cstrdV, rV, tx); +} + +#endif // CEED_MAGMA_BASIS_GRAD_2D_H diff --git a/include/ceed/jit-source/magma/magma-basis-grad-3d.h b/include/ceed/jit-source/magma/magma-basis-grad-3d.h new file mode 100644 index 0000000000..063ee7bc0d --- /dev/null +++ b/include/ceed/jit-source/magma/magma-basis-grad-3d.h @@ -0,0 +1,224 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for MAGMA tensor basis gradient in 3D +#ifndef CEED_MAGMA_BASIS_GRAD_3D_H +#define CEED_MAGMA_BASIS_GRAD_3D_H + +#include "magma-common-tensor.h" + +// macros to abstract access of shared memory and reg. file +#define sT(i, j) sT[(j)*P + (i)] +#define sTmp(i, j, ldw) sTmp[(j) * (ldw) + (i)] +#define sTmp2(i, j, ldw) sTmp2[(j) * (ldw) + (i)] + +////////////////////////////////////////////////////////////////////////////////////////// +// grad basis action (3D) +// This function is called three times at a higher level for 3D +// DIM_U -- for the size of rU[DIM_U * NUM_COMP * MAX_P_Q] +// DIM_V -- for the size of rV[DIM_V * NUM_COMP * MAX_P_Q] +// i_DIM -- the index of the outermost loop over dimensions in grad +// i_DIM_U -- which dim index of rU is accessed (always 0 for notrans, 0, 1, or 2 for trans) +// i_DIM_V -- which dim index of rV is accessed (0, 1, or 2 for notrans, always 0 for trans) +// the scalar beta is used to specify whether to accumulate to rV, or overwrite it +template +static __device__ __inline__ void magma_grad_3d_device(const T *sTinterp, const T *sTgrad, T rU[DIM_U][NUM_COMP][rU_SIZE], + T rV[DIM_V][NUM_COMP][rV_SIZE], T beta, const int tx, T rTmp, T *swork) { + // Assumptions + // 0. This device routine applies grad for one dim only (i_DIM), so it should be called three times for 3D + // 1. 1D threads of size max(P,Q)^2 + // 2. input: rU[DIM_U x NUM_COMP x rU_SIZE] in registers (per thread) + // 3. output: rV[DIM_V x NUM_COMP x rV_SIZE] in registers (per thread) + // 4.
Three products for each (dim,component) pair + // 4.1 Batch P^2 of (1xP) matrices times (PxQ) matrix => Batch P^2 of (1xQ) matrices + // 4.2 Batch P of (QxP) matrices times (PxQ) matrix => Batch P of (QxQ) matrices + // 4.3 Batch 1 of (Q^2xP) matrix times (PxQ) matrix => (Q^2xQ) matrix + // 5. Each thread computes one row of the output of each product + // 6. Sync is recommended before and after the call + + T *sW1 = swork; + T *sW2 = sW1 + P * P * Q; + for (int comp = 0; comp < NUM_COMP; comp++) { + // Batch P^2 of (1xP) matrices [reg] times (PxQ) matrix [shmem] => Batch P^2 of (1xQ) matrices [shmem] + if (tx < (P * P)) { + const int batchid = tx; + const int sld = 1; + const T *sT = (i_DIM == 0) ? sTgrad : sTinterp; + T *sTmp = sW1 + batchid * (1 * Q); + for (int j = 0; j < Q; j++) { + rTmp = 0.0; + for (int i = 0; i < P; i++) { + rTmp += rU[i_DIM_U][comp][i] * sT(i, j); + } + sTmp(0, j, sld) = rTmp; + } + } // end of: if (tx < P*P) + __syncthreads(); + + // Batch P of (QxP) matrices [shmem] times (PxQ) matrix [shmem] => Batch P of (QxQ) matrices [reg] + if (tx < (P * Q)) { + const int batchid = tx / Q; + const int tx_ = tx % Q; + const int sld = Q; + const T *sT = (i_DIM == 1) ? sTgrad : sTinterp; + T *sTmp = sW1 + batchid * (Q * P); // sTmp is input + T *sTmp2 = sW2 + batchid * (Q * Q); // sTmp2 is output + for (int j = 0; j < Q; j++) { + rTmp = 0.0; + for (int i = 0; i < P; i++) { + rTmp += sTmp(tx_, i, sld) * sT(i, j); + } + sTmp2(tx_, j, sld) = rTmp; + } + } + __syncthreads(); + + // Batch 1 of (Q^2xP) matrices [shmem] times (PxQ) matrix [shmem] => Batch 1 of (Q^2xQ) matrices [reg] + if (tx < (Q * Q)) { + // No need to declare batchid = (tx / Q^2) = always zero + // No need to declare tx_ = (tx_ % Q^2) = always tx + const int sld = Q * Q; + const T *sT = (i_DIM == 2) ?
sTgrad : sTinterp; + T *sTmp = sW2; // sTmp is input + for (int j = 0; j < Q; j++) { + rTmp = 0.0; + for (int i = 0; i < P; i++) { + rTmp += sTmp(tx, i, sld) * sT(i, j); + } + rV[i_DIM_V][comp][j] *= beta; + rV[i_DIM_V][comp][j] += rTmp; + } + } + __syncthreads(); + } // loop over NUM_COMP +} + +////////////////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MAGMA_MAXTHREADS_3D)) __global__ + void magma_gradn_3d_kernel(const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, const CeedScalar *dU, const int estrdU, const int cstrdU, + const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + magma_trans_t transT = MagmaNoTrans; + + if (elem_id >= nelem) return; + + CeedScalar rU[1][BASIS_NUM_COMP][BASIS_P] = {0.0}; // here DIM_U = 1, but might be different for a fused operator + CeedScalar rV[1][BASIS_NUM_COMP][BASIS_Q] = {0.0}; // here DIM_V = 1, but might be different for a fused operator + CeedScalar rTmp = 0.0; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sTinterp = (CeedScalar *)shared_data; + CeedScalar *sTgrad = sTinterp + BASIS_P * BASIS_Q; + CeedScalar *sTmp = sTgrad + BASIS_P * BASIS_Q; + sTmp += ty * (max(BASIS_P * BASIS_P * BASIS_P, (BASIS_P * BASIS_P * BASIS_Q) + (BASIS_P * BASIS_Q * BASIS_Q))); + + // read T + if (ty == 0) { + dread_T_gm2sm(tx, transT, dinterp1d, sTinterp); + dread_T_gm2sm(tx, transT, dgrad1d, sTgrad); + } + __syncthreads(); + + // No need to read V ( required only in transposed grad ) + const CeedScalar beta = 0.0; + + /* read U (idim = 0 for dU, i_DIM = 0 for rU) -- + there is a sync at the end of this function */ + readU_3d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); + + /* first call (i_DIM = 0, i_DIM_U = 0, i_DIM_V = 0) -- + output from rV[0][][] into dV (idim = 0) */ + magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_3d_device */ + writeV_3d(dV + (0 * dstrdV), cstrdV, rV, tx); + + /* second call (i_DIM = 1, i_DIM_U = 0, i_DIM_V = 0) -- + output from rV[0][][] into dV (idim = 1) */ + magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_3d_device */ + writeV_3d(dV + (1 * dstrdV), cstrdV, rV, tx); + + /* third call (i_DIM = 2, i_DIM_U = 0, i_DIM_V = 0) -- + output from rV[0][][] into dV (idim = 2) */ + magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_3d_device */ + writeV_3d(dV + (2 * dstrdV), cstrdV, rV, tx); +} + +////////////////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MAGMA_MAXTHREADS_3D)) __global__ + void magma_gradt_3d_kernel(const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, const CeedScalar *dU, const int estrdU, const int cstrdU, + const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + magma_trans_t transT = 
MagmaTrans; + + if (elem_id >= nelem) return; + + CeedScalar rU[1][BASIS_NUM_COMP][BASIS_Q] = {0.0}; // here DIM_U = 1, but might be different for a fused operator + CeedScalar rV[1][BASIS_NUM_COMP][BASIS_P] = {0.0}; // here DIM_V = 1, but might be different for a fused operator + CeedScalar rTmp = 0.0; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sTinterp = (CeedScalar *)shared_data; + CeedScalar *sTgrad = sTinterp + BASIS_Q * BASIS_P; + CeedScalar *sTmp = sTgrad + BASIS_Q * BASIS_P; + sTmp += ty * (max(BASIS_Q * BASIS_Q * BASIS_Q, (BASIS_Q * BASIS_Q * BASIS_P) + (BASIS_Q * BASIS_P * BASIS_P))); + + // read T + if (ty == 0) { + dread_T_gm2sm(tx, transT, dinterp1d, sTinterp); + dread_T_gm2sm(tx, transT, dgrad1d, sTgrad); + } + __syncthreads(); + + // read V (since this is transposed mode) + const CeedScalar beta = 1.0; + readV_3d(dV + (0 * dstrdV), cstrdV, rV, tx); + + /* read U (idim = 0 for dU, i_DIM = 0 for rU) -- + there is a sync at the end of this function */ + readU_3d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); + /* then first call (i_DIM = 0, i_DIM_U = 0, i_DIM_V = 0) */ + magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_3d_device */ + + /* read U (idim = 1 for dU, i_DIM = 0 for rU) -- + there is a sync at the end of this function */ + readU_3d(dU + (1 * dstrdU), cstrdU, rU, sTmp, tx); + /* then second call (i_DIM = 1, i_DIM_U = 0, i_DIM_V = 0) */ + magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_3d_device */ + + /* read U (idim = 2 for dU, i_DIM = 0 for rU) -- + there is a sync at the end of this function */ + readU_3d(dU + (2 * dstrdU), cstrdU, rU, sTmp, tx); + /* then third call (i_DIM = 2, i_DIM_U = 0, i_DIM_V = 0) */ + magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_3d_device */ + + // write V + writeV_3d(dV + (0 * dstrdV), cstrdV, rV, tx); +} + +#endif // CEED_MAGMA_BASIS_GRAD_3D_H diff --git a/include/ceed/jit-source/magma/magma-basis-grad-nontensor.h b/include/ceed/jit-source/magma/magma-basis-grad-nontensor.h new file mode 100644 index 0000000000..95bf548fc3 --- /dev/null +++ b/include/ceed/jit-source/magma/magma-basis-grad-nontensor.h @@ -0,0 +1,110 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
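// For reference, the three calls to magma_grad_3d_device above compute, per element and
// component, V_d = (A_2 (x) A_1 (x) A_0) U for d = 0, 1, 2, where A_k is the 1D gradient
// matrix G when k == d and the 1D interpolation matrix I otherwise ((x) denotes the
// Kronecker product). A minimal host-side sketch of the same contraction follows; it is
// an illustration under simplifying assumptions (double precision, column-major P x Q
// operators as in the kernels, illustrative names ref_contract / ref_grad_3d), not backend code.
#include <cstddef>
#include <vector>

// contract x (n[0] x n[1] x n[2], index i0 fastest) with B (n[k] x q, column-major)
// along axis k; the result has extent q along axis k
static std::vector<double> ref_contract(const std::vector<double> &x, const int n[3], const std::vector<double> &B, int q, int k) {
  int m[3] = {n[0], n[1], n[2]};
  m[k] = q;
  std::vector<double> y((std::size_t)m[0] * m[1] * m[2], 0.0);
  for (int i2 = 0; i2 < m[2]; i2++) {
    for (int i1 = 0; i1 < m[1]; i1++) {
      for (int i0 = 0; i0 < m[0]; i0++) {
        const int out[3] = {i0, i1, i2};
        double    s      = 0.0;
        for (int j = 0; j < n[k]; j++) {
          int src[3] = {i0, i1, i2};
          src[k] = j;  // sum over the contracted axis
          s += x[src[0] + (std::size_t)n[0] * (src[1] + (std::size_t)n[1] * src[2])] * B[out[k] * n[k] + j];
        }
        y[i0 + (std::size_t)m[0] * (i1 + (std::size_t)m[1] * i2)] = s;
      }
    }
  }
  return y;
}

// d-th partial derivative of one component: apply G along axis d, I along the others
static std::vector<double> ref_grad_3d(std::vector<double> u, const std::vector<double> &I, const std::vector<double> &G, int P, int Q, int d) {
  int n[3] = {P, P, P};
  for (int k = 0; k < 3; k++) {
    u    = ref_contract(u, n, (k == d) ? G : I, Q, k);
    n[k] = Q;  // axis k now has quadrature extent
  }
  return u;  // Q^3 values at quadrature points
}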
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for Magma non-tensor basis gradient +#ifndef CEED_MAGMA_GRAD_NONTENSOR_H +#define CEED_MAGMA_GRAD_NONTENSOR_H + +#include "magma-common-nontensor.h" + +//////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__ + void magma_grad_nontensor_n(int n, CeedScalar const *dA, int ldda, CeedScalar const *dB, int lddb, CeedScalar *dC, int lddc) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data); + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int id = blockIdx.x * blockDim.y + ty; + const int nblocks = MAGMA_CEILDIV(n, BASIS_NB_GRAD_N); + const int myn = min(BASIS_NB_GRAD_N, n - id * BASIS_NB_GRAD_N); + + dB += id * BASIS_NB_GRAD_N * lddb; + dC += id * BASIS_NB_GRAD_N * lddc; + + // A is BASIS_P x BASIS_Q + const int slda = BASIS_P; + const int sldb = BASIS_P; + CeedScalar *sA = (CeedScalar *)shared_data; + CeedScalar *sB = sA + BASIS_Q * BASIS_P; + sB += ty * sldb * BASIS_NB_GRAD_N; + + // read B once for all C's + if (id < nblocks) { + read_B_g2s_1D_nosync(tx, myn, dB, lddb, sB, sldb); + } + + // unrolling this loop yields dramatic performance drop using hipcc, let the compiler decide (no pragma unroll) + for (int d = 0; d < BASIS_DIM; d++) { + // read A (BASIS_P x BASIS_Q) using all threads + CeedScalar rA[BASIS_P] = {MAGMA_D_ZERO}; + __syncthreads(); + read_A_trans_g2r_1D_nosync(tx, ty, dA, ldda, sA, slda, rA); + + // init rC + CeedScalar rC[BASIS_NB_GRAD_N] = {MAGMA_D_ZERO}; + if (id < nblocks) { + mul_rAsBrC_1D_nosync(tx, rA, sB, sldb, rC); + } + + // write C + if (id < nblocks) { + write_C_r2g_1D_nosync(tx, myn, rC, dC, lddc); + } + + dA += BASIS_Q * BASIS_P; + dC += BASIS_Q * n; + } +} + +//////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__ + void magma_grad_nontensor_t(int n, CeedScalar const *dA, int ldda, CeedScalar const *dB, int lddb, CeedScalar *dC, int lddc) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data); + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int id = blockIdx.x * blockDim.y + ty; + const int nblocks = MAGMA_CEILDIV(n, BASIS_NB_GRAD_T); + const int myn = min(BASIS_NB_GRAD_T, n - id * BASIS_NB_GRAD_T); + + // terminate threads with no work + if (id >= nblocks) return; + + dB += id * BASIS_NB_GRAD_T * lddb; + dC += id * BASIS_NB_GRAD_T * lddc; + + // A is BASIS_P x BASIS_Q + const int sldb = BASIS_Q; + CeedScalar *sB = (CeedScalar *)shared_data; + sB += ty * sldb * BASIS_NB_GRAD_T; + + // init rA, rC + CeedScalar rA[BASIS_Q] = {MAGMA_D_ZERO}; + CeedScalar rC[BASIS_NB_GRAD_T] = {MAGMA_D_ZERO}; + + // unrolling this loop yields dramatic performance drop using hipcc, let the compiler decide (no pragma unroll) + for (int d = 0; d < BASIS_DIM; d++) { + // read A + read_A_notrans_g2r_1D_nosync(tx, dA, ldda, NULL, 0, rA); + + // read B + __syncthreads(); + read_B_g2s_1D_nosync(tx, myn, dB, lddb, sB, sldb); + __syncthreads(); + + addmul_rAsBrC_1D_nosync(tx, rA, sB, sldb, rC); + + dA += BASIS_P * BASIS_Q; + dB += BASIS_Q * n; + } + + // write C + write_C_r2g_1D_nosync(tx, myn, rC, dC, lddc); +} + +#endif // CEED_MAGMA_GRAD_NONTENSOR_H diff --git a/include/ceed/jit-source/magma/magma-basis-interp-1d.h b/include/ceed/jit-source/magma/magma-basis-interp-1d.h new file mode 100644 
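// For reference, magma_grad_nontensor_n above computes one GEMM per dimension,
// C_d = A_d^T * B, where A_d = dA + d * Q * P is the P x Q gradient matrix of dimension d,
// B = dB is the P x n input shared by all dimensions, and C_d = dC + d * Q * n; the _t
// variant instead accumulates C = sum_d A_d * B_d over the dimensions. A minimal host-side
// sketch of the non-transposed path follows; it is an illustration assuming double
// precision and leading dimensions equal to the matrix heights (ldda = P, lddb = P,
// lddc = Q), with an illustrative name, not backend code.
#include <cstddef>

static void ref_grad_nontensor_n(int dim, int P, int Q, int n, const double *dA, const double *dB, double *dC) {
  for (int d = 0; d < dim; d++) {
    const double *A = dA + (std::size_t)d * Q * P;  // P x Q, column-major
    double       *C = dC + (std::size_t)d * Q * n;  // Q x n, column-major
    for (int col = 0; col < n; col++) {
      for (int q = 0; q < Q; q++) {
        double s = 0.0;
        for (int p = 0; p < P; p++) s += A[q * P + p] * dB[col * P + p];
        C[col * Q + q] = s;  // C_d(q, col) = sum_p A_d(p, q) * B(p, col)
      }
    }
  }
}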
index 0000000000..074efd94b6 --- /dev/null +++ b/include/ceed/jit-source/magma/magma-basis-interp-1d.h @@ -0,0 +1,138 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for MAGMA tensor basis interpolation in 1D +#ifndef CEED_MAGMA_BASIS_INTERP_1D_H +#define CEED_MAGMA_BASIS_INTERP_1D_H +
#include "magma-common-tensor.h" + +// macros to abstract access of shared memory and reg. file +#define sT(i, j) sT[(j)*P + (i)] + +////////////////////////////////////////////////////////////////////////////////////////// +// interp basis action (1D) +template +static __device__ __inline__ void magma_interp_1d_device(const T *sT, magma_trans_t transT, T *sU[NUM_COMP], T *sV[NUM_COMP], const int tx) { + // Assumptions + // 1. 1D threads of size max(P,Q) + // 2. sU[i] is 1xP: in shared memory + // 3. sV[i] is 1xQ: in shared memory + // 4. Product per component is one row (1xP) times T matrix (PxQ) => one row (1xQ) + // 5. Each thread computes one entry in sV[i] + // 6. Must sync before and after call + // 7. Note that the layout for U and V is different from 2D/3D problem + + T rv; + if (tx < Q) { + for (int comp = 0; comp < NUM_COMP; comp++) { + rv = (transT == MagmaTrans) ? sV[comp][tx] : 0.0; + for (int i = 0; i < P; i++) { + rv += sU[comp][i] * sT(i, tx); // sT[tx * P + i]; + } + sV[comp][tx] = rv; + } + } +} + +////////////////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_1D)) __global__ + void magma_interpn_1d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV, + const int cstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + magma_trans_t transT = MagmaNoTrans; + + if (elem_id >= nelem) return; + + CeedScalar *sU[BASIS_NUM_COMP]; + CeedScalar *sV[BASIS_NUM_COMP]; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sT = (CeedScalar *)shared_data; + CeedScalar *sW = sT + BASIS_P * BASIS_Q; + sU[0] = sW + ty * BASIS_NUM_COMP * (BASIS_P + BASIS_Q); + sV[0] = sU[0] + (BASIS_NUM_COMP * 1 * BASIS_P); + for (int comp = 1; comp < BASIS_NUM_COMP; comp++) { + sU[comp] = sU[comp - 1] + (1 * BASIS_P); + sV[comp] = sV[comp - 1] + (1 * BASIS_Q); + } + + // read T + if (ty == 0) { + dread_T_gm2sm(tx, transT, dT, sT); + } + + // read U + read_1d(dU, cstrdU, sU, tx); + + __syncthreads(); + magma_interp_1d_device(sT, transT, sU, sV, tx); + __syncthreads(); + + // write V + write_1d(sV, dV, cstrdV, tx); +} + +////////////////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_1D)) __global__ + void magma_interpt_1d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV, + const int cstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; +
magma_trans_t transT = MagmaTrans; + + if (elem_id >= nelem) return; + + CeedScalar *sU[BASIS_NUM_COMP]; + CeedScalar *sV[BASIS_NUM_COMP]; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sT = (CeedScalar *)shared_data; + CeedScalar *sW = sT + BASIS_Q * BASIS_P; + sU[0] = sW + ty * BASIS_NUM_COMP * (BASIS_Q + BASIS_P); + sV[0] = sU[0] + (BASIS_NUM_COMP * 1 * BASIS_Q); + for (int comp = 1; comp < BASIS_NUM_COMP; comp++) { + sU[comp] = sU[comp - 1] + (1 * BASIS_Q); + sV[comp] = sV[comp - 1] + (1 * BASIS_P); + } + + // read T + if (ty == 0) { + dread_T_gm2sm(tx, transT, dT, sT); + } + + // read U + read_1d(dU, cstrdU, sU, tx); + + // read V + read_1d(dV, cstrdV, sV, tx); + + __syncthreads(); + magma_interp_1d_device(sT, transT, sU, sV, tx); + __syncthreads(); + + // write V + write_1d(sV, dV, cstrdV, tx); +} + +#endif // CEED_MAGMA_BASIS_INTERP_1D_H diff --git a/include/ceed/jit-source/magma/magma-basis-interp-2d.h b/include/ceed/jit-source/magma/magma-basis-interp-2d.h new file mode 100644 index 0000000000..bb3475df51 --- /dev/null +++ b/include/ceed/jit-source/magma/magma-basis-interp-2d.h @@ -0,0 +1,155 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for MAGMA tensor basis interpolation in 2D +#ifndef CEED_MAGMA_BASIS_INTERP_2D_H +#define CEED_MAGMA_BASIS_INTERP_2D_H + +#include "magma-common-tensor.h" + +// macros to abstract access of shared memory and reg. file +#define sT(i, j) sT[(j)*P + (i)] +#define sTmp(i, j, ldw) sTmp[(j) * (ldw) + (i)] + +////////////////////////////////////////////////////////////////////////////////////////// +// interp basis action (2D) +template +static __device__ __inline__ void magma_interp_2d_device(const T *sT, magma_trans_t transT, T rU[DIM_U][NUM_COMP][rU_SIZE], + T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx, T rTmp, T *swork) { + // Assumptions + // 1. 1D threads of size max(P,Q) + // 2. input: rU[DIM_U x NUM_COMP x rU_SIZE] in registers (per thread) + // 3. output: rV[DIM_V x NUM_COMP x rV_SIZE] in registers (per thread) + // 4. Two products per component + // 4.1 Batch P of (1xP) matrices times (PxQ) matrix => Batch P of (1xQ) matrices + // 4.2 Batch 1 of (QxP) matrix times (PxQ) matrix => (QxQ) matrix + // 5. Each thread computes one row of the output of each product + // 6.
+
+//////////////////////////////////////////////////////////////////////////////////////////
+extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_2D)) __global__
+    void magma_interpn_2d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV,
+                                 const int cstrdV, const int nelem) {
+  MAGMA_DEVICE_SHARED(CeedScalar, shared_data)
+
+  const int     tx      = threadIdx.x;
+  const int     ty      = threadIdx.y;
+  const int     elem_id = (blockIdx.x * blockDim.y) + ty;
+  magma_trans_t transT  = MagmaNoTrans;
+
+  if (elem_id >= nelem) return;
+
+  CeedScalar rU[1][BASIS_NUM_COMP][BASIS_P] = {0.0};  // for a non-fused operator BASIS_DIM is always 1
+  CeedScalar rV[1][BASIS_NUM_COMP][BASIS_Q] = {0.0};  // for a non-fused operator BASIS_DIM is always 1
+  CeedScalar rTmp                           = 0.0;
+
+  // shift global memory pointers by elem stride
+  dU += elem_id * estrdU;
+  dV += elem_id * estrdV;
+
+  // assign shared memory pointers
+  CeedScalar *sT   = (CeedScalar *)shared_data;
+  CeedScalar *sTmp = sT + BASIS_P * BASIS_Q;
+  sTmp += ty * (BASIS_P * BASIS_MAX_P_Q);
+
+  // read T
+  if (ty == 0) {
+    dread_T_gm2sm<BASIS_P, BASIS_Q>(tx, transT, dT, sT);
+  }
+
+  // read U -- there is a sync at the end of this function
+  readU_2d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dU, cstrdU, rU, sTmp, tx);
+
+  // no sync needed here -- readU_2d already syncs at the end
+  magma_interp_2d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_P, BASIS_Q>(sT, transT, rU, rV, tx, rTmp, sTmp);
+  __syncthreads();
+
+  // write V
+  writeV_2d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dV, cstrdV, rV, tx);
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////
+extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_2D)) __global__
+    void magma_interpt_2d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV,
+                                 const int cstrdV, const int nelem) {
+  MAGMA_DEVICE_SHARED(CeedScalar, shared_data)
+
+  const int     tx      = threadIdx.x;
+  const int     ty      = threadIdx.y;
+  const int     elem_id = (blockIdx.x * blockDim.y) + ty;
+  magma_trans_t transT  = MagmaTrans;
+
+  if (elem_id >= nelem) return;
+
+  CeedScalar rU[1][BASIS_NUM_COMP][BASIS_Q] = {0.0};  // for a non-fused operator BASIS_DIM is always 1
+  CeedScalar rV[1][BASIS_NUM_COMP][BASIS_P] = {0.0};  // for a non-fused operator BASIS_DIM is always 1
+  CeedScalar rTmp                           = 0.0;
+
+  // shift global memory pointers by elem stride
+  dU += elem_id * estrdU;
+  dV += elem_id * estrdV;
+
+  // assign shared memory pointers
+  CeedScalar *sT   = (CeedScalar *)shared_data;
+  CeedScalar *sTmp = sT + BASIS_Q * BASIS_P;
+  sTmp += ty * (BASIS_Q * BASIS_MAX_P_Q);
+
+  // read T
+  if (ty == 0) {
+    dread_T_gm2sm<BASIS_Q, BASIS_P>(tx, transT, dT, sT);
+  }
+
+  // read V
+  readV_2d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV, cstrdV, rV, tx);
+
+  // read U -- there is a sync at the end of this function
+  readU_2d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU, cstrdU, rU, sTmp, tx);
+
+  // no sync needed here -- readU_2d already syncs at the end
+  magma_interp_2d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P>(sT, transT, rU, rV, tx, rTmp, sTmp);
+  __syncthreads();
+
+  // write V
+  writeV_2d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV, cstrdV, rV, tx);
+}
+
+#endif  // CEED_MAGMA_BASIS_INTERP_2D_H
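Both 2D kernels batch several elements per thread block through threadIdx.y (elem_id = blockIdx.x * blockDim.y + ty), using the NTCOL macros that this diff adds to magma-common-defs.h further down. A hypothetical host-side launch sketch (launch_interp_2d_n is not backend API; the ceil-div macro is restated here on the assumption it matches the one in magma-common-defs.h):

#include <cuda_runtime.h>

// Assumed definitions, mirroring magma-common-defs.h in this diff
#define MAGMA_CEILDIV(A, B) (((A) + (B)-1) / (B))
#define MAGMA_BASIS_NTCOL(x, maxt) (((maxt) < (x)) ? 1 : ((maxt) / (x)))

static void launch_interp_2d_n(int P, int Q, int nelem, size_t shared_bytes, cudaStream_t stream) {
  const int max_P_Q = (P > Q) ? P : Q;
  const int ntcol   = MAGMA_BASIS_NTCOL(max_P_Q, 128 /* MAGMA_MAXTHREADS_2D */);
  dim3      threads(max_P_Q, ntcol, 1);                // tx covers max(P,Q); ty batches elements
  dim3      grid(MAGMA_CEILDIV(nelem, ntcol), 1, 1);   // elem_id = blockIdx.x * blockDim.y + ty
  // shared_bytes must hold sT (P * Q scalars) plus one sTmp slice per ty slot,
  // since the kernel offsets sTmp by ty * (P * max(P,Q)); kernel arguments elided:
  // magma_interpn_2d_kernel<<<grid, threads, shared_bytes, stream>>>(dT, dU, estrdU, cstrdU, dV, estrdV, cstrdV, nelem);
  (void)shared_bytes;
  (void)stream;
}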
diff --git a/include/ceed/jit-source/magma/magma-basis-interp-3d.h b/include/ceed/jit-source/magma/magma-basis-interp-3d.h
new file mode 100644
index 0000000000..8f2fd3985e
--- /dev/null
+++ b/include/ceed/jit-source/magma/magma-basis-interp-3d.h
@@ -0,0 +1,183 @@
+// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED: http://github.com/ceed
+
+/// @file
+/// Internal header for MAGMA tensor basis interpolation in 3D
+#ifndef CEED_MAGMA_BASIS_INTERP_3D_H
+#define CEED_MAGMA_BASIS_INTERP_3D_H
+
+#include "magma-common-tensor.h"
+
+// macros to abstract access of shared memory and reg. file
+#define sT(i, j) sT[(j)*P + (i)]
+#define sTmp(i, j, ldw) sTmp[(j) * (ldw) + (i)]
+
+//////////////////////////////////////////////////////////////////////////////////////////
+// interp basis action (3D)
+template <typename T, int DIM_U, int DIM_V, int NUM_COMP, int P, int Q, int rU_SIZE, int rV_SIZE>
+static __device__ __inline__ void magma_interp_3d_device(const T *sT, magma_trans_t transT, T rU[DIM_U][NUM_COMP][rU_SIZE],
+                                                         T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx, T rTmp[Q], T *swork) {
+  // Assumptions
+  // 1. 1D threads of size max(P,Q)^2
+  // 2. input: rU[DIM_U x NUM_COMP x rU_SIZE] in registers (per thread)
+  // 3. output: rV[DIM_V x NUM_COMP x rV_SIZE] in registers (per thread)
+  // 4. Three products per component
+  // 4.1 Batch P^2 of (1xP) matrices times (PxQ) matrix => Batch P^2 of (1xQ) matrices
+  // 4.2 Batch P of (QxP) matrices times (PxQ) matrix => Batch P of (QxQ) matrices
+  // 4.3 Batch 1 of (Q^2xP) matrix times (PxQ) matrix => (Q^2xQ) matrix
+  // 5. Each thread computes one row of the output of each product
+  // 6. 
Sync is recommended before and after the call + + for (int comp = 0; comp < NUM_COMP; comp++) { + // Batch P^2 of (1xP) matrices [reg] times (PxQ) matrix [shmem] => Batch P^2 of (1xQ) matrices [shmem] + if (tx < (P * P)) { + const int batchid = tx; + const int sld = 1; + T *sTmp = swork + batchid * (1 * Q); + for (int j = 0; j < Q; j++) { + rTmp[0] = 0.0; + for (int i = 0; i < P; i++) { + rTmp[0] += rU[0][comp][i] * sT(i, j); + } + sTmp(0, j, sld) = rTmp[0]; + } + } // end of: if (tx < P*P) + __syncthreads(); + + // Batch P of (QxP) matrices [shmem] times (PxQ) matrix [shmem] => Batch P of (QxQ) matrices [reg] + if (tx < (P * Q)) { + const int batchid = tx / Q; + const int tx_ = tx % Q; + const int sld = Q; + T *sTmp = swork + batchid * (Q * P); // sTmp is input + for (int j = 0; j < Q; j++) { + rTmp[j] = 0.0; + for (int i = 0; i < P; i++) { + rTmp[j] += sTmp(tx_, i, sld) * sT(i, j); + } + } + } + __syncthreads(); + + // write rTmp[] into shmem as batch P of QxQ matrices + if (tx < (P * Q)) { + const int batchid = tx / Q; + const int tx_ = tx % Q; + const int sld = Q; + T *sTmp = swork + batchid * (Q * Q); + for (int j = 0; j < Q; j++) { + sTmp(tx_, j, sld) = rTmp[j]; + } + } + __syncthreads(); + + // Batch 1 of (Q^2xP_) matrices [shmem] times (PxQ) matrix [shmem] => Batch 1 of (Q^2xQ_) matrices [reg] + if (tx < (Q * Q)) { + // No need to declare batchid = (tx / Q^2) = always zero + // No need to declare tx_ = (tx_ % Q^2) = always tx + const int sld = Q * Q; + T *sTmp = swork; + for (int j = 0; j < Q; j++) { + rTmp[0] = 0.0; + for (int i = 0; i < P; i++) { + rTmp[0] += sTmp(tx, i, sld) * sT(i, j); + } + rV[0][comp][j] += rTmp[0]; + } + } + __syncthreads(); + } +} + +////////////////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MAGMA_MAXTHREADS_3D)) __global__ + void magma_interpn_3d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV, + const int cstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + magma_trans_t transT = MagmaNoTrans; + + if (elem_id >= nelem) return; + + CeedScalar rU[1][BASIS_NUM_COMP][BASIS_P] = {0.0}; // for a non-fused operator BASIS_DIM is always 1 + CeedScalar rV[1][BASIS_NUM_COMP][BASIS_Q] = {0.0}; // for a non-fused operator BASIS_DIM is always 1 + CeedScalar rTmp[BASIS_Q] = {0.0}; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sT = (CeedScalar *)shared_data; + CeedScalar *sTmp = sT + BASIS_P * BASIS_Q; + sTmp += ty * (max(BASIS_P * BASIS_P * BASIS_MAX_P_Q, BASIS_P * BASIS_Q * BASIS_Q)); + + // read T + if (ty == 0) { + dread_T_gm2sm(tx, transT, dT, sT); + } + + // read U (idim = 0 for dU, i_DIM = 0 for rU, u_dimstride is always 0) + readU_3d(dU, cstrdU, rU, sTmp, tx); + // there is a sync at the end of this function + + magma_interp_3d_device(sT, transT, rU, rV, tx, rTmp, sTmp); + __syncthreads(); + + // write V + writeV_3d(dV, cstrdV, rV, tx); +} + +////////////////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MAGMA_MAXTHREADS_3D)) __global__ + void magma_interpt_3d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const 
int cstrdU, CeedScalar *dV, const int estrdV, + const int cstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + magma_trans_t transT = MagmaTrans; + + if (elem_id >= nelem) return; + + CeedScalar rU[1][BASIS_NUM_COMP][BASIS_Q] = {0.0}; // for a non-fused operator BASIS_DIM is always 1 + CeedScalar rV[1][BASIS_NUM_COMP][BASIS_P] = {0.0}; // for a non-fused operator BASIS_DIM is always 1 + CeedScalar rTmp[BASIS_P] = {0.0}; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sT = (CeedScalar *)shared_data; + CeedScalar *sTmp = sT + BASIS_Q * BASIS_P; + sTmp += ty * (max(BASIS_Q * BASIS_Q * BASIS_MAX_P_Q, BASIS_Q * BASIS_P * BASIS_P)); + + // read T + if (ty == 0) { + dread_T_gm2sm(tx, transT, dT, sT); + } + + // read V + readV_3d(dV, cstrdV, rV, tx); + + // read U (idim = 0 for dU, i_DIM = 0 for rU, u_dimstride is always 0) + readU_3d(dU, cstrdU, rU, sTmp, tx); + // there is a sync at the end of this function + + magma_interp_3d_device(sT, transT, rU, rV, tx, rTmp, sTmp); + __syncthreads(); + + // write V + writeV_3d(dV, cstrdV, rV, tx); +} + +#endif // CEED_MAGMA_BASIS_INTERP_3D_H diff --git a/include/ceed/jit-source/magma/magma-basis-interp-nontensor.h b/include/ceed/jit-source/magma/magma-basis-interp-nontensor.h new file mode 100644 index 0000000000..956d69392a --- /dev/null +++ b/include/ceed/jit-source/magma/magma-basis-interp-nontensor.h @@ -0,0 +1,96 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for MAGMA non-tensor basis interpolation +#ifndef CEED_MAGMA_INTERP_NONTENSOR_H +#define CEED_MAGMA_INTERP_NONTENSOR_H + +#include "magma-common-nontensor.h" + +//////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__ + void magma_interp_nontensor_n(int n, CeedScalar const *dA, int ldda, CeedScalar const *dB, int lddb, CeedScalar *dC, int lddc) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data); + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int id = blockIdx.x * blockDim.y + ty; + const int nblocks = MAGMA_CEILDIV(n, BASIS_NB_INTERP_N); + const int myn = min(BASIS_NB_INTERP_N, n - id * BASIS_NB_INTERP_N); + + dB += id * BASIS_NB_INTERP_N * lddb; + dC += id * BASIS_NB_INTERP_N * lddc; + + // A is BASIS_P x BASIS_Q + const int slda = BASIS_P; + const int sldb = BASIS_P; + CeedScalar *sA = (CeedScalar *)shared_data; + CeedScalar *sB = sA; + sB += ty * sldb * BASIS_NB_INTERP_N; + + // read A using all threads + CeedScalar rA[BASIS_P] = {MAGMA_D_ZERO}; + read_A_trans_g2r_1D_nosync(tx, ty, dA, ldda, sA, slda, rA); + __syncthreads(); + + // terminate threads with no work + if (id >= nblocks) return; + + // init rC + CeedScalar rC[BASIS_NB_INTERP_N] = {MAGMA_D_ZERO}; + + // read B + read_B_g2s_1D_nosync(tx, myn, dB, lddb, sB, sldb); + __syncthreads(); + + mul_rAsBrC_1D_nosync(tx, rA, sB, sldb, rC); + + // write C + write_C_r2g_1D_nosync(tx, myn, rC, dC, lddc); +} + +//////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__ + void magma_interp_nontensor_t(int n, CeedScalar const *dA, int ldda, CeedScalar const *dB, int lddb, CeedScalar *dC, int lddc) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data); + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int id = blockIdx.x * blockDim.y + ty; + const int nblocks = MAGMA_CEILDIV(n, BASIS_NB_INTERP_T); + const int myn = min(BASIS_NB_INTERP_T, n - id * BASIS_NB_INTERP_T); + + // terminate threads with no work + if (id >= nblocks) return; + + dB += id * BASIS_NB_INTERP_T * lddb; + dC += id * BASIS_NB_INTERP_T * lddc; + + // A is BASIS_P x BASIS_Q + const int sldb = BASIS_Q; + CeedScalar *sB = (CeedScalar *)shared_data; + sB += ty * sldb * BASIS_NB_INTERP_T; + + // init rC + CeedScalar rC[BASIS_NB_INTERP_T] = {MAGMA_D_ZERO}; + + // read A + CeedScalar rA[BASIS_Q] = {MAGMA_D_ZERO}; + read_A_notrans_g2r_1D_nosync(tx, dA, ldda, NULL, 0, rA); + + // read B + read_B_g2s_1D_nosync(tx, myn, dB, lddb, sB, sldb); + __syncthreads(); + + mul_rAsBrC_1D_nosync(tx, rA, sB, sldb, rC); + + // write C + write_C_r2g_1D_nosync(tx, myn, rC, dC, lddc); +} + +#endif // CEED_MAGMA_INTERP_NONTENSOR_H diff --git a/include/ceed/jit-source/magma/weight-1d.h b/include/ceed/jit-source/magma/magma-basis-weight-1d.h similarity index 61% rename from include/ceed/jit-source/magma/weight-1d.h rename to include/ceed/jit-source/magma/magma-basis-weight-1d.h index e4a7abe18c..9f77dd4aca 100644 --- a/include/ceed/jit-source/magma/weight-1d.h +++ b/include/ceed/jit-source/magma/magma-basis-weight-1d.h @@ -5,20 +5,27 @@ // // This file is part of CEED: http://github.com/ceed +/// @file +/// Internal header for MAGMA tensor basis weight in 1D +#ifndef CEED_MAGMA_BASIS_WEIGHT_1D_H +#define 
CEED_MAGMA_BASIS_WEIGHT_1D_H + +#include "magma-common-tensor.h" + ////////////////////////////////////////////////////////////////////////////////////////// // weight basis action -- 1D -template -__device__ __inline__ void magma_weight_1d_device(const T *sTweight, T *sV, const int tx) { +template +static __device__ __inline__ void magma_weight_1d_device(const T *sTweight, T *sV, const int tx) { // Assumptions - // 1. 1D thread configuration of size Q_ - // 2. The output sV is in shared memory -- size 1xQ_ - if (tx < Q_) { + // 1. 1D thread configuration of size Q + // 2. The output sV is in shared memory -- size 1xQ + if (tx < Q) { sV[tx] = sTweight[tx]; } } ////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(Q, MAGMA_MAXTHREADS_1D)) __global__ +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__ void magma_weight_1d_kernel(const CeedScalar *dqweight1d, CeedScalar *dV, const int v_stride, const int nelem) { MAGMA_DEVICE_SHARED(CeedScalar, shared_data) @@ -33,18 +40,20 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(Q, MAGMA_MAXTHREADS_1D)) __globa // shared memory pointers CeedScalar *sTweight = (CeedScalar *)shared_data; - CeedScalar *sV = sTweight + Q; - sV += ty * Q; + CeedScalar *sV = sTweight + BASIS_Q; + sV += ty * BASIS_Q; // read dqweight_1d - if (ty == 0 && tx < Q) { + if (ty == 0 && tx < BASIS_Q) { sTweight[tx] = dqweight1d[tx]; } __syncthreads(); - magma_weight_1d_device(sTweight, sV, tx); + magma_weight_1d_device(sTweight, sV, tx); __syncthreads(); // write V dV[tx] = sV[tx]; } + +#endif // CEED_MAGMA_BASIS_WEIGHT_1D_H diff --git a/include/ceed/jit-source/magma/weight-2d.h b/include/ceed/jit-source/magma/magma-basis-weight-2d.h similarity index 59% rename from include/ceed/jit-source/magma/weight-2d.h rename to include/ceed/jit-source/magma/magma-basis-weight-2d.h index 7ad62d7315..721b50f953 100644 --- a/include/ceed/jit-source/magma/weight-2d.h +++ b/include/ceed/jit-source/magma/magma-basis-weight-2d.h @@ -5,28 +5,35 @@ // // This file is part of CEED: http://github.com/ceed +/// @file +/// Internal header for MAGMA tensor basis weight in 2D +#ifndef CEED_MAGMA_BASIS_WEIGHT_2D_H +#define CEED_MAGMA_BASIS_WEIGHT_2D_H + +#include "magma-common-tensor.h" + ////////////////////////////////////////////////////////////////////////////////////////// // weight basis action -- 2D -template -__device__ __inline__ void magma_weight_2d_device(const T *sTweight, T rV[DIM_][NCOMP_][Q_], const int tx) { +template +static __device__ __inline__ void magma_weight_2d_device(const T *sTweight, T rV[DIM][NUM_COMP][Q], const int tx) { // Assumptions - // 1. 1D thread configuration of size Q_ + // 1. 1D thread configuration of size Q // 2. rV[][][] matches the storage used in other actions (interp, grad, ... etc) - // 3. iDIM and iCOMP specify which indexes to use in rV, - // since the output per thread is a register array of size Q_ + // 3. i_DIM and i_COMP specify which indexes to use in rV, + // since the output per thread is a register array of size Q // 4. 
Sync is recommended after the call (to make sure sTweight can be overwritten) - if (tx < Q_) { + if (tx < Q) { // x sTweight[j] for first update // x sTweight[tx] for second update - for (int j = 0; j < Q_; j++) { - rV[iDIM][iCOMP][j] = sTweight[j] * sTweight[tx]; + for (int j = 0; j < Q; j++) { + rV[i_DIM][i_COMP][j] = sTweight[j] * sTweight[tx]; } } } ////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(Q, MAGMA_MAXTHREADS_2D)) __global__ +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_2D)) __global__ void magma_weight_2d_kernel(const CeedScalar *dqweight1d, CeedScalar *dV, const int v_stride, const int nelem) { MAGMA_DEVICE_SHARED(CeedScalar, shared_data) @@ -36,7 +43,7 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(Q, MAGMA_MAXTHREADS_2D)) __globa if (elem_id >= nelem) return; - CeedScalar rV[1][1][Q]; // allocate with DIM=NCOMP=1, but sizes may differ for a fused operator + CeedScalar rV[1][1][BASIS_Q]; // allocate with BASIS_DIM=BASIS_NUM_COMP=1, but sizes may differ for a fused operator // global memory pointers dV += elem_id * v_stride; @@ -44,17 +51,19 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(Q, MAGMA_MAXTHREADS_2D)) __globa CeedScalar *sTweight = (CeedScalar *)shared_data; // read dqweight_1d - if (ty == 0 && tx < Q) { + if (ty == 0 && tx < BASIS_Q) { sTweight[tx] = dqweight1d[tx]; } __syncthreads(); - magma_weight_2d_device(sTweight, rV, tx); + magma_weight_2d_device(sTweight, rV, tx); // write V - if (tx < Q) { - for (int j = 0; j < Q; j++) { - dV[j * Q + tx] = rV[0][0][j]; + if (tx < BASIS_Q) { + for (int j = 0; j < BASIS_Q; j++) { + dV[j * BASIS_Q + tx] = rV[0][0][j]; } } } + +#endif // CEED_MAGMA_BASIS_WEIGHT_2D_H diff --git a/include/ceed/jit-source/magma/weight-3d.h b/include/ceed/jit-source/magma/magma-basis-weight-3d.h similarity index 55% rename from include/ceed/jit-source/magma/weight-3d.h rename to include/ceed/jit-source/magma/magma-basis-weight-3d.h index 07fc2286ca..835bca44cd 100644 --- a/include/ceed/jit-source/magma/weight-3d.h +++ b/include/ceed/jit-source/magma/magma-basis-weight-3d.h @@ -5,29 +5,36 @@ // // This file is part of CEED: http://github.com/ceed +/// @file +/// Internal header for MAGMA tensor basis weight in 3D +#ifndef CEED_MAGMA_BASIS_WEIGHT_3D_H +#define CEED_MAGMA_BASIS_WEIGHT_3D_H + +#include "magma-common-tensor.h" + ////////////////////////////////////////////////////////////////////////////////////////// // weight basis action -- 3D -template -__device__ __inline__ void magma_weight_3d_device(const T *sTweight, T rV[DIM_][NCOMP_][Q_], const int tx) { +template +static __device__ __inline__ void magma_weight_3d_device(const T *sTweight, T rV[DIM][NUM_COMP][Q], const int tx) { // Assumptions - // 1. 1D thread configuration of size Q_^2 + // 1. 1D thread configuration of size Q^2 // 2. rV[][][] matches the storage used in other actions (interp, grad, ... etc) - // 3. iDIM and iCOMP specify which indexes to use in rV, - // since the output per thread is a register array of size Q_ + // 3. i_DIM and i_COMP specify which indexes to use in rV, + // since the output per thread is a register array of size Q // 4. 
Sync is recommended after the call (to make sure sTweight can be overwritten) - if (tx < (Q_ * Q_)) { + if (tx < Q * Q) { // x sTweight[j] for first update - // x sTweight[tx%Q_] for second update - // x sTweight[tx/Q_] for third update - for (int j = 0; j < Q_; j++) { - rV[iDIM][iCOMP][j] = sTweight[j] * sTweight[tx % Q_] * sTweight[tx / Q_]; + // x sTweight[tx%Q] for second update + // x sTweight[tx/Q] for third update + for (int j = 0; j < Q; j++) { + rV[i_DIM][i_COMP][j] = sTweight[j] * sTweight[tx % Q] * sTweight[tx / Q]; } } } ////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(Q *Q, MAGMA_MAXTHREADS_3D)) __global__ +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q *BASIS_Q, MAGMA_MAXTHREADS_3D)) __global__ void magma_weight_3d_kernel(const CeedScalar *dqweight1d, CeedScalar *dV, const int v_stride, const int nelem) { MAGMA_DEVICE_SHARED(CeedScalar, shared_data) @@ -37,7 +44,7 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(Q *Q, MAGMA_MAXTHREADS_3D)) __gl if (elem_id >= nelem) return; - CeedScalar rV[1][1][Q]; // allocate with DIM=NCOMP=1, but sizes may differ for a fused operator + CeedScalar rV[1][1][BASIS_Q]; // allocate with BASIS_DIM=BASIS_NUM_COMP=1, but sizes may differ for a fused operator // global memory pointers dV += elem_id * v_stride; @@ -45,17 +52,19 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(Q *Q, MAGMA_MAXTHREADS_3D)) __gl CeedScalar *sTweight = (CeedScalar *)shared_data; // read dqweight_1d - if (tx < Q) { + if (tx < BASIS_Q) { sTweight[tx] = dqweight1d[tx]; } __syncthreads(); - magma_weight_3d_device(sTweight, rV, tx); + magma_weight_3d_device(sTweight, rV, tx); // write V - if (tx < (Q * Q)) { - for (int j = 0; j < Q; j++) { - dV[j * (Q * Q) + tx] = rV[0][0][j]; + if (tx < (BASIS_Q * BASIS_Q)) { + for (int j = 0; j < BASIS_Q; j++) { + dV[j * (BASIS_Q * BASIS_Q) + tx] = rV[0][0][j]; } } } + +#endif // CEED_MAGMA_BASIS_WEIGHT_3D_H diff --git a/include/ceed/jit-source/magma/magma-basis-weight-nontensor.h b/include/ceed/jit-source/magma/magma-basis-weight-nontensor.h new file mode 100644 index 0000000000..f6b51380cf --- /dev/null +++ b/include/ceed/jit-source/magma/magma-basis-weight-nontensor.h @@ -0,0 +1,48 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for MAGMA non-tensor basis weight +#ifndef CEED_MAGMA_BASIS_WEIGHT_NONTENSOR_H +#define CEED_MAGMA_BASIS_WEIGHT_NONTENSOR_H + +#include "magma-common-nontensor.h" + +//////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__ + void magma_weight_nontensor(int n, const CeedScalar *dqweight, CeedScalar *dV, int lddv) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data); + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int id = blockIdx.x * blockDim.y + ty; + + // terminate threads with no work + if (id >= n) return; + + dV += id * lddv; + + // shared memory pointers + CeedScalar *sqweight = (CeedScalar *)shared_data; + CeedScalar *sV = sqweight + BASIS_Q; + sV += ty * BASIS_Q; + + // read qweight + if (ty == 0 && tx < BASIS_Q) { + sqweight[tx] = dqweight[tx]; + } + __syncthreads(); + + if (tx < BASIS_Q) { + sV[tx] = sqweight[tx]; + } + + // write V + dV[tx] = sV[tx]; +} + +#endif // CEED_MAGMA_BASIS_WEIGHT_NONTENSOR_H diff --git a/include/ceed/jit-source/magma/magma_common_defs.h b/include/ceed/jit-source/magma/magma-common-defs.h similarity index 65% rename from include/ceed/jit-source/magma/magma_common_defs.h rename to include/ceed/jit-source/magma/magma-common-defs.h index a0cf1f93f6..24684be85e 100644 --- a/include/ceed/jit-source/magma/magma_common_defs.h +++ b/include/ceed/jit-source/magma/magma-common-defs.h @@ -5,6 +5,8 @@ // // This file is part of CEED: http://github.com/ceed +/// @file +/// Internal header for MAGMA backend common definitions #ifndef CEED_MAGMA_COMMON_DEFS_H #define CEED_MAGMA_COMMON_DEFS_H @@ -23,4 +25,14 @@ typedef enum { MagmaNoTrans = 111, MagmaTrans = 112, MagmaConjTrans = 113, Magma #define MAGMA_ROUNDUP(A, B) MAGMA_CEILDIV((A), (B)) * (B) #define MAGMA_MAX(A, B) ((A) > (B) ? (A) : (B)) +#define MAGMA_MAXTHREADS_1D 128 +#define MAGMA_MAXTHREADS_2D 128 +#define MAGMA_MAXTHREADS_3D 64 + +// Define macro for determining number of threads in y-direction for basis kernels +#define MAGMA_BASIS_NTCOL(x, maxt) (((maxt) < (x)) ? 1 : ((maxt) / (x))) + +// Define macro for computing the total threads in a block for use with __launch_bounds__() +#define MAGMA_BASIS_BOUNDS(x, maxt) (x * MAGMA_BASIS_NTCOL(x, maxt)) + #endif // CEED_MAGMA_COMMON_DEFS_H diff --git a/include/ceed/jit-source/magma/magma-common-nontensor.h b/include/ceed/jit-source/magma/magma-common-nontensor.h new file mode 100644 index 0000000000..0e1bbb007b --- /dev/null +++ b/include/ceed/jit-source/magma/magma-common-nontensor.h @@ -0,0 +1,146 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for MAGMA backend common non-tensor basis definitions +#ifndef CEED_MAGMA_COMMON_NONTENSOR_H +#define CEED_MAGMA_COMMON_NONTENSOR_H + +#include "magma-common-defs.h" + +//////////////////////////////////////////////////////////////////////////////// +// read A (no-trans) from global to reg. +// A is (P x Q) +// 1D thread config. 
with (P x 1) threads +// no sync at the end of the function +template +static __device__ __inline__ void read_A_notrans_g2r_1D_nosync(const int tx, const T *dA, int ldda, T *sA, int slda, T rA[Q]) { +#pragma unroll + for (int j = 0; j < Q; j++) { + rA[j] = dA[j * ldda + tx]; + } +} + +//////////////////////////////////////////////////////////////////////////////// +// read A (trans) from global to reg. +// A is (P x Q) +// 1D thread config. with (P x 1) threads +// no sync at the end of the function +template +static __device__ __inline__ void read_A_trans_g2r_1D_nosync(const int tx, const int ty, const T *dA, int ldda, T *sA, int slda, T rA[Q]) { + const int nTH = MAGMA_BASIS_BOUNDS(P, MAGMA_MAXTHREADS_1D); + const int tid = ty * blockDim.x + tx; + int i; + +#pragma unroll + for (i = 0; i < (Q * P) - nTH; i += nTH) { + sA[i + tid] = dA[i + tid]; + } + if (tid < ((Q * P) - i)) { + sA[i + tid] = dA[i + tid]; + } + __syncthreads(); + +#pragma unroll + for (int j = 0; j < Q; j++) { + rA[j] = sA[tx * slda + j]; + } +} + +//////////////////////////////////////////////////////////////////////////////// +// read B from global to shared +// B is (Q x NB) +// 1D thread config. with (P x 1) threads +// no sync at the end of the function +template +static __device__ __inline__ void read_B_g2s_1D_nosync(const int tx, const int n, const T *dB, int lddb, T *sB, int sldb) { + if (n != NB) { + for (int i = 0; i < (Q * n) - P; i += P) { + sB[i + tx] = dB[i + tx]; + } + } else { +#pragma unroll + for (int i = 0; i < (Q * NB) - P; i += P) { + sB[i + tx] = dB[i + tx]; + } + } + + // cleanup for B + const int stride = MAGMA_ROUNDUP(Q * n - P, P); + if (tx < (Q * n) - stride) { + sB[stride + tx] = dB[stride + tx]; + } +} + +//////////////////////////////////////////////////////////////////////////////// +// write C from reg. to global +// C is (P x NB) +// 1D thread config. 
with (P x 1) threads +// no sync at the end of the function +template +static __device__ __inline__ void write_C_r2g_1D_nosync(const int tx, const int n, T rC[NB], T *dC, int lddc) { + if (n != NB) { +#pragma unroll + for (int j = 0; j < NB; j++) { + if (j < n) { + dC[j * lddc + tx] = rC[j]; + } + } + } else { +#pragma unroll + for (int j = 0; j < NB; j++) { + dC[j * lddc + tx] = rC[j]; + } + } +} + +//////////////////////////////////////////////////////////////////////////////// +// multiply C = A x B using 1D threads in P x 1 config +// A (P x Q) in reg., one row per thread +// B (Q x NB) in shared memory +// C in registers -- one row per thread +// no sync at the end of the function +template +static __device__ __inline__ void mul_rAsBrC_1D_nosync(const int tx, T rA[Q], T *sB, int sldb, T rC[NB]) { + T rB[Q]; +#pragma unroll + for (int i = 0; i < NB; i++) { +#pragma unroll + for (int k = 0; k < Q; k++) { + rB[k] = sB[i * sldb + k]; + } + rC[i] = 0.0; +#pragma unroll + for (int k = 0; k < Q; k++) { + rC[i] += rA[k] * rB[k]; + } + } +} + +//////////////////////////////////////////////////////////////////////////////// +// multiply C += A x B using 1D threads in P x 1 config +// A (P x Q) in reg., one row per thread +// B (Q x NB) in shared memory +// C in registers -- one row per thread +// no sync at the end of the function +template +static __device__ __inline__ void addmul_rAsBrC_1D_nosync(const int tx, T rA[Q], T *sB, int sldb, T rC[NB]) { + T rB[Q]; +#pragma unroll + for (int i = 0; i < NB; i++) { +#pragma unroll + for (int k = 0; k < Q; k++) { + rB[k] = sB[i * sldb + k]; + } +#pragma unroll + for (int k = 0; k < Q; k++) { + rC[i] += rA[k] * rB[k]; + } + } +} + +#endif // CEED_MAGMA_COMMON_NONTENSOR_H diff --git a/include/ceed/jit-source/magma/magma-common-tensor.h b/include/ceed/jit-source/magma/magma-common-tensor.h new file mode 100644 index 0000000000..1ca3f52758 --- /dev/null +++ b/include/ceed/jit-source/magma/magma-common-tensor.h @@ -0,0 +1,207 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for MAGMA backend common tensor basis definitions +#ifndef CEED_MAGMA_COMMON_TENSOR_H +#define CEED_MAGMA_COMMON_TENSOR_H + +#include "magma-common-defs.h" + +////////////////////////////////////////////////////////////////////////////////////////// +// read U or V of a 1D element into shared memory sU[][] or sV[][] -- for all components +// the devptr is assumed to point directly to the element +// must sync after call +template +static __device__ __inline__ void read_1d(const T *devptr, const int compstride, T *sBuffer[NUM_COMP], const int tx) { + if (tx < LENGTH) { + for (int comp = 0; comp < NUM_COMP; comp++) { + sBuffer[comp][tx] = devptr[comp * compstride + tx]; + } + } +} + +////////////////////////////////////////////////////////////////////////////////////////// +// write V of a 1D element into global memory from sV[][] -- for all components +// the devptr is assumed to point directly to the element +template +static __device__ __inline__ void write_1d(T *sBuffer[NUM_COMP], T *devptr, const int compstride, const int tx) { + if (tx < LENGTH) { + for (int comp = 0; comp < NUM_COMP; comp++) { + devptr[comp * compstride + tx] = sBuffer[comp][tx]; + } + } +} + +////////////////////////////////////////////////////////////////////////////////////////// +// read U of a 2D element into registers rU[][][] -- for all components of a single dim +// dU is assumed to be offset by elem-stride and dim-stride +// register is assumed to be rU[DIM_U][NUM_COMP][rU_SIZE] +// i_DIM specifies which dimension is being read into in rU +// rU_SIZE can be different from P (e.g. MAXP_Q) +// sTmp is a shared memory workspace of size P^2 +template +static __device__ __inline__ void readU_2d(const T *dU, const int compstride, T rU[DIM_U][NUM_COMP][rU_SIZE], T *sTmp, const int tx) { + // read U as a batch P of (1 x P_) vectors + // vec 0 : [u0, u1, u2, ... u_(P-1)] -- contiguous in memory + // vec 1 : [u0, u1, u2, ... u_(P-1)] -- contiguous in memory + // ... + // vec P-1: [u0, u1, u2, ... u_(P-1)] -- contiguous in memory + // threads collaboratively read vec0 and then vec1 and so on + // but for the kernel, we want + // thread 0 to hold all of vec0 in registers, and + // thread 1 to hold all of vec1 in registers, and and so on + // so we need to transpose + for (int comp = 0; comp < NUM_COMP; comp++) { + // read from global memory into shared memory + if (tx < P) { + for (int i = 0; i < P; i++) { + sTmp[i * P + tx] = dU[comp * compstride + i * P + tx]; + } + } + __syncthreads(); + + if (tx < P) { + for (int i = 0; i < P; i++) { + rU[i_DIM][comp][i] = sTmp[tx * P + i]; + } + } + __syncthreads(); + } +} + +////////////////////////////////////////////////////////////////////////////////////////// +// read V of a 2D element into registers rV[][][] -- for all components of a single dim +// dV is assumed to be offset by elem-stride and dim-stride +// register is assumed to be rV[DIM_V][NUM_COMP][rV_SIZE] +// i_DIM specifies which dimension is being read into in rV +// rV_SIZE can be different from P (e.g. 
MAXP_Q) +template +static __device__ __inline__ void readV_2d(const T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) { + if (tx < Q) { + for (int comp = 0; comp < NUM_COMP; comp++) { + for (int j = 0; j < Q; j++) { + rV[i_DIM][comp][j] = dV[comp * compstride + j * Q + tx]; + } + } + } +} + +////////////////////////////////////////////////////////////////////////////////////////// +// write V of a 2D element from registers rV[][][] to global memory -- for all components of a single dim +// dV is assumed to be offset by elem-stride and dim-stride +// register is assumed to be rV[DIM_V][NUM_COMP][rV_SIZE] +// i_DIM specifies which dimension is being read from in rV +// idim specifies which dimension is being written to in dV +// rV_SIZE can be different from P (e.g. MAXP_Q) +template +static __device__ __inline__ void writeV_2d(T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) { + if (tx < Q) { + for (int comp = 0; comp < NUM_COMP; comp++) { + for (int j = 0; j < Q; j++) { + dV[comp * compstride + j * Q + tx] = rV[i_DIM][comp][j]; + } + } + } +} + +////////////////////////////////////////////////////////////////////////////////////////// +// read U of a 3D element into registers rU[][][] -- for all components of a single dim +// dU is assumed to be offset by elem-stride and dim-stride +// register is assumed to be rU[DIM_U][NUM_COMP][rU_SIZE] +// i_DIM specifies which dimension is being read into in rU +// rU_SIZE can be different from P (e.g. MAXP_Q) +// sTmp is a shared memory workspace of size P^3 +template +static __device__ __inline__ void readU_3d(const T *dU, const int compstride, T rU[DIM_U][NUM_COMP][rU_SIZE], T *sTmp, const int tx) { + // read U as a batch P^2 of (1 x P_) vectors + // vec 0 : [u0, u1, u2, ... u_(P-1)] -- contiguous in memory + // vec 1 : [u0, u1, u2, ... u_(P-1)] -- contiguous in memory + // ... + // vec P^2-1: [u0, u1, u2, ... u_(P-1)] -- contiguous in memory + // threads collaboratively read vec0 and then vec1 and so on + // but for the kernel, we want + // thread 0 to hold all of vec0 in registers, and + // thread 1 to hold all of vec1 in registers, and and so on + // so we need to transpose + for (int comp = 0; comp < NUM_COMP; comp++) { + // read from global memory into shared memory + if (tx < P * P) { + for (int i = 0; i < P; i++) { + sTmp[i * P * P + tx] = dU[comp * compstride + i * P * P + tx]; + } + } + __syncthreads(); + + if (tx < P * P) { + for (int i = 0; i < P; i++) { + rU[i_DIM][comp][i] = sTmp[tx * P + i]; + } + } + __syncthreads(); + } +} + +////////////////////////////////////////////////////////////////////////////////////////// +// read V of a 3D element into registers rV[][][] -- for all components of a single dim +// dV is assumed to be offset by elem-stride and dim-stride +// register is assumed to be rV[DIM_V][NUM_COMP][rV_SIZE] +// i_DIM specifies which dimension is being read into in rV +// rV_SIZE can be different from P (e.g. 
MAXP_Q) +template +static __device__ __inline__ void readV_3d(const T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) { + if (tx < Q * Q) { + for (int comp = 0; comp < NUM_COMP; comp++) { + for (int j = 0; j < Q; j++) { + rV[i_DIM][comp][j] = dV[comp * compstride + j * (Q * Q) + tx]; + } + } + } +} + +////////////////////////////////////////////////////////////////////////////////////////// +// write V of a 3D element from registers rV[][][] to global memory -- for all components of a single dim +// dV is assumed to point directly to the element (i.e. already offset by elem-stride) +// register is assumed to be rV[DIM_V][NUM_COMP][rV_SIZE] +// i_DIM specifies which dimension is being read from in rV +// idim specifies which dimension is being written to in dV +// rV_SIZE can be different from P (e.g. MAXP_Q) +template +static __device__ __inline__ void writeV_3d(T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) { + if (tx < (Q * Q)) { + for (int comp = 0; comp < NUM_COMP; comp++) { + for (int j = 0; j < Q; j++) { + dV[comp * compstride + j * (Q * Q) + tx] = rV[i_DIM][comp][j]; + } + } + } +} + +////////////////////////////////////////////////////////////////////////////////////////// +// reads T into shared memory +// must sync after call +template +static __device__ __inline__ void dread_T_gm2sm(const int tx, const magma_trans_t transT, const CeedScalar *dT, CeedScalar *sT) { + if (transT == MagmaNoTrans) { + // T is B x J + if (tx < B) { + for (int i = 0; i < J; i++) { + sT[i * B + tx] = dT[i * B + tx]; + } + } + } else { + // T is J x B + if (tx < J) { + for (int i = 0; i < B; i++) { + sT[tx * B + i] = dT[i * J + tx]; + } + } + } + // must sync after call +} + +#endif // CEED_MAGMA_COMMON_TENSOR_H diff --git a/include/ceed/jit-source/magma/magma_common_nontensor.h b/include/ceed/jit-source/magma/magma_common_nontensor.h deleted file mode 100644 index edfd805db8..0000000000 --- a/include/ceed/jit-source/magma/magma_common_nontensor.h +++ /dev/null @@ -1,166 +0,0 @@ -// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#ifndef CEED_MAGMA_COMMON_NONTENSOR_H -#define CEED_MAGMA_COMMON_NONTENSOR_H - -#define NONTENSOR_MAX_THREADS (128) - -#ifndef MAGMA_DEVICE_SHARED -#define MAGMA_DEVICE_SHARED -#ifdef CEED_MAGMA_USE_HIP -#define MAGMA_DEVICE_SHARED(type, name) HIP_DYNAMIC_SHARED(type, name) -#else -#define MAGMA_DEVICE_SHARED(type, name) extern __shared__ type name[]; -#endif // CEED_MAGMA_USE_HIP -#endif // MAGMA_DEVICE_SHARED - -#define MAGMA_NONTENSOR_BASIS_NTCOL(N) (MAGMA_MAX(1, (NONTENSOR_MAX_THREADS / (N)))) - -#define dA(i, j) dA[(j)*ldda + (i)] -#define sA(i, j) sA[(j)*slda + (i)] -#define dB(i, j) dB[(j)*lddb + (i)] -#define sB(i, j) sB[(j)*sldb + (i)] - -//////////////////////////////////////////////////////////////////////////////// -// read C from global to reg. -// C is (P_ x NB_) -// 1D thread config. with (Mx1) threads -// no sync at the end of the function -template -static __device__ __inline__ void read_C_g2r_1D_nosync(const int tx, const int n, T *dC, int lddc, const T &beta, T rC[NB_]) { - if (n != NB_) { -#pragma unroll - for (int j = 0; j < NB_; j++) { - rC[j] = (j < n) ? 
beta * dC[j * lddc + tx] : 0; - } - } else { -#pragma unroll - for (int j = 0; j < NB_; j++) { - rC[j] = beta * dC[j * lddc + tx]; - } - } -} - -//////////////////////////////////////////////////////////////////////////////// -// write C from reg. to global -// C is (P_ x NB_) -// 1D thread config. with (Mx1) threads -// no sync at the end of the function -template -static __device__ __inline__ void write_C_r2g_1D_nosync(const int tx, const int n, T rC[NB_], T *dC, int lddc) { - if (n != NB_) { -#pragma unroll - for (int j = 0; j < NB_; j++) { - if (j < n) { - dC[j * lddc + tx] = rC[j]; - } - } - } else { -#pragma unroll - for (int j = 0; j < NB_; j++) { - dC[j * lddc + tx] = rC[j]; - } - } -} - -//////////////////////////////////////////////////////////////////////////////// -// read A (no-trans) from global to reg. -// A is (P_ x Q_) -// 1D thread config. with (Mx1) threads -// no sync at the end of the function -template -static __device__ __inline__ void read_A_notrans_g2r_1D_nosync(const int tx, const T *dA, int ldda, T *sA, int slda, T rA[Q_]) { -#pragma unroll - for (int j = 0; j < Q_; j++) { - rA[j] = dA(tx, j); - } -} - -//////////////////////////////////////////////////////////////////////////////// -// read A (no-trans) from global to reg. -// A is (P_ x Q_) -// 1D thread config. with (Mx1) threads -// no sync at the end of the function -template -static __device__ __inline__ void read_A_trans_g2r_1D_nosync(const int tx, const int ty, const T *dA, int ldda, T *sA, int slda, T rA[Q_]) { - int ix = 0; - const int nTH = P_ * MAGMA_NONTENSOR_BASIS_NTCOL(P_); - const int tid = ty * blockDim.x + tx; - -#pragma unroll - for (ix = 0; ix < (Q_ * P_) - nTH; ix += nTH) { - sA[ix + tid] = dA[ix + tid]; - } - - if (tid < ((Q_ * P_) - ix)) { - sA[ix + tid] = dA[ix + tid]; - } - __syncthreads(); - -#pragma unroll - for (int j = 0; j < Q_; j++) { - rA[j] = sA[tx * slda + j]; - } -} - -//////////////////////////////////////////////////////////////////////////////// -// read B from global to shared -// B is (Q_ x NB_) -// 1D thread config. 
with (Mx1) threads -// no sync at the end of the function -template -static __device__ __inline__ void read_B_g2s_1D_nosync(const int tx, int n, const T *dB, int lddb, T *sB, int sldb) { - if (n != NB_) { - for (int i = 0; i < (Q_ * n) - P_; i += P_) { - sB[i + tx] = dB[i + tx]; - } - } else { -#pragma unroll - for (int i = 0; i < (Q_ * NB_) - P_; i += P_) { - sB[i + tx] = dB[i + tx]; - } - } - - // cleanup for B - const int stride = MAGMA_ROUNDUP(Q_ * n - P_, P_); - if (tx < (Q_ * n) - stride) { - sB[stride + tx] = dB[stride + tx]; - } -} - -//////////////////////////////////////////////////////////////////////////////// -// multiply C = AxB using 1D threads in Mx1 config -// A (MxK) in reg., one row per thread -// B (KxNB) in shared memory -// C in registers -- one row per thread -// no sync at the end of the function -template -static __device__ __inline__ void mul_rAsBrC_1D_nosync(const int tx, const T &alpha, T rA[Q_], T *sB, int sldb, T rC[NB_]) { - T rB[Q_] = {0}; -#pragma unroll - for (int i = 0; i < NB_; i++) { -#pragma unroll - for (int k = 0; k < Q_; k++) { - rB[k] = sB[i * sldb + k]; - } - - T rTmp = 0; -#pragma unroll - for (int k = 0; k < Q_; k++) { - rTmp += rA[k] * rB[k]; - } - rC[i] += alpha * rTmp; - } -} - -#undef dA -#undef sA -#undef dB -#undef sB - -#endif // CEED_MAGMA_COMMON_NONTENSOR_H diff --git a/include/ceed/jit-source/magma/magma_common_tensor.h b/include/ceed/jit-source/magma/magma_common_tensor.h deleted file mode 100644 index 48ad0fa195..0000000000 --- a/include/ceed/jit-source/magma/magma_common_tensor.h +++ /dev/null @@ -1,284 +0,0 @@ -// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -#ifndef CEED_MAGMA_COMMON_TENSOR_H -#define CEED_MAGMA_COMMON_TENSOR_H - -#define MAGMA_MAXTHREADS_1D 128 -#define MAGMA_MAXTHREADS_2D 128 -#define MAGMA_MAXTHREADS_3D 64 -// Define macro for determining number of threads in y-direction -// for basis kernels -#define MAGMA_BASIS_NTCOL(x, maxt) (((maxt) < (x)) ? 
1 : ((maxt) / (x))) -// Define macro for computing the total threads in a block -// for use with __launch_bounds__() -#define MAGMA_BASIS_BOUNDS(x, maxt) (x * MAGMA_BASIS_NTCOL(x, maxt)) - -////////////////////////////////////////////////////////////////////////////////////////// -// read U or V of a 1D element into shared memory sU[][] or sV[][] -- for all components -// the devptr is assumed to point directly to the element -// must sync after call -template -__device__ __inline__ void read_1d(const T *devptr, const int compstride, T *sBuffer[NCOMP_], const int tx) { - if (tx < LENGTH) { - for (int icomp = 0; icomp < NCOMP_; icomp++) { - sBuffer[icomp][tx] = devptr[icomp * compstride + tx]; - } - } -} - -////////////////////////////////////////////////////////////////////////////////////////// -// write V of a 1D element into global memory from sV[][] -- for all components -// the devptr is assumed to point directly to the element -template -__device__ __inline__ void write_1d(T *sBuffer[NCOMP_], T *devptr, const int compstride, const int tx) { - if (tx < LENGTH) { - for (int icomp = 0; icomp < NCOMP_; icomp++) { - devptr[icomp * compstride + tx] = sBuffer[icomp][tx]; - } - } -} - -////////////////////////////////////////////////////////////////////////////////////////// -// read U of a 2D element into registers rU[][][] -- for all components of a single dim -// dU is assumed to be offset by elem-stride and dim-stride -// register is assumed to be rU[DIMU][NCOMP_][rUsize] -// iDIM specifies which dimension is being read into in rU -// rUsize can be different from P_ (e.g. MAXP_Q) -// sTmp is a shared memory workspace of size P_^2 -template -__device__ __inline__ void readU_2d(const T *dU, const int compstride, T rU[DIMU][NCOMP_][rUsize], T *sTmp, const int tx) { - // read U as a batch P_ of (1xP_) vectors - // vec 0 : [u0, u1, u2, ... u_(P_-1)] -- contiguous in memory - // vec 1 : [u0, u1, u2, ... u_(P_-1)] -- contiguous in memory - // ... - // vec P_-1: [u0, u1, u2, ... u_(P_-1)] -- contiguous in memory - // threads collaboratively read vec0 and then vec1 and so on - // but for the kernel, we want - // thread 0 to hold all of vec0 in registers, and - // thread 1 to hold all of vec1 in registers, and and so on - // so we need to transpose - for (int icomp = 0; icomp < NCOMP_; icomp++) { - // read from global memory into shared memory - if (tx < P_) { - for (int i = 0; i < P_; i++) { - sTmp[i * P_ + tx] = dU[icomp * compstride + i * P_ + tx]; - } - } - __syncthreads(); - - if (tx < P_) { - for (int i = 0; i < P_; i++) { - rU[iDIM][icomp][i] = sTmp[tx * P_ + i]; - } - } - __syncthreads(); - } -} - -////////////////////////////////////////////////////////////////////////////////////////// -// read V of a 2D element into registers rV[][][] -- for all components of a single dim -// dV is assumed to be offset by elem-stride and dim-stride -// register is assumed to be rV[DIMV][NCOMP_][rVsize] -// iDIM specifies which dimension is being read into in rV -// rVsize can be different from P_ (e.g. 
MAXP_Q) -template -__device__ __inline__ void readV_2d(const T *dV, const int compstride, T rV[DIMV][NCOMP_][rVsize], const int tx) { - if (tx < Q_) { - for (int icomp = 0; icomp < NCOMP_; icomp++) { - for (int j = 0; j < Q_; j++) { - rV[iDIM][icomp][j] = dV[icomp * compstride + j * Q_ + tx]; - } - } - } -} - -////////////////////////////////////////////////////////////////////////////////////////// -// write V of a 2D element from registers rV[][][] to global memory -- for all components of a single dim -// dV is assumed to be offset by elem-stride and dim-stride -// register is assumed to be rV[DIMV][NCOMP_][rVsize] -// iDIM specifies which dimension is being read from in rV -// idim specifies which dimension is being written to in dV -// rVsize can be different from P_ (e.g. MAXP_Q) -template -__device__ __inline__ void writeV_2d(T *dV, const int compstride, T rV[DIMV][NCOMP_][rVsize], const int tx) { - if (tx < Q_) { - for (int icomp = 0; icomp < NCOMP_; icomp++) { - for (int j = 0; j < Q_; j++) { - dV[icomp * compstride + j * Q_ + tx] = rV[iDIM][icomp][j]; - } - } - } -} - -////////////////////////////////////////////////////////////////////////////////////////// -// read U of a 3D element into registers rU[][][] -- for all components of a single dim -// dU is assumed to be offset by elem-stride and dim-stride -// register is assumed to be rU[DIMU][NCOMP_][rUsize] -// iDIM specifies which dimension is being read into in rU -// rUsize can be different from P_ (e.g. MAXP_Q) -// sTmp is a shared memory workspace of size P_^3 -template -__device__ __inline__ void readU_3d(const T *dU, const int compstride, T rU[DIMU][NCOMP_][rUsize], T *sTmp, const int tx) { - // read U as a batch P_^2 of (1xP_) vectors - // vec 0 : [u0, u1, u2, ... u_(P_-1)] -- contiguous in memory - // vec 1 : [u0, u1, u2, ... u_(P_-1)] -- contiguous in memory - // ... - // vec P_^2-1: [u0, u1, u2, ... u_(P_-1)] -- contiguous in memory - // threads collaboratively read vec0 and then vec1 and so on - // but for the kernel, we want - // thread 0 to hold all of vec0 in registers, and - // thread 1 to hold all of vec1 in registers, and and so on - // so we need to transpose - for (int icomp = 0; icomp < NCOMP_; icomp++) { - // read from global memory into shared memory - if (tx < P_ * P_) { - for (int i = 0; i < P_; i++) { - sTmp[i * P_ * P_ + tx] = dU[icomp * compstride + i * P_ * P_ + tx]; - } - } - __syncthreads(); - - if (tx < P_ * P_) { - for (int i = 0; i < P_; i++) { - rU[iDIM][icomp][i] = sTmp[tx * P_ + i]; - } - } - __syncthreads(); - } -} - -////////////////////////////////////////////////////////////////////////////////////////// -// read V of a 3D element into registers rV[][][] -- for all components of a single dim -// dV is assumed to be offset by elem-stride and dim-stride -// register is assumed to be rV[DIMV][NCOMP_][rVsize] -// iDIM specifies which dimension is being read into in rV -// rVsize can be different from P_ (e.g. MAXP_Q) -template -__device__ __inline__ void readV_3d(const T *dV, const int compstride, T rV[DIMV][NCOMP_][rVsize], const int tx) { - if (tx < Q_ * Q_) { - for (int icomp = 0; icomp < NCOMP_; icomp++) { - for (int j = 0; j < Q_; j++) { - rV[iDIM][icomp][j] = dV[icomp * compstride + j * (Q_ * Q_) + tx]; - } - } - } -} - -////////////////////////////////////////////////////////////////////////////////////////// -// write V of a 3D element from registers rV[][][] to global memory -- for all components of a single dim -// dV is assumed to point directly to the element (i.e. 
already offset by elem-stride) -// register is assumed to be rV[DIMV][NCOMP_][rVsize] -// iDIM specifies which dimension is being read from in rV -// idim specifies which dimension is being written to in dV -// rVsize can be different from P_ (e.g. MAXP_Q) -template -__device__ __inline__ void writeV_3d(T *dV, const int compstride, T rV[DIMV][NCOMP_][rVsize], const int tx) { - if (tx < (Q_ * Q_)) { - for (int icomp = 0; icomp < NCOMP_; icomp++) { - for (int j = 0; j < Q_; j++) { - dV[icomp * compstride + j * (Q_ * Q_) + tx] = rV[iDIM][icomp][j]; - } - } - } -} - -////////////////////////////////////////////////////////////////////////////////////////// -// reads T into shared memory -// must sync after call -template -__device__ __inline__ void dread_T_gm2sm(const int tx, const magma_trans_t transT, const CeedScalar *dT, CeedScalar *sT) { - if (transT == MagmaNoTrans) { - // T is B x J - if (tx < B) { - for (int i = 0; i < J; i++) { - sT[i * B + tx] = dT[i * B + tx]; - } - } - } else { - // T is J x B - if (tx < J) { - for (int i = 0; i < B; i++) { - sT[tx * B + i] = dT[i * J + tx]; - } - } - } - // must sync after call -} - -////////////////////////////////////////////////////////////////////////////////////////// -// reads a slice of U from shared/global memory into registers -// the correct pointer U must be precomputed -template -__device__ __inline__ void dread_U_gsm2reg(const int C, const int tx_, const CeedScalar *U, CeedScalar rU[B]) { - for (int i = 0; i < B; i++) { - rU[i] = U[i * C + tx_]; - } -} - -////////////////////////////////////////////////////////////////////////////////////////// -// reads a slice of V from shared/global memory into registers with scaling -// the correct pointer V must be precomputed -template -__device__ __inline__ void dread_V_gsm2reg(const int C, const int tx_, const CeedScalar *V, CeedScalar rV[J]) { - for (int i = 0; i < J; i++) { - rV[i] = V[i * C + tx_]; - } -} - -////////////////////////////////////////////////////////////////////////////////////////// -// writes a slice of V from reg to shared/global memory -// the correct pointer V must be precomputed -template -__device__ __inline__ void dwrite_V_reg2gsm(const int C, const int tx_, CeedScalar rV[J], CeedScalar *V) { - for (int i = 0; i < J; i++) { - V[i * C + tx_] = rV[i]; - } -} - -////////////////////////////////////////////////////////////////////////////////////////// -// multiply a slice of U times T to produce a slice of V -template -__device__ __inline__ void dgemm_slice(CeedScalar alpha, CeedScalar *sT, CeedScalar rU[B], CeedScalar beta, CeedScalar rV[J]) { - CeedScalar rTmp; - for (int j = 0; j < J; j++) { - rTmp = 0.0; - for (int b = 0; b < B; b++) { - rTmp += rU[b] * sT[j * B + b]; - } - rV[j] *= beta; - rV[j] += alpha * rTmp; - } -} - -////////////////////////////////////////////////////////////////////////////////////////// -template -__device__ __inline__ void dgemm_ceed_device(const int tx, const int A, const int C, magma_trans_t transT, CeedScalar *sT, const CeedScalar alpha, - const CeedScalar beta, const CeedScalar *dU, CeedScalar *dV, CeedScalar rU[B], CeedScalar rV[J]) { - const int tx_ = tx % C; - const int slice_id = tx / C; - - // advance pointers for U and V - dU += slice_id * C * B; - dV += slice_id * C * J; - - // read V if beta is non-zero - if (beta != 0.0) { - dread_V_gsm2reg(C, tx_, (const CeedScalar *)dV, rV); - } - - // read U - dread_U_gsm2reg(C, tx_, dU, rU); - - // multiply - dgemm_slice(alpha, sT, rU, beta, rV); - - // write V back - dwrite_V_reg2gsm(C, 
tx_, rV, dV); -} - -#endif // CEED_MAGMA_COMMON_TENSOR_H
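One observation on the non-tensor cleanup visible in this diff: the deleted helpers carried generic alpha/beta GEMM scaling (read_C_g2r_1D_nosync pre-scaled C by beta, mul_rAsBrC_1D_nosync applied alpha), while the new magma-common-nontensor.h keeps only the two cases the basis kernels use. The sketch below is an editorial reading of that correspondence, not something stated in the code; names rA, sB, rC, sldb follow the headers, with alpha fixed at 1 and beta at 0 (mul_*) or 1 (addmul_*):

// Editorial sketch: the specialized multiply both new helpers implement.
// accumulate == false corresponds to mul_rAsBrC_1D_nosync (rC overwritten),
// accumulate == true to addmul_rAsBrC_1D_nosync (rC pre-loaded and accumulated).
template <int Q, int NB>
static void ref_mul_rAsBrC(const double rA[Q], const double *sB, int sldb, double rC[NB], bool accumulate) {
  for (int i = 0; i < NB; i++) {
    double acc = 0.0;
    for (int k = 0; k < Q; k++) acc += rA[k] * sB[i * sldb + k];
    rC[i] = accumulate ? rC[i] + acc : acc;
  }
}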