
Commit fbbb68f
style - more pointer fixes
jeremylt committed Aug 13, 2024
1 parent 8e5b6fd
Showing 18 changed files with 125 additions and 125 deletions.
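Every hunk below applies the same stylistic rewrite: pointer arithmetic of the form ptr + offset becomes the equivalent address-of-subscript form &ptr[offset]. In C/C++/CUDA the two spellings denote the same address, so the commit changes style only, not behavior. A minimal standalone sketch of the equivalence (demo names such as buf, elem, stride, and comp are illustrative only, not from the libCEED sources):

  #include <assert.h>

  int main(void) {
    double buf[12];
    const int elem = 2, stride = 4, comp = 1;
    // Old style: raw pointer arithmetic
    const double *a = buf + elem * stride + comp;
    // New style: address-of-subscript, as applied throughout this commit
    const double *b = &buf[elem * stride + comp];
    assert(a == b);  // identical addresses; purely a readability change
    return 0;
  }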
20 changes: 10 additions & 10 deletions include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
@@ -67,8 +67,8 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
   if (is_transpose) {
     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
-        const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
-        CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride;
+        const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
+        CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride];
         CeedInt pre = 1;
         CeedInt post = 1;

@@ -85,7 +85,7 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
           for (CeedInt d = 0; d < BASIS_DIM; d++) {
             // Update buffers used
             pre /= 1;
-            const CeedScalar *in = d == 0 ? (cur_u + p) : (d % 2 ? buffer_2 : buffer_1);
+            const CeedScalar *in = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1);
             CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2);

             // Build Chebyshev polynomial values
@@ -134,8 +134,8 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
   } else {
     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
-        const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
-        CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride;
+        const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
+        CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride];
         CeedInt pre = u_size;
         CeedInt post = 1;

@@ -170,7 +170,7 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
             // Update buffers used
             pre /= Q;
             const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1);
-            CeedScalar *out = d == BASIS_DIM - 1 ? (cur_v + p) : (d % 2 ? buffer_1 : buffer_2);
+            CeedScalar *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2);

             // Build Chebyshev polynomial values
             ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x);
@@ -223,7 +223,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is
   if (is_transpose) {
     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
-        CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride;
+        CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride];
         CeedInt pre = 1;
         CeedInt post = 1;

@@ -236,7 +236,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is
         __syncthreads();
         for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
           for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
-            const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride;
+            const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride];

             pre = 1;
             post = 1;
@@ -294,7 +294,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is
   } else {
     for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
       for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
-        const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
+        const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
         CeedInt pre = u_size;
         CeedInt post = 1;

@@ -324,7 +324,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is
         __syncthreads();
         for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
           for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
-            CeedScalar *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride;
+            CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride];

             pre = BASIS_NUM_QPTS;
             post = 1;
8 changes: 4 additions & 4 deletions include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h
@@ -42,8 +42,8 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt is_trans
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
     for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
-      const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
-      CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride;
+      const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
+      CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride];
       CeedInt pre = u_size;
       CeedInt post = 1;

@@ -107,8 +107,8 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt is_transpo
       for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
         CeedInt pre = is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES;
         CeedInt post = 1;
-        const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride;
-        CeedScalar *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride;
+        const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride];
+        CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride];

         for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
           __syncthreads();
2 changes: 1 addition & 1 deletion include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h
@@ -24,7 +24,7 @@ extern "C" __launch_bounds__(BLOCK_SIZE) __global__
                         const CeedInt8 *curl_orients_in, const bool *orients_out, const CeedInt8 *curl_orients_out,
                         const CeedScalar *__restrict__ qf_array, CeedScalar *__restrict__ values_array) {
   extern __shared__ CeedScalar s_CT[];
-  CeedScalar *s_C = s_CT + NUM_NODES_OUT * NUM_NODES_IN;
+  CeedScalar *s_C = &s_CT[NUM_NODES_OUT * NUM_NODES_IN];

   const int l = threadIdx.x;  // The output column index of each B^T D B operation
                               // such that we have (Bout^T)_ij D_jk Bin_kl = C_il
@@ -23,7 +23,7 @@ extern "C" __global__ void AtPointsTranspose(const CeedInt *__restrict__ indices

     if (loc_node >= points_per_elem[elem]) continue;
     for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) {
-      atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]);
+      atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]);
     }
   }
 }
@@ -80,7 +80,7 @@ extern "C" __global__ void CurlOrientedTranspose(const CeedInt *__restrict__ ind
       value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d;
       value +=
           loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0;
-      atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value);
+      atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], value);
     }
   }
 }
@@ -138,7 +138,7 @@ extern "C" __global__ void CurlOrientedUnsignedTranspose(const CeedInt *__restri
       value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d;
      value +=
           loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0;
-      atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value);
+      atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], value);
     }
   }
 }
2 changes: 1 addition & 1 deletion include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h
@@ -36,7 +36,7 @@ extern "C" __global__ void OffsetTranspose(const CeedInt *__restrict__ indices,
     const CeedInt elem = node / RSTR_ELEM_SIZE;

     for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) {
-      atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]);
+      atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]);
     }
   }
 }
@@ -40,7 +40,7 @@ extern "C" __global__ void OrientedTranspose(const CeedInt *__restrict__ indices
     const CeedInt elem = node / RSTR_ELEM_SIZE;

     for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) {
-      atomicAdd(v + ind + comp * RSTR_COMP_STRIDE,
+      atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE],
                 u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * (orient ? -1.0 : 1.0));
     }
   }
80 changes: 40 additions & 40 deletions include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
@@ -52,7 +52,7 @@ inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedSca
 template <int NUM_COMP, int P_1D, int Q_1D>
 inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_V + comp);
+    ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
   }
 }

@@ -63,7 +63,7 @@ template <int NUM_COMP, int P_1D, int Q_1D>
 inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                          CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_V + comp);
+    ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
   }
 }

@@ -74,7 +74,7 @@ template <int NUM_COMP, int P_1D, int Q_1D>
 inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                               CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_V + comp);
+    ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
   }
 }

@@ -85,7 +85,7 @@ template <int NUM_COMP, int P_1D, int Q_1D>
 inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                        CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_V + comp);
+    ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
   }
 }

@@ -188,8 +188,8 @@ inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *_
                                       CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t);
-    ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp);
+    ContractX2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, r_t);
+    ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, &r_V[comp]);
   }
 }

@@ -201,8 +201,8 @@ inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const Ceed
                                                CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t);
-    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp);
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, r_t);
+    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, &r_V[comp]);
   }
 }

@@ -214,10 +214,10 @@ inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__r
                                     CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_t);
-    ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp + 0 * NUM_COMP);
-    ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t);
-    ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, r_V + comp + 1 * NUM_COMP);
+    ContractX2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, r_t);
+    ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
+    ContractX2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, r_t);
+    ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
   }
 }

@@ -229,10 +229,10 @@ inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedSc
                                              CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp + 0 * NUM_COMP, c_B, r_t);
-    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, r_V + comp);
-    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp + 1 * NUM_COMP, c_G, r_t);
-    ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp);
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
+    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, &r_V[comp]);
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
+    ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, &r_V[comp]);
   }
 }

@@ -423,9 +423,9 @@ inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *_
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_B, r_t1);
+    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * Q_1D);
+    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]);
   }
 }

@@ -438,9 +438,9 @@ inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const Ceed
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * Q_1D, c_B, r_t1);
+    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * Q_1D], c_B, r_t1);
     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * P_1D);
+    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
   }
 }

@@ -453,15 +453,15 @@ inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__r
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_G, r_t1);
+    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_G, r_t1);
     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D);
-    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_B, r_t1);
+    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
+    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_G, r_t2);
-    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D);
-    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_B, r_t1);
+    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
+    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D);
+    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
   }
 }

@@ -474,15 +474,15 @@ inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedSc
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, c_B, r_t1);
+    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1);
     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, r_V + comp * P_1D);
-    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, c_B, r_t1);
+    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, &r_V[comp * P_1D]);
+    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1);
     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_G, r_t2);
-    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * P_1D);
-    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, c_G, r_t1);
+    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
+    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1);
     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * P_1D);
+    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
   }
 }

@@ -495,12 +495,12 @@ inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedS
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_B, r_t1);
+    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
     ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
     ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_t1);
-    ContractX3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D);
-    ContractY3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D);
-    ContractZ3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D);
+    ContractX3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
+    ContractY3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
+    ContractZ3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
   }
 }

@@ -513,12 +513,12 @@ inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, co
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, c_G, r_t2);
-    ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, c_G, r_t2);
-    ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, c_G, r_t2);
+    ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2);
+    ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2);
+    ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2);
     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_t1);
     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * P_1D);
+    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
   }
 }
