AtPoints for */gen #1715

jeremylt · 2024-12-05T21:51:41Z

t590 works, now on to t591-t594

jeremylt · 2024-12-12T21:08:17Z

Giving wrong results, but the code is compiling:

#define T_1D 5
#include <ceed/jit-source/cuda/cuda-jit.h>

// Tensor basis source
#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>

// AtPoints basis source
#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h>

// CodeGen operator source
#include <ceed/jit-source/cuda/cuda-gen-templates.h>


#undef CEED_Q_VLA
#define CEED_Q_VLA 1

// User QFunction source
#include "/home/jeremy/Dev/libCEED/tests/t590-operator.h"


// -----------------------------------------------------------------------------
// Operator Kernel
// 
// d_[in,out]_i:   CeedVector device array
// r_[in,out]_e_i: Element vector register
// r_[in,out]_q_i: Quadrature space vector register
// r_[in,out]_c_i: AtPoints Chebyshev coefficents register
// r_[in,out]_s_i: Quadrature space slice vector register
// 
// s_B_[in,out]_i: Interpolation matrix, shared memory
// s_G_[in,out]_i: Gradient matrix, shared memory
// -----------------------------------------------------------------------------
extern "C" __global__ void CeedKernelCudaGenOperator_mass(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda points) {
  const CeedScalar *d_in_0 = fields.inputs[0];
  CeedScalar *d_out_0 = fields.outputs[0];
  const CeedInt dim = 2;
  const CeedInt Q_1d = 5;
  const CeedInt max_num_points = 4;
  extern __shared__ CeedScalar slice[];
  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;
  data.slice = slice + data.t_id_z*T_1D*T_1D;

  // Input field constants and basis data
  // -- Input field 0
  const CeedInt P_1d_in_0 = 3;
  const CeedInt num_comp_in_0 = 1;
  // EvalMode: interpolation
  __shared__ CeedScalar s_B_in_0[15];
  LoadMatrix<P_1d_in_0, Q_1d>(data, B.inputs[0], s_B_in_0);

  // Output field constants and basis data
  // -- Output field 0
  const CeedInt P_1d_out_0 = 3;
  const CeedInt num_comp_out_0 = 1;
  // EvalMode: interpolation
  __shared__ CeedScalar s_B_out_0[15];
  LoadMatrix<P_1d_out_0, Q_1d>(data, B.outputs[0], s_B_out_0);

  // Element loop
  __syncthreads();
  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {
    // Scratch restriction buffer space
    CeedScalar r_e_scratch[9];

    // -- Input field restrictions and basis actions
    // ---- Input field 0
    CeedScalar *r_e_in_0 = r_e_scratch;
    const CeedInt l_size_in_0 = 49;
    // CompStride: 1
    ReadLVecStandard2d<num_comp_in_0, 1, P_1d_in_0>(data, l_size_in_0, elem, indices.inputs[0], d_in_0, r_e_in_0);
    // EvalMode: interpolation
    CeedScalar r_c_in_0[num_comp_in_0*1];
    InterpTensor2d<num_comp_in_0, P_1d_in_0, Q_1d>(data, r_e_in_0, s_B_in_0, r_c_in_0);

    // -- Output field setup
    // ---- Output field 0
    CeedScalar r_c_out_0[num_comp_out_0*1];
    for (CeedInt i = 0; i < num_comp_out_0*Q_1d; i++) {
      r_c_out_0[i] = 0.0;
    }

    // Note: Using batches of points
    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * max_num_points / (blockDim.x * blockDim.y));

    #pragma unroll
    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
      const CeedInt p = i % max_num_points;

      // -- Coordinates
      CeedScalar r_x[dim];
      ReadPoint<dim, max_num_points>(data, elem, p, max_num_points, 1, num_elem * max_num_points, max_num_points, points.coords, r_x);

      // -- Input fields
      // ---- Input field 0
      // EvalMode: interpolation
      CeedScalar r_s_in_0[num_comp_in_0];
      InterpAtPoints2d<num_comp_in_0, max_num_points, Q_1d>(data, i, r_c_in_0, r_x, r_s_in_0);

      // -- Output fields
      // ---- Output field 0
      CeedScalar r_s_out_0[num_comp_out_0];

      // -- QFunction inputs and outputs
      // ---- Inputs
      CeedScalar *inputs[1];
      // ------ Input field 0
      inputs[0] = r_s_in_0;
      // ---- Outputs
      CeedScalar *outputs[1];
      // ------ Output field 0
      outputs[0] = r_s_out_0;

      // -- Apply QFunction
      mass(ctx, 1, inputs, outputs);

      // -- Output fields
      // ---- Output field 0
      // EvalMode: interpolation
      if (i > points.num_per_elem[elem]) {
        for (CeedInt j = 0; j < num_comp_out_0; j++) r_s_out_0[j] = 0.0;
      }
      InterpTransposeAtPoints2d<num_comp_out_0, max_num_points, Q_1d>(data, i, r_s_out_0, r_x, r_c_out_0);
    }
    __syncthreads();

    // -- Output field basis action and restrictions
    // ---- Output field 0
    // EvalMode: interpolation
    CeedScalar *r_e_out_0 = r_e_scratch;
    InterpTransposeTensor2d<num_comp_out_0, P_1d_out_0, Q_1d>(data, r_c_out_0, s_B_out_0, r_e_out_0);
    const CeedInt l_size_out_0 = 49;
    // CompStride: 1
    WriteLVecStandard2d<num_comp_out_0, 1, P_1d_out_0>(data, l_size_out_0, elem, indices.outputs[0], r_e_out_0, d_out_0);
  }
}
// -----------------------------------------------------------------------------

jeremylt · 2024-12-13T00:58:13Z

CeedOperator AtPoints
  3 elements with 0 quadrature points each
  5 max points per element
  3 fields
  2 input fields:
    Input field 0:
      Name: "u"
      Size: 1
      EvalMode: interpolation
      Active vector
    Input field 1:
      Name: "rho"
      Size: 1
      EvalMode: none
      No basis
  1 output field:
    Output field 0:
      Name: "v"
      Size: 1
      EvalMode: interpolation
      Active vector
[2] Error in assembly: 0.274158 != 0.352888
[3] Error in assembly: 0.282993 != 0.309660
[4] Error in assembly: 0.198367 != 0.120908
[5] Error in assembly: 0.220000 != 0.526667
[6] Error in assembly: 0.080000 != 0.078730

Only diagonal assembly failing for the core tests - I'm suspicious of the delegation?

Edit: Nope, assembly is correct but true values are incorrect for gen? But t590-t593 passes?

jeremylt added enhancement GPU performance CUDA 0-WIP HIP labels Dec 5, 2024

jeremylt self-assigned this Dec 5, 2024

jeremylt force-pushed the jeremy/at-points-gen branch 3 times, most recently from f84f050 to 368f613 Compare December 11, 2024 20:44

jeremylt force-pushed the jeremy/at-points-gen branch 2 times, most recently from 4924280 to 38bf749 Compare December 13, 2024 00:32

jeremylt force-pushed the jeremy/at-points-gen branch 14 times, most recently from 4f6a4a3 to dbb7f4f Compare December 16, 2024 21:22

jeremylt added 2 commits December 16, 2024 16:29

cuda - AtPoints for cuda/gen

8b97b69

pc - fix fallback creation for AtPoints operator

bcd9268

jeremylt added 3 commits December 16, 2024 16:29

minor - fix view for AtPoints operators

99f7f61

gpu - minor fix to 1d AtPoints basis transpose

4eda27c

minor - fix style in gen templates

688b547

jeremylt force-pushed the jeremy/at-points-gen branch 7 times, most recently from 867d284 to 9f6cf23 Compare December 18, 2024 17:44

hip - AtPoints for hip/gen

3a2968d

jeremylt force-pushed the jeremy/at-points-gen branch from 9f6cf23 to 3a2968d Compare December 18, 2024 18:03

jeremylt added 1-In Review and removed 0-WIP labels Dec 18, 2024

jeremylt merged commit 390feb5 into main Jan 2, 2025
26 of 28 checks passed

jeremylt deleted the jeremy/at-points-gen branch January 2, 2025 16:09

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

AtPoints for */gen #1715

AtPoints for */gen #1715

jeremylt commented Dec 5, 2024 •

edited

Loading

jeremylt commented Dec 12, 2024 •

edited

Loading

jeremylt commented Dec 13, 2024 •

edited

Loading

AtPoints for */gen #1715

AtPoints for */gen #1715

Conversation

jeremylt commented Dec 5, 2024 • edited Loading

jeremylt commented Dec 12, 2024 • edited Loading

jeremylt commented Dec 13, 2024 • edited Loading

jeremylt commented Dec 5, 2024 •

edited

Loading

jeremylt commented Dec 12, 2024 •

edited

Loading

jeremylt commented Dec 13, 2024 •

edited

Loading