Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AtPoints for */gen #1715

Merged
merged 6 commits into from
Jan 2, 2025
Merged

AtPoints for */gen #1715

merged 6 commits into from
Jan 2, 2025

Conversation

jeremylt
Copy link
Member

@jeremylt jeremylt commented Dec 5, 2024

t590 works, now on to t591-t594

  • Basic skeleton
  • Fix delegation of AtPoints Operator
  • Fix diagonal assembly t594
  • Test 3d
  • HIP
  • Ratel

@jeremylt
Copy link
Member Author

jeremylt commented Dec 12, 2024

Giving wrong results, but the code is compiling:

#define T_1D 5
#include <ceed/jit-source/cuda/cuda-jit.h>

// Tensor basis source
#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>

// AtPoints basis source
#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h>

// CodeGen operator source
#include <ceed/jit-source/cuda/cuda-gen-templates.h>


#undef CEED_Q_VLA
#define CEED_Q_VLA 1

// User QFunction source
#include "/home/jeremy/Dev/libCEED/tests/t590-operator.h"


// -----------------------------------------------------------------------------
// Operator Kernel
// 
// d_[in,out]_i:   CeedVector device array
// r_[in,out]_e_i: Element vector register
// r_[in,out]_q_i: Quadrature space vector register
// r_[in,out]_c_i: AtPoints Chebyshev coefficents register
// r_[in,out]_s_i: Quadrature space slice vector register
// 
// s_B_[in,out]_i: Interpolation matrix, shared memory
// s_G_[in,out]_i: Gradient matrix, shared memory
// -----------------------------------------------------------------------------
extern "C" __global__ void CeedKernelCudaGenOperator_mass(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda points) {
  const CeedScalar *d_in_0 = fields.inputs[0];
  CeedScalar *d_out_0 = fields.outputs[0];
  const CeedInt dim = 2;
  const CeedInt Q_1d = 5;
  const CeedInt max_num_points = 4;
  extern __shared__ CeedScalar slice[];
  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;
  data.slice = slice + data.t_id_z*T_1D*T_1D;

  // Input field constants and basis data
  // -- Input field 0
  const CeedInt P_1d_in_0 = 3;
  const CeedInt num_comp_in_0 = 1;
  // EvalMode: interpolation
  __shared__ CeedScalar s_B_in_0[15];
  LoadMatrix<P_1d_in_0, Q_1d>(data, B.inputs[0], s_B_in_0);

  // Output field constants and basis data
  // -- Output field 0
  const CeedInt P_1d_out_0 = 3;
  const CeedInt num_comp_out_0 = 1;
  // EvalMode: interpolation
  __shared__ CeedScalar s_B_out_0[15];
  LoadMatrix<P_1d_out_0, Q_1d>(data, B.outputs[0], s_B_out_0);

  // Element loop
  __syncthreads();
  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {
    // Scratch restriction buffer space
    CeedScalar r_e_scratch[9];

    // -- Input field restrictions and basis actions
    // ---- Input field 0
    CeedScalar *r_e_in_0 = r_e_scratch;
    const CeedInt l_size_in_0 = 49;
    // CompStride: 1
    ReadLVecStandard2d<num_comp_in_0, 1, P_1d_in_0>(data, l_size_in_0, elem, indices.inputs[0], d_in_0, r_e_in_0);
    // EvalMode: interpolation
    CeedScalar r_c_in_0[num_comp_in_0*1];
    InterpTensor2d<num_comp_in_0, P_1d_in_0, Q_1d>(data, r_e_in_0, s_B_in_0, r_c_in_0);

    // -- Output field setup
    // ---- Output field 0
    CeedScalar r_c_out_0[num_comp_out_0*1];
    for (CeedInt i = 0; i < num_comp_out_0*Q_1d; i++) {
      r_c_out_0[i] = 0.0;
    }

    // Note: Using batches of points
    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * max_num_points / (blockDim.x * blockDim.y));

    #pragma unroll
    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
      const CeedInt p = i % max_num_points;

      // -- Coordinates
      CeedScalar r_x[dim];
      ReadPoint<dim, max_num_points>(data, elem, p, max_num_points, 1, num_elem * max_num_points, max_num_points, points.coords, r_x);

      // -- Input fields
      // ---- Input field 0
      // EvalMode: interpolation
      CeedScalar r_s_in_0[num_comp_in_0];
      InterpAtPoints2d<num_comp_in_0, max_num_points, Q_1d>(data, i, r_c_in_0, r_x, r_s_in_0);

      // -- Output fields
      // ---- Output field 0
      CeedScalar r_s_out_0[num_comp_out_0];

      // -- QFunction inputs and outputs
      // ---- Inputs
      CeedScalar *inputs[1];
      // ------ Input field 0
      inputs[0] = r_s_in_0;
      // ---- Outputs
      CeedScalar *outputs[1];
      // ------ Output field 0
      outputs[0] = r_s_out_0;

      // -- Apply QFunction
      mass(ctx, 1, inputs, outputs);

      // -- Output fields
      // ---- Output field 0
      // EvalMode: interpolation
      if (i > points.num_per_elem[elem]) {
        for (CeedInt j = 0; j < num_comp_out_0; j++) r_s_out_0[j] = 0.0;
      }
      InterpTransposeAtPoints2d<num_comp_out_0, max_num_points, Q_1d>(data, i, r_s_out_0, r_x, r_c_out_0);
    }
    __syncthreads();

    // -- Output field basis action and restrictions
    // ---- Output field 0
    // EvalMode: interpolation
    CeedScalar *r_e_out_0 = r_e_scratch;
    InterpTransposeTensor2d<num_comp_out_0, P_1d_out_0, Q_1d>(data, r_c_out_0, s_B_out_0, r_e_out_0);
    const CeedInt l_size_out_0 = 49;
    // CompStride: 1
    WriteLVecStandard2d<num_comp_out_0, 1, P_1d_out_0>(data, l_size_out_0, elem, indices.outputs[0], r_e_out_0, d_out_0);
  }
}
// -----------------------------------------------------------------------------

@jeremylt jeremylt force-pushed the jeremy/at-points-gen branch 2 times, most recently from 4924280 to 38bf749 Compare December 13, 2024 00:32
@jeremylt
Copy link
Member Author

jeremylt commented Dec 13, 2024

CeedOperator AtPoints
  3 elements with 0 quadrature points each
  5 max points per element
  3 fields
  2 input fields:
    Input field 0:
      Name: "u"
      Size: 1
      EvalMode: interpolation
      Active vector
    Input field 1:
      Name: "rho"
      Size: 1
      EvalMode: none
      No basis
  1 output field:
    Output field 0:
      Name: "v"
      Size: 1
      EvalMode: interpolation
      Active vector
[2] Error in assembly: 0.274158 != 0.352888
[3] Error in assembly: 0.282993 != 0.309660
[4] Error in assembly: 0.198367 != 0.120908
[5] Error in assembly: 0.220000 != 0.526667
[6] Error in assembly: 0.080000 != 0.078730

Only diagonal assembly failing for the core tests - I'm suspicious of the delegation?

Edit: Nope, assembly is correct but true values are incorrect for gen? But t590-t593 passes?

@jeremylt jeremylt force-pushed the jeremy/at-points-gen branch 14 times, most recently from 4f6a4a3 to dbb7f4f Compare December 16, 2024 21:22
@jeremylt jeremylt force-pushed the jeremy/at-points-gen branch 7 times, most recently from 867d284 to 9f6cf23 Compare December 18, 2024 17:44
@jeremylt jeremylt force-pushed the jeremy/at-points-gen branch from 9f6cf23 to 3a2968d Compare December 18, 2024 18:03
@jeremylt jeremylt merged commit 390feb5 into main Jan 2, 2025
26 of 28 checks passed
@jeremylt jeremylt deleted the jeremy/at-points-gen branch January 2, 2025 16:09
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

1 participant