Skip to content

Commit

Permalink
Merge pull request #1696 from CEED/jeremy/jit-include
Browse files Browse the repository at this point in the history
JiT include update
  • Loading branch information
jeremylt authored Oct 21, 2024
2 parents 95f7ac9 + 6a96780 commit 1dc8b1e
Show file tree
Hide file tree
Showing 184 changed files with 529 additions and 663 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -572,7 +572,7 @@ $(OBJDIR)/%.o : $(CURDIR)/%.sycl.cpp | $$(@D)/.DIR
$(call quiet,SYCLCXX) $(SYCLFLAGS) $(CPPFLAGS) -c -o $@ $(abspath $<)

$(OBJDIR)/%$(EXE_SUFFIX) : tests/%.c | $$(@D)/.DIR
$(call quiet,LINK.c) $(CEED_LDFLAGS) -o $@ $(abspath $<) $(CEED_LIBS) $(CEED_LDLIBS) $(LDLIBS)
$(call quiet,LINK.c) $(CEED_LDFLAGS) -o $@ $(abspath $<) $(CEED_LIBS) $(CEED_LDLIBS) $(LDLIBS) -I./tests/test-include

$(OBJDIR)/%$(EXE_SUFFIX) : tests/%.f90 | $$(@D)/.DIR
$(call quiet,LINK.F) -DSOURCE_DIR='"$(abspath $(<D))/"' $(CEED_LDFLAGS) -o $@ $(abspath $<) $(CEED_LIBS) $(CEED_LDLIBS) $(LDLIBS)
Expand Down
2 changes: 1 addition & 1 deletion backends/avx/ceed-avx-tensor.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#include <immintrin.h>
#include <stdbool.h>

#ifdef CEED_F64_H
#ifdef CEED_SCALAR_IS_FP64
#define rtype __m256d
#define loadu _mm256_loadu_pd
#define storeu _mm256_storeu_pd
Expand Down
45 changes: 12 additions & 33 deletions backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -696,42 +696,17 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
CeedCallBackend(CeedGetData(ceed, &ceed_data));
CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id));
if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) {
char *atomic_add_source;
const char *atomic_add_path;

CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-atomic-add-fallback.h", &atomic_add_path));
CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Atomic Add Source -----\n");
CeedCallBackend(CeedLoadSourceToBuffer(ceed, atomic_add_path, &atomic_add_source));
code << atomic_add_source;
CeedCallBackend(CeedFree(&atomic_add_path));
CeedCallBackend(CeedFree(&atomic_add_source));
code << "// AtomicAdd fallback source\n";
code << "#include <ceed/jit-source/cuda/cuda-atomic-add-fallback.h>\n\n";
}
}

// Load basis source files
// TODO: Add non-tensor, AtPoints
{
char *tensor_basis_kernel_source;
const char *tensor_basis_kernel_path;

CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h", &tensor_basis_kernel_path));
CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Tensor Basis Kernel Source -----\n");
CeedCallBackend(CeedLoadSourceToBuffer(ceed, tensor_basis_kernel_path, &tensor_basis_kernel_source));
code << tensor_basis_kernel_source;
CeedCallBackend(CeedFree(&tensor_basis_kernel_path));
CeedCallBackend(CeedFree(&tensor_basis_kernel_source));
}
{
char *cuda_gen_template_source;
const char *cuda_gen_template_path;

CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-gen-templates.h", &cuda_gen_template_path));
CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Cuda-Gen Template Source -----\n");
CeedCallBackend(CeedLoadSourceToBuffer(ceed, cuda_gen_template_path, &cuda_gen_template_source));
code << cuda_gen_template_source;
CeedCallBackend(CeedFree(&cuda_gen_template_path));
CeedCallBackend(CeedFree(&cuda_gen_template_source));
}
code << "// Tensor basis source\n";
code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n";
code << "// CodeGen operator source\n";
code << "#include <ceed/jit-source/cuda/cuda-gen-templates.h>\n\n";

// Get QFunction name
std::string qfunction_name(qf_data->qfunction_name);
Expand All @@ -749,9 +724,13 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {

// Add user QFunction source
{
std::string qfunction_source(qf_data->qfunction_source);
const char *source_path;

CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/cuda/gen backend requires QFunction source code file");

code << qfunction_source;
code << "// User QFunction source\n";
code << "#include \"" << source_path << "\"\n\n";
}

// Setup
Expand Down
5 changes: 0 additions & 5 deletions backends/cuda-gen/ceed-cuda-gen-qfunction.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ static int CeedQFunctionDestroy_Cuda_gen(CeedQFunction qf) {

CeedCallBackend(CeedQFunctionGetData(qf, &data));
CeedCallCuda(CeedQFunctionReturnCeed(qf), cudaFree(data->d_c));
CeedCallBackend(CeedFree(&data->qfunction_source));
CeedCallBackend(CeedFree(&data));
return CEED_ERROR_SUCCESS;
}
Expand All @@ -45,10 +44,6 @@ int CeedQFunctionCreate_Cuda_gen(CeedQFunction qf) {

// Read QFunction source
CeedCallBackend(CeedQFunctionGetKernelName(qf, &data->qfunction_name));
CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source -----\n");
CeedCallBackend(CeedQFunctionLoadSourceToBuffer(qf, &data->qfunction_source));
CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source Complete! -----\n");
CeedCheck(data->qfunction_source, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/cuda/gen backend requires QFunction source code file");

CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Cuda_gen));
CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Cuda_gen));
Expand Down
1 change: 0 additions & 1 deletion backends/cuda-gen/ceed-cuda-gen.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ typedef struct {

typedef struct {
const char *qfunction_name;
const char *qfunction_source;
void *d_c;
} CeedQFunction_Cuda_gen;

Expand Down
51 changes: 10 additions & 41 deletions backends/cuda-ref/ceed-cuda-ref-basis.c
Original file line number Diff line number Diff line change
Expand Up @@ -182,24 +182,17 @@ static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, cons
}

// -- Compile kernels
char *basis_kernel_source;
const char *basis_kernel_path;
CeedInt num_comp;
const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h>\n";
CeedInt num_comp;

if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints));
CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h", &basis_kernel_path));
CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
"BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS",
max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1)));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
CeedCallBackend(CeedFree(&basis_kernel_path));
CeedCallBackend(CeedFree(&basis_kernel_source));
}

// Get read/write access to u, v
Expand Down Expand Up @@ -419,8 +412,6 @@ static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) {
int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) {
Ceed ceed;
char *basis_kernel_source;
const char *basis_kernel_path;
CeedInt num_comp;
const CeedInt q_bytes = Q_1d * sizeof(CeedScalar);
const CeedInt interp_bytes = q_bytes * P_1d;
Expand All @@ -440,19 +431,15 @@ int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const
CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, cudaMemcpyHostToDevice));

// Compile basis kernels
const char basis_kernel_source[] = "// Tensor basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-tensor.h>\n";

CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-tensor.h", &basis_kernel_path));
CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
"BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim)));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
CeedCallBackend(CeedFree(&basis_kernel_path));
CeedCallBackend(CeedFree(&basis_kernel_source));

CeedCallBackend(CeedBasisSetData(basis, data));

Expand All @@ -471,8 +458,6 @@ int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const
int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad,
const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
Ceed ceed;
char *basis_kernel_source;
const char *basis_kernel_path;
CeedInt num_comp, q_comp_interp, q_comp_grad;
const CeedInt q_bytes = num_qpts * sizeof(CeedScalar);
CeedBasisNonTensor_Cuda *data;
Expand Down Expand Up @@ -501,20 +486,16 @@ int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes
}

// Compile basis kernels
const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-nontensor.h>\n";

CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path));
CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP",
q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_grad, "BASIS_NUM_COMP", num_comp));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
CeedCallBackend(CeedFree(&basis_kernel_path));
CeedCallBackend(CeedFree(&basis_kernel_source));

CeedCallBackend(CeedBasisSetData(basis, data));

Expand All @@ -531,8 +512,6 @@ int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes
int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *div,
const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
Ceed ceed;
char *basis_kernel_source;
const char *basis_kernel_path;
CeedInt num_comp, q_comp_interp, q_comp_div;
const CeedInt q_bytes = num_qpts * sizeof(CeedScalar);
CeedBasisNonTensor_Cuda *data;
Expand Down Expand Up @@ -561,20 +540,16 @@ int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nod
}

// Compile basis kernels
const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-nontensor.h>\n";

CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path));
CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP",
q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_div, "BASIS_NUM_COMP", num_comp));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
CeedCallBackend(CeedFree(&basis_kernel_path));
CeedCallBackend(CeedFree(&basis_kernel_source));

CeedCallBackend(CeedBasisSetData(basis, data));

Expand All @@ -591,8 +566,6 @@ int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nod
int CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
Ceed ceed;
char *basis_kernel_source;
const char *basis_kernel_path;
CeedInt num_comp, q_comp_interp, q_comp_curl;
const CeedInt q_bytes = num_qpts * sizeof(CeedScalar);
CeedBasisNonTensor_Cuda *data;
Expand Down Expand Up @@ -621,20 +594,16 @@ int CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_no
}

// Compile basis kernels
const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-nontensor.h>\n";

CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path));
CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP",
q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_curl, "BASIS_NUM_COMP", num_comp));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
CeedCallBackend(CeedFree(&basis_kernel_path));
CeedCallBackend(CeedFree(&basis_kernel_source));

CeedCallBackend(CeedBasisSetData(basis, data));

Expand Down
Loading

0 comments on commit 1dc8b1e

Please sign in to comment.