Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

JiT include update #1696

Merged
merged 8 commits into from
Oct 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -572,7 +572,7 @@ $(OBJDIR)/%.o : $(CURDIR)/%.sycl.cpp | $$(@D)/.DIR
$(call quiet,SYCLCXX) $(SYCLFLAGS) $(CPPFLAGS) -c -o $@ $(abspath $<)

$(OBJDIR)/%$(EXE_SUFFIX) : tests/%.c | $$(@D)/.DIR
$(call quiet,LINK.c) $(CEED_LDFLAGS) -o $@ $(abspath $<) $(CEED_LIBS) $(CEED_LDLIBS) $(LDLIBS)
$(call quiet,LINK.c) $(CEED_LDFLAGS) -o $@ $(abspath $<) $(CEED_LIBS) $(CEED_LDLIBS) $(LDLIBS) -I./tests/test-include

$(OBJDIR)/%$(EXE_SUFFIX) : tests/%.f90 | $$(@D)/.DIR
$(call quiet,LINK.F) -DSOURCE_DIR='"$(abspath $(<D))/"' $(CEED_LDFLAGS) -o $@ $(abspath $<) $(CEED_LIBS) $(CEED_LDLIBS) $(LDLIBS)
Expand Down
2 changes: 1 addition & 1 deletion backends/avx/ceed-avx-tensor.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#include <immintrin.h>
#include <stdbool.h>

#ifdef CEED_F64_H
#ifdef CEED_SCALAR_IS_FP64
#define rtype __m256d
#define loadu _mm256_loadu_pd
#define storeu _mm256_storeu_pd
Expand Down
45 changes: 12 additions & 33 deletions backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -696,42 +696,17 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
CeedCallBackend(CeedGetData(ceed, &ceed_data));
CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id));
if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) {
char *atomic_add_source;
const char *atomic_add_path;

CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-atomic-add-fallback.h", &atomic_add_path));
CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Atomic Add Source -----\n");
CeedCallBackend(CeedLoadSourceToBuffer(ceed, atomic_add_path, &atomic_add_source));
code << atomic_add_source;
CeedCallBackend(CeedFree(&atomic_add_path));
CeedCallBackend(CeedFree(&atomic_add_source));
code << "// AtomicAdd fallback source\n";
code << "#include <ceed/jit-source/cuda/cuda-atomic-add-fallback.h>\n\n";
}
}

// Load basis source files
// TODO: Add non-tensor, AtPoints
{
char *tensor_basis_kernel_source;
const char *tensor_basis_kernel_path;

CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h", &tensor_basis_kernel_path));
CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Tensor Basis Kernel Source -----\n");
CeedCallBackend(CeedLoadSourceToBuffer(ceed, tensor_basis_kernel_path, &tensor_basis_kernel_source));
code << tensor_basis_kernel_source;
CeedCallBackend(CeedFree(&tensor_basis_kernel_path));
CeedCallBackend(CeedFree(&tensor_basis_kernel_source));
}
{
char *cuda_gen_template_source;
const char *cuda_gen_template_path;

CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-gen-templates.h", &cuda_gen_template_path));
CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Cuda-Gen Template Source -----\n");
CeedCallBackend(CeedLoadSourceToBuffer(ceed, cuda_gen_template_path, &cuda_gen_template_source));
code << cuda_gen_template_source;
CeedCallBackend(CeedFree(&cuda_gen_template_path));
CeedCallBackend(CeedFree(&cuda_gen_template_source));
}
code << "// Tensor basis source\n";
code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n";
code << "// CodeGen operator source\n";
code << "#include <ceed/jit-source/cuda/cuda-gen-templates.h>\n\n";

// Get QFunction name
std::string qfunction_name(qf_data->qfunction_name);
Expand All @@ -749,9 +724,13 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {

// Add user QFunction source
{
std::string qfunction_source(qf_data->qfunction_source);
const char *source_path;

CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/cuda/gen backend requires QFunction source code file");

code << qfunction_source;
code << "// User QFunction source\n";
code << "#include \"" << source_path << "\"\n\n";
}

// Setup
Expand Down
5 changes: 0 additions & 5 deletions backends/cuda-gen/ceed-cuda-gen-qfunction.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ static int CeedQFunctionDestroy_Cuda_gen(CeedQFunction qf) {

CeedCallBackend(CeedQFunctionGetData(qf, &data));
CeedCallCuda(CeedQFunctionReturnCeed(qf), cudaFree(data->d_c));
CeedCallBackend(CeedFree(&data->qfunction_source));
CeedCallBackend(CeedFree(&data));
return CEED_ERROR_SUCCESS;
}
Expand All @@ -45,10 +44,6 @@ int CeedQFunctionCreate_Cuda_gen(CeedQFunction qf) {

// Read QFunction source
CeedCallBackend(CeedQFunctionGetKernelName(qf, &data->qfunction_name));
CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source -----\n");
CeedCallBackend(CeedQFunctionLoadSourceToBuffer(qf, &data->qfunction_source));
CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source Complete! -----\n");
CeedCheck(data->qfunction_source, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/cuda/gen backend requires QFunction source code file");

CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Cuda_gen));
CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Cuda_gen));
Expand Down
1 change: 0 additions & 1 deletion backends/cuda-gen/ceed-cuda-gen.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ typedef struct {

typedef struct {
const char *qfunction_name;
const char *qfunction_source;
void *d_c;
} CeedQFunction_Cuda_gen;

Expand Down
51 changes: 10 additions & 41 deletions backends/cuda-ref/ceed-cuda-ref-basis.c
Original file line number Diff line number Diff line change
Expand Up @@ -182,24 +182,17 @@ static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, cons
}

// -- Compile kernels
char *basis_kernel_source;
const char *basis_kernel_path;
CeedInt num_comp;
const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h>\n";
CeedInt num_comp;

if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints));
CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h", &basis_kernel_path));
CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
"BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS",
max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1)));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
CeedCallBackend(CeedFree(&basis_kernel_path));
CeedCallBackend(CeedFree(&basis_kernel_source));
}

// Get read/write access to u, v
Expand Down Expand Up @@ -419,8 +412,6 @@ static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) {
int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) {
Ceed ceed;
char *basis_kernel_source;
const char *basis_kernel_path;
CeedInt num_comp;
const CeedInt q_bytes = Q_1d * sizeof(CeedScalar);
const CeedInt interp_bytes = q_bytes * P_1d;
Expand All @@ -440,19 +431,15 @@ int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const
CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, cudaMemcpyHostToDevice));

// Compile basis kernels
const char basis_kernel_source[] = "// Tensor basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-tensor.h>\n";

CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-tensor.h", &basis_kernel_path));
CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
"BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim)));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
CeedCallBackend(CeedFree(&basis_kernel_path));
CeedCallBackend(CeedFree(&basis_kernel_source));

CeedCallBackend(CeedBasisSetData(basis, data));

Expand All @@ -471,8 +458,6 @@ int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const
int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad,
const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
Ceed ceed;
char *basis_kernel_source;
const char *basis_kernel_path;
CeedInt num_comp, q_comp_interp, q_comp_grad;
const CeedInt q_bytes = num_qpts * sizeof(CeedScalar);
CeedBasisNonTensor_Cuda *data;
Expand Down Expand Up @@ -501,20 +486,16 @@ int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes
}

// Compile basis kernels
const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-nontensor.h>\n";

CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path));
CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP",
q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_grad, "BASIS_NUM_COMP", num_comp));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
CeedCallBackend(CeedFree(&basis_kernel_path));
CeedCallBackend(CeedFree(&basis_kernel_source));

CeedCallBackend(CeedBasisSetData(basis, data));

Expand All @@ -531,8 +512,6 @@ int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes
int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *div,
const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
Ceed ceed;
char *basis_kernel_source;
const char *basis_kernel_path;
CeedInt num_comp, q_comp_interp, q_comp_div;
const CeedInt q_bytes = num_qpts * sizeof(CeedScalar);
CeedBasisNonTensor_Cuda *data;
Expand Down Expand Up @@ -561,20 +540,16 @@ int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nod
}

// Compile basis kernels
const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-nontensor.h>\n";

CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path));
CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP",
q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_div, "BASIS_NUM_COMP", num_comp));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
CeedCallBackend(CeedFree(&basis_kernel_path));
CeedCallBackend(CeedFree(&basis_kernel_source));

CeedCallBackend(CeedBasisSetData(basis, data));

Expand All @@ -591,8 +566,6 @@ int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nod
int CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
Ceed ceed;
char *basis_kernel_source;
const char *basis_kernel_path;
CeedInt num_comp, q_comp_interp, q_comp_curl;
const CeedInt q_bytes = num_qpts * sizeof(CeedScalar);
CeedBasisNonTensor_Cuda *data;
Expand Down Expand Up @@ -621,20 +594,16 @@ int CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_no
}

// Compile basis kernels
const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-nontensor.h>\n";

CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path));
CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP",
q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_curl, "BASIS_NUM_COMP", num_comp));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
CeedCallBackend(CeedFree(&basis_kernel_path));
CeedCallBackend(CeedFree(&basis_kernel_source));

CeedCallBackend(CeedBasisSetData(basis, data));

Expand Down
Loading