From 747da2ef60accfa78a71077abe2786bc3a532b59 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" <19565938+yhmtsai@users.noreply.github.com> Date: Thu, 26 Oct 2023 23:05:37 +0200 Subject: [PATCH] Add Ginkgo dpcpp into example (#351) This PR adds the ginkgo dpcpp support in the examples: sunmatrix, sunlinsol, and cvode/cv_heat2D I currently only know `queue->wait_and_throw()` to synchronize which requires queue unlike `cudaDeviceSynchronize` or `hipDeviceSynchronize` In sunlinsol and cv_heat2D, some function signatures are also used in other files such that I can not pass an additional parameter. I use global variable to store ginkgo executor and then get the queue when SYCL needs sync or submit the kernel in those functions. --------- Signed-off-by: Yu-Hsiang M. Tsai Co-authored-by: Cody Balos --- .clang-format | 1 - cmake/macros/SundialsAddExamplesGinkgo.cmake | 2 + examples/cvode/ginkgo/CMakeLists.txt | 5 +- .../cvode/ginkgo/cv_heat2D_ginkgo.DPCPP.out | 77 +++ .../cvode/ginkgo/cv_heat2D_ginkgo.HIP.out | 77 +++ examples/cvode/ginkgo/cv_heat2D_ginkgo.cpp | 339 ++++++++--- examples/cvode/ginkgo/cv_heat2D_ginkgo.hpp | 224 +++++--- examples/sunlinsol/ginkgo/CMakeLists.txt | 5 +- .../ginkgo/test_sunlinsol_ginkgo.cpp | 543 ++++++++++++------ examples/sunmatrix/ginkgo/CMakeLists.txt | 5 +- .../ginkgo/test_sunmatrix_ginkgo.cpp | 349 ++++++----- 11 files changed, 1163 insertions(+), 464 deletions(-) create mode 100644 examples/cvode/ginkgo/cv_heat2D_ginkgo.DPCPP.out create mode 100644 examples/cvode/ginkgo/cv_heat2D_ginkgo.HIP.out diff --git a/.clang-format b/.clang-format index 35fa8341a5..c7692ae54e 100644 --- a/.clang-format +++ b/.clang-format @@ -140,7 +140,6 @@ SpacesInConditionalStatement : false SpacesInContainerLiterals : true SpacesInParentheses : false SpacesInSquareBrackets : false -SpaceBeforeSquareBrackets : false Standard : c++14 TabWidth: 2 UseCRLF : false diff --git a/cmake/macros/SundialsAddExamplesGinkgo.cmake b/cmake/macros/SundialsAddExamplesGinkgo.cmake index 1e56b34230..7fc5ef2d5f 100644 --- a/cmake/macros/SundialsAddExamplesGinkgo.cmake +++ b/cmake/macros/SundialsAddExamplesGinkgo.cmake @@ -63,6 +63,8 @@ macro(sundials_add_examples_ginkgo EXAMPLES_VAR) elseif(backend MATCHES "HIP") set_source_files_properties(${example} PROPERTIES LANGUAGE CXX) set(vector nvechip) + elseif(backend MATCHES "DPCPP") + set(vector nvecsycl) elseif(backend MATCHES "OMP") set(vector nvecopenmp) elseif(backend MATCHES "REF") diff --git a/examples/cvode/ginkgo/CMakeLists.txt b/examples/cvode/ginkgo/CMakeLists.txt index c993e377c4..6c876d691c 100644 --- a/examples/cvode/ginkgo/CMakeLists.txt +++ b/examples/cvode/ginkgo/CMakeLists.txt @@ -20,7 +20,7 @@ set(cpu_gpu_examples sundials_add_examples_ginkgo(cpu_gpu_examples TARGETS sundials_cvode - BACKENDS REF OMP CUDA HIP) + BACKENDS REF OMP CUDA HIP DPCPP) # Examples that only support CPU Ginkgo backends set(cpu_examples @@ -39,6 +39,9 @@ if(EXAMPLES_INSTALL) if(SUNDIALS_GINKGO_BACKENDS MATCHES "HIP") list(APPEND vectors nvechip) endif() + if(SUNDIALS_GINKGO_BACKENDS MATCHES "DPCPP") + list(APPEND vectors nvecsycl) + endif() if((SUNDIALS_GINKGO_BACKENDS MATCHES "OMP") OR (SUNDIALS_GINKGO_BACKENDS MATCHES "REF")) list(APPEND vectors nvecserial) diff --git a/examples/cvode/ginkgo/cv_heat2D_ginkgo.DPCPP.out b/examples/cvode/ginkgo/cv_heat2D_ginkgo.DPCPP.out new file mode 100644 index 0000000000..a843e62f28 --- /dev/null +++ b/examples/cvode/ginkgo/cv_heat2D_ginkgo.DPCPP.out @@ -0,0 +1,77 @@ + +2D Heat problem: + ---------------------------- + kx = 1 + ky = 1 + tf = 1 + xu = 1 + yu = 1 + nx = 64 + ny = 64 + dx = 0.015873 + dy = 0.015873 + ---------------------------- + rtol = 0.0001 + atol = 1e-08 + ---------------------------- + lin iters = 20 + eps lin = 0 + ---------------------------- + output = 0 + ---------------------------- + + t ||u||_rms max error + ----------------------------------------------------------------------- + 0.000000000000000e+00 1.273091462283009e+00 0.000000000000000e+00 + 5.000000000000000e-02 1.265953031236337e+00 5.779434661301597e-04 + 1.000000000000000e-01 1.245126467995815e+00 8.596410825743028e-04 + 1.500000000000000e-01 1.212971698816507e+00 1.027071183737238e-03 + 2.000000000000000e-01 1.173149707911348e+00 1.049506292939650e-03 + 2.500000000000000e-01 1.129970993609124e+00 7.767258516966358e-04 + 3.000000000000000e-01 1.088067923761304e+00 3.857233565973672e-04 + 3.500000000000000e-01 1.051569245238569e+00 2.296842605538085e-04 + 4.000000000000000e-01 1.023519508142414e+00 1.160865021105906e-04 + 4.500000000000000e-01 1.005965995289331e+00 3.382124480899584e-05 + 4.999999999999999e-01 9.999934385586851e-01 6.776957530241212e-05 + 5.499999999999999e-01 1.005920028619227e+00 1.074298825753939e-04 + 6.000000000000000e-01 1.023439646225216e+00 1.256532195708093e-04 + 6.500000000000000e-01 1.051474380012092e+00 4.493207354094864e-05 + 7.000000000000001e-01 1.087965937316374e+00 8.048432853913212e-05 + 7.500000000000001e-01 1.129792873621271e+00 2.390181284921411e-04 + 8.000000000000002e-01 1.172918427971992e+00 4.322892993839922e-04 + 8.500000000000002e-01 1.212840417005807e+00 6.887143222911174e-04 + 9.000000000000002e-01 1.245177171528506e+00 9.911108446238881e-04 + 9.500000000000003e-01 1.266240637720725e+00 1.309517693130591e-03 + 1.000000000000000e+00 1.273471195699113e+00 1.189685923645323e-03 + ----------------------------------------------------------------------- + +Final integrator statistics: +Current time = 1 +Steps = 41 +Error test fails = 0 +NLS step fails = 0 +Initial step size = 0.002110117778857815 +Last step size = 0.02437232616233551 +Current step size = 0.02437232616233551 +Last method order = 3 +Current method order = 3 +Stab. lim. order reductions = 0 +RHS fn evals = 52 +NLS iters = 49 +NLS fails = 0 +NLS iters per step = 1.195121951219512 +LS setups = 7 +Jac fn evals = 1 +LS RHS fn evals = 0 +Prec setup evals = 0 +Prec solves = 0 +LS iters = 873 +LS fails = 0 +Jac-times setups = 0 +Jac-times evals = 0 +LS iters per NLS iter = 17.81632653061224 +Jac evals per NLS iter = 0.02040816326530612 +Prec evals per NLS iter = 0 +Root fn evals = 0 + +Max error = 1.189685923645323e-03 diff --git a/examples/cvode/ginkgo/cv_heat2D_ginkgo.HIP.out b/examples/cvode/ginkgo/cv_heat2D_ginkgo.HIP.out new file mode 100644 index 0000000000..4022dba817 --- /dev/null +++ b/examples/cvode/ginkgo/cv_heat2D_ginkgo.HIP.out @@ -0,0 +1,77 @@ + +2D Heat problem: + ---------------------------- + kx = 1 + ky = 1 + tf = 1 + xu = 1 + yu = 1 + nx = 64 + ny = 64 + dx = 0.015873 + dy = 0.015873 + ---------------------------- + rtol = 0.0001 + atol = 1e-08 + ---------------------------- + lin iters = 20 + eps lin = 0 + ---------------------------- + output = 0 + ---------------------------- + + t ||u||_rms max error + ----------------------------------------------------------------------- + 0.000000000000000e+00 1.273091462283009e+00 0.000000000000000e+00 + 5.000000000000000e-02 1.265953031236678e+00 5.779434664550109e-04 + 1.000000000000000e-01 1.245126468025294e+00 8.596397006188639e-04 + 1.500000000000000e-01 1.212971692633635e+00 1.027175433592653e-03 + 2.000000000000000e-01 1.173149607363054e+00 1.048511182224932e-03 + 2.500000000000000e-01 1.129971118809724e+00 7.777031646749588e-04 + 3.000000000000000e-01 1.088068652479702e+00 3.867786296469777e-04 + 3.500000000000000e-01 1.051569453796381e+00 2.291655197173004e-04 + 4.000000000000000e-01 1.023519691519509e+00 1.152507676536185e-04 + 4.500000000000000e-01 1.005966466128100e+00 3.464712602863074e-05 + 4.999999999999999e-01 9.999941074735683e-01 6.549023902890916e-05 + 5.499999999999999e-01 1.005920139252285e+00 1.085862394125670e-04 + 6.000000000000000e-01 1.023440066617863e+00 1.253667245226797e-04 + 6.500000000000000e-01 1.051474489311814e+00 4.368064039983466e-05 + 7.000000000000001e-01 1.087966430721224e+00 8.251704806849780e-05 + 7.500000000000001e-01 1.129793211633907e+00 2.403035068856418e-04 + 8.000000000000002e-01 1.172918720617323e+00 4.322501964297842e-04 + 8.500000000000002e-01 1.212839862652567e+00 6.883283219258907e-04 + 9.000000000000002e-01 1.245175128903723e+00 9.857170108882318e-04 + 9.500000000000003e-01 1.266235911062742e+00 1.301833676780495e-03 + 1.000000000000000e+00 1.273469281873646e+00 1.183015268845011e-03 + ----------------------------------------------------------------------- + +Final integrator statistics: +Current time = 1 +Steps = 41 +Error test fails = 0 +NLS step fails = 0 +Initial step size = 0.002110117764420172 +Last step size = 0.02782878040979117 +Current step size = 0.02782878040979117 +Last method order = 3 +Current method order = 3 +Stab. lim. order reductions = 0 +RHS fn evals = 52 +NLS iters = 49 +NLS fails = 0 +NLS iters per step = 1.195121951219512 +LS setups = 7 +Jac fn evals = 1 +LS RHS fn evals = 0 +Prec setup evals = 0 +Prec solves = 0 +LS iters = 875 +LS fails = 0 +Jac-times setups = 0 +Jac-times evals = 0 +LS iters per NLS iter = 17.85714285714286 +Jac evals per NLS iter = 0.02040816326530612 +Prec evals per NLS iter = 0 +Root fn evals = 0 + +Max error = 1.183015268845011e-03 diff --git a/examples/cvode/ginkgo/cv_heat2D_ginkgo.cpp b/examples/cvode/ginkgo/cv_heat2D_ginkgo.cpp index 2671c77934..e979d60459 100644 --- a/examples/cvode/ginkgo/cv_heat2D_ginkgo.cpp +++ b/examples/cvode/ginkgo/cv_heat2D_ginkgo.cpp @@ -38,8 +38,9 @@ * The spatial derivatives are computed using second-order centered differences, * with the data distributed over nx * ny points on a uniform spatial grid. The * problem is advanced in time with BDF methods using an inexact Newton method - * paired with the CG linear solver from Ginkgo. Several command line options are - * available to change the problem parameters and CVODE settings. Use the flag + * paired with the CG linear solver from Ginkgo. Several command line options + * are available to change the problem parameters and CVODE settings. Use the + * flag * --help for more information. * ---------------------------------------------------------------------------*/ @@ -53,19 +54,23 @@ #if defined(USE_CUDA) #include -#define HIP_OR_CUDA(a, b) b +#define HIP_OR_CUDA_OR_SYCL(a, b, c) b constexpr auto N_VNew = N_VNew_Cuda; #elif defined(USE_HIP) #include -#define HIP_OR_CUDA(a, b) a +#define HIP_OR_CUDA_OR_SYCL(a, b, c) a constexpr auto N_VNew = N_VNew_Hip; +#elif defined(USE_DPCPP) +#include +#define HIP_OR_CUDA_OR_SYCL(a, b, c) c +constexpr auto N_VNew = N_VNew_Sycl; #elif defined(USE_OMP) #include -#define HIP_OR_CUDA(a, b) +#define HIP_OR_CUDA_OR_SYCL(a, b, c) constexpr auto N_VNew = N_VNew_Serial; #else #include -#define HIP_OR_CUDA(a, b) +#define HIP_OR_CUDA_OR_SYCL(a, b, c) constexpr auto N_VNew = N_VNew_Serial; #endif @@ -73,7 +78,8 @@ using GkoMatrixType = gko::matrix::Csr; using GkoSolverType = gko::solver::Cg; using SUNGkoMatrixType = sundials::ginkgo::Matrix; -using SUNGkoSolverType = sundials::ginkgo::LinearSolver; +using SUNGkoSolverType = + sundials::ginkgo::LinearSolver; // ----------------------------------------------------------------------------- // Functions provided to the SUNDIALS integrator @@ -83,7 +89,8 @@ using SUNGkoSolverType = sundials::ginkgo::LinearSolverget_queue(), sunctx); +#else N_Vector u = N_VNew(udata.nodes, sunctx); +#endif if (check_ptr(u, "N_VNew")) return 1; // Set initial condition @@ -120,34 +151,26 @@ int main(int argc, char* argv[]) N_Vector e = N_VClone(u); if (check_ptr(e, "N_VClone")) return 1; - // --------------------------------------- - // Create Ginkgo matrix and linear solver - // --------------------------------------- - -#if defined(USE_CUDA) - auto gko_exec{gko::CudaExecutor::create(0, gko::OmpExecutor::create(), false, gko::allocation_mode::device)}; -#elif defined(USE_HIP) - auto gko_exec{gko::HipExecutor::create(0, gko::OmpExecutor::create(), false, gko::allocation_mode::device)}; -#elif defined(USE_OMP) - auto gko_exec{gko::OmpExecutor::create()}; -#else - auto gko_exec{gko::ReferenceExecutor::create()}; -#endif - auto gko_matrix_dim = gko::dim<2>(udata.nodes, udata.nodes); auto gko_matrix_nnz{(5 * (udata.nx - 2) + 2) * (udata.ny - 2) + 2 * udata.nx}; - auto gko_matrix = gko::share(GkoMatrixType::create(gko_exec, gko_matrix_dim, gko_matrix_nnz)); + auto gko_matrix = + gko::share(GkoMatrixType::create(gko_exec, gko_matrix_dim, gko_matrix_nnz)); SUNGkoMatrixType A{gko_matrix, sunctx}; // Use default stopping criteria - auto crit{sundials::ginkgo::DefaultStop::build().with_max_iters(static_cast(udata.liniters)).on(gko_exec)}; + auto crit{sundials::ginkgo::DefaultStop::build() + .with_max_iters(static_cast(udata.liniters)) + .on(gko_exec)}; // Use Jacobi preconditioner - auto precon{gko::preconditioner::Jacobi::build().on(gko_exec)}; + auto precon{ + gko::preconditioner::Jacobi::build().on(gko_exec)}; - auto gko_solver_factory = gko::share( - GkoSolverType::build().with_criteria(std::move(crit)).with_preconditioner(std::move(precon)).on(gko_exec)); + auto gko_solver_factory = gko::share(GkoSolverType::build() + .with_criteria(std::move(crit)) + .with_preconditioner(std::move(precon)) + .on(gko_exec)); SUNGkoSolverType LS{gko_solver_factory, sunctx}; @@ -206,7 +229,8 @@ int main(int argc, char* argv[]) flag = WriteOutput(t, u, e, udata); if (check_flag(flag, "WriteOutput")) return 1; - for (int iout = 0; iout < udata.nout; iout++) { + for (int iout = 0; iout < udata.nout; iout++) + { // Evolve in time flag = CVode(cvode_mem, tout, u, &t, CV_NORMAL); if (check_flag(flag, "CVode")) break; @@ -239,13 +263,15 @@ int main(int argc, char* argv[]) sunrealtype maxerr = N_VMaxNorm(e); - std::cout << std::scientific << std::setprecision(std::numeric_limits::digits10) + std::cout << std::scientific + << std::setprecision(std::numeric_limits::digits10) << "\nMax error = " << maxerr << std::endl; // -------------------- // Clean up and return // -------------------- + udata.exec = nullptr; CVodeFree(&cvode_mem); // Free integrator memory N_VDestroy(u); // Free vectors N_VDestroy(e); @@ -259,15 +285,19 @@ int main(int argc, char* argv[]) #if defined(USE_CUDA) || defined(USE_HIP) // GPU kernel to compute the ODE RHS function f(t,y). -__global__ void f_kernel(const sunindextype nx, const sunindextype ny, const sunrealtype dx, const sunrealtype dy, - const sunrealtype cx, const sunrealtype cy, const sunrealtype cc, const sunrealtype bx, - const sunrealtype by, const sunrealtype sin_t_cos_t, const sunrealtype cos_sqr_t, - sunrealtype* uarray, sunrealtype* farray) +__global__ void f_kernel(const sunindextype nx, const sunindextype ny, + const sunrealtype dx, const sunrealtype dy, + const sunrealtype cx, const sunrealtype cy, + const sunrealtype cc, const sunrealtype bx, + const sunrealtype by, const sunrealtype sin_t_cos_t, + const sunrealtype cos_sqr_t, sunrealtype* uarray, + sunrealtype* farray) { const sunindextype i = blockIdx.x * blockDim.x + threadIdx.x; const sunindextype j = blockIdx.y * blockDim.y + threadIdx.y; - if (i > 0 && i < nx - 1 && j > 0 && j < ny - 1) { + if (i > 0 && i < nx - 1 && j > 0 && j < ny - 1) + { auto x = i * dx; auto y = j * dy; @@ -284,7 +314,8 @@ __global__ void f_kernel(const sunindextype nx, const sunindextype ny, const sun auto idx_e = (i + 1) + j * nx; auto idx_w = (i - 1) + j * nx; - farray[idx_c] = cc * uarray[idx_c] + cx * (uarray[idx_w] + uarray[idx_e]) + cy * (uarray[idx_s] + uarray[idx_n]) - + farray[idx_c] = cc * uarray[idx_c] + cx * (uarray[idx_w] + uarray[idx_e]) + + cy * (uarray[idx_s] + uarray[idx_n]) - TWO * PI * sin_sqr_x * sin_sqr_y * sin_t_cos_t - bx * (cos_sqr_x - sin_sqr_x) * sin_sqr_y * cos_sqr_t - by * (cos_sqr_y - sin_sqr_y) * sin_sqr_x * cos_sqr_t; @@ -328,14 +359,67 @@ int f(sunrealtype t, N_Vector u, N_Vector f, void* user_data) if (check_ptr(farray, "N_VGetDeviceArrayPointer")) return -1; dim3 threads_per_block{16, 16}; - const auto nbx{(static_cast(nx) + threads_per_block.x - 1) / threads_per_block.x}; - const auto nby{(static_cast(ny) + threads_per_block.y - 1) / threads_per_block.y}; + const auto nbx{(static_cast(nx) + threads_per_block.x - 1) / + threads_per_block.x}; + const auto nby{(static_cast(ny) + threads_per_block.y - 1) / + threads_per_block.y}; dim3 num_blocks{nbx, nby}; - f_kernel<<>>(nx, ny, dx, dy, cx, cy, cc, bx, by, sin_t_cos_t, cos_sqr_t, uarray, farray); + f_kernel<<>>(nx, ny, dx, dy, cx, cy, cc, bx, + by, sin_t_cos_t, cos_sqr_t, + uarray, farray); - HIP_OR_CUDA(hipDeviceSynchronize();, cudaDeviceSynchronize();); + HIP_OR_CUDA_OR_SYCL(hipDeviceSynchronize(), cudaDeviceSynchronize(), ); +#elif defined(USE_DPCPP) + // Access device data arrays + sunrealtype* uarray = N_VGetDeviceArrayPointer(u); + if (check_ptr(uarray, "N_VGetDeviceArrayPointer")) return -1; + + sunrealtype* farray = N_VGetDeviceArrayPointer(f); + if (check_ptr(farray, "N_VGetDeviceArrayPointer")) return -1; + + std::dynamic_pointer_cast(udata->exec) + ->get_queue() + ->submit( + [&](sycl::handler& cgh) + { + cgh.parallel_for(sycl::range<2>(ny, nx), + [=](sycl::id<2> id) + { + const sunindextype i = id[1]; + const sunindextype j = id[0]; + if (i > 0 && i < nx - 1 && j > 0 && j < ny - 1) + { + auto x = i * dx; + auto y = j * dy; + + auto sin_sqr_x = sin(PI * x) * sin(PI * x); + auto sin_sqr_y = sin(PI * y) * sin(PI * y); + + auto cos_sqr_x = cos(PI * x) * cos(PI * x); + auto cos_sqr_y = cos(PI * y) * cos(PI * y); + + // center, north, south, east, and west indices + auto idx_c = i + j * nx; + auto idx_n = i + (j + 1) * nx; + auto idx_s = i + (j - 1) * nx; + auto idx_e = (i + 1) + j * nx; + auto idx_w = (i - 1) + j * nx; + + farray[idx_c] = + cc * uarray[idx_c] + + cx * (uarray[idx_w] + uarray[idx_e]) + + cy * (uarray[idx_s] + uarray[idx_n]) - + TWO * PI * sin_sqr_x * sin_sqr_y * sin_t_cos_t - + bx * (cos_sqr_x - sin_sqr_x) * sin_sqr_y * + cos_sqr_t - + by * (cos_sqr_y - sin_sqr_y) * sin_sqr_x * + cos_sqr_t; + } + }); + }); + udata->exec->synchronize(); #else // Access host data arrays @@ -346,8 +430,10 @@ int f(sunrealtype t, N_Vector u, N_Vector f, void* user_data) if (check_ptr(farray, "N_VGetArrayPointer")) return -1; // Iterate over domain interior and fill the RHS vector - for (sunindextype j = 1; j < ny - 1; j++) { - for (sunindextype i = 1; i < nx - 1; i++) { + for (sunindextype j = 1; j < ny - 1; j++) + { + for (sunindextype i = 1; i < nx - 1; i++) + { auto x = i * dx; auto y = j * dy; @@ -364,7 +450,8 @@ int f(sunrealtype t, N_Vector u, N_Vector f, void* user_data) auto idx_e = (i + 1) + j * nx; auto idx_w = (i - 1) + j * nx; - farray[idx_c] = cc * uarray[idx_c] + cx * (uarray[idx_w] + uarray[idx_e]) + cy * (uarray[idx_s] + uarray[idx_n]) - + farray[idx_c] = cc * uarray[idx_c] + cx * (uarray[idx_w] + uarray[idx_e]) + + cy * (uarray[idx_s] + uarray[idx_n]) - TWO * PI * sin_sqr_x * sin_sqr_y * sin_t_cos_t - bx * (cos_sqr_x - sin_sqr_x) * sin_sqr_y * cos_sqr_t - by * (cos_sqr_y - sin_sqr_y) * sin_sqr_x * cos_sqr_t; @@ -380,12 +467,14 @@ int f(sunrealtype t, N_Vector u, N_Vector f, void* user_data) #if defined(USE_CUDA) || defined(USE_HIP) // GPU kernel to fill southern (j = 0) and northern (j = nx - 1) boundary // entries including the corners. -__global__ void J_sn_kernel(const sunindextype nx, const sunindextype ny, sunindextype* row_ptrs, - sunindextype* col_idxs, sunrealtype* mat_data) +__global__ void J_sn_kernel(const sunindextype nx, const sunindextype ny, + sunindextype* row_ptrs, sunindextype* col_idxs, + sunrealtype* mat_data) { const sunindextype i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= 0 && i < nx) { + if (i >= 0 && i < nx) + { // Southern face mat_data[i] = ZERO; col_idxs[i] = i; @@ -404,12 +493,14 @@ __global__ void J_sn_kernel(const sunindextype nx, const sunindextype ny, sunind // GPU kernel to fill western (i = 0) and eastern (i = nx - 1) boundary entries // excluding the corners (set by J_sn_kernel). -__global__ void J_we_kernel(const sunindextype nx, const sunrealtype ny, sunindextype* row_ptrs, sunindextype* col_idxs, +__global__ void J_we_kernel(const sunindextype nx, const sunrealtype ny, + sunindextype* row_ptrs, sunindextype* col_idxs, sunrealtype* mat_data) { const sunindextype j = blockIdx.x * blockDim.x + threadIdx.x; - if (j > 0 && j < ny - 1) { + if (j > 0 && j < ny - 1) + { // Western face auto col = j * nx; auto idx = (5 * (nx - 2) + 2) * (j - 1) + nx; @@ -427,13 +518,16 @@ __global__ void J_we_kernel(const sunindextype nx, const sunrealtype ny, suninde } // GPU kernel to compute the ODE RHS Jacobian function df/dy(t,y). -__global__ void J_kernel(const sunindextype nx, const sunindextype ny, const sunrealtype cx, const sunrealtype cy, - const sunrealtype cc, sunindextype* row_ptrs, sunindextype* col_idxs, sunrealtype* mat_data) +__global__ void J_kernel(const sunindextype nx, const sunindextype ny, + const sunrealtype cx, const sunrealtype cy, + const sunrealtype cc, sunindextype* row_ptrs, + sunindextype* col_idxs, sunrealtype* mat_data) { const sunindextype i = blockIdx.x * blockDim.x + threadIdx.x; const sunindextype j = blockIdx.y * blockDim.y + threadIdx.y; - if (i > 0 && i < nx - 1 && j > 0 && j < ny - 1) { + if (i > 0 && i < nx - 1 && j > 0 && j < ny - 1) + { auto row = i + j * nx; auto col_s = row - nx; auto col_w = row - 1; @@ -467,7 +561,8 @@ __global__ void J_kernel(const sunindextype nx, const sunindextype ny, const sun // J routine to compute the ODE RHS Jacobian function df/dy(t,y). This // explicitly set boundary entries to zero so J(t,y) has the same sparsity // pattern as A = I - gamma * J(t,y). -int J(sunrealtype t, N_Vector y, N_Vector fy, SUNMatrix J, void* user_data, N_Vector tmp1, N_Vector tmp2, N_Vector tmp3) +int J(sunrealtype t, N_Vector y, N_Vector fy, SUNMatrix J, void* user_data, + N_Vector tmp1, N_Vector tmp2, N_Vector tmp3) { // Access problem data auto udata = static_cast(user_data); @@ -492,35 +587,141 @@ int J(sunrealtype t, N_Vector y, N_Vector fy, SUNMatrix J, void* user_data, N_Ve #if defined(USE_CUDA) || defined(USE_HIP) unsigned threads_per_block_bx = 16; - unsigned num_blocks_bx = ((nx + threads_per_block_bx - 1) / threads_per_block_bx); + unsigned num_blocks_bx = + ((nx + threads_per_block_bx - 1) / threads_per_block_bx); - J_sn_kernel<<>>(nx, ny, row_ptrs, col_idxs, mat_data); + J_sn_kernel<<>>(nx, ny, row_ptrs, + col_idxs, mat_data); unsigned threads_per_block_by = 16; - unsigned num_blocks_by = ((ny + threads_per_block_by - 1) / threads_per_block_by); + unsigned num_blocks_by = + ((ny + threads_per_block_by - 1) / threads_per_block_by); - J_we_kernel<<>>(nx, ny, row_ptrs, col_idxs, mat_data); + J_we_kernel<<>>(nx, ny, row_ptrs, + col_idxs, mat_data); dim3 threads_per_block_i{16, 16}; - const auto nbx{(static_cast(nx) + threads_per_block_i.x - 1) / threads_per_block_i.x}; - const auto nby{(static_cast(ny) + threads_per_block_i.y - 1) / threads_per_block_i.y}; + const auto nbx{(static_cast(nx) + threads_per_block_i.x - 1) / + threads_per_block_i.x}; + const auto nby{(static_cast(ny) + threads_per_block_i.y - 1) / + threads_per_block_i.y}; dim3 num_blocks_i{nbx, nby}; - J_kernel<<>>(nx, ny, cx, cy, cc, row_ptrs, col_idxs, mat_data); - - HIP_OR_CUDA(hipDeviceSynchronize();, cudaDeviceSynchronize();); - + J_kernel<<>>(nx, ny, cx, cy, cc, row_ptrs, + col_idxs, mat_data); + + HIP_OR_CUDA_OR_SYCL(hipDeviceSynchronize(), cudaDeviceSynchronize(), ); +#elif defined(USE_DPCPP) + auto queue = + std::dynamic_pointer_cast(udata->exec)->get_queue(); + // J_sn_kernel + queue->submit( + [&](sycl::handler& cgh) + { + cgh.parallel_for(nx, + [=](sycl::id<1> id) + { + const sunindextype i = id[0]; + + // Southern face + mat_data[i] = ZERO; + col_idxs[i] = i; + row_ptrs[i] = i; + + // Northern face + auto col = i + (ny - 1) * nx; + auto idx = (5 * (nx - 2) + 2) * (ny - 2) + nx + i; + mat_data[idx] = ZERO; + col_idxs[idx] = col; + row_ptrs[col] = idx; + + if (i == nx - 1) + row_ptrs[nx * ny] = (5 * (nx - 2) + 2) * (ny - 2) + + 2 * nx; + }); + }); + // J_we_kernel + queue->submit( + [&](sycl::handler& cgh) + { + cgh.parallel_for(ny, + [=](sycl::id<1> id) + { + const sunindextype j = id[0]; + if (j > 0 && j < ny - 1) + { + // Western face + auto col = j * nx; + auto idx = (5 * (nx - 2) + 2) * (j - 1) + nx; + mat_data[idx] = ZERO; + col_idxs[idx] = col; + row_ptrs[col] = idx; + + // Eastern face + col = (nx - 1) + j * nx; + idx = (5 * (nx - 2) + 2) * (j - 1) + nx + 1 + + 5 * (nx - 2); + mat_data[idx] = ZERO; + col_idxs[idx] = col; + row_ptrs[col] = idx; + } + }); + }); + // J_kernel + queue->submit( + [&](sycl::handler& cgh) + { + cgh.parallel_for(sycl::range<2>(ny, nx), + [=](sycl::id<2> id) + { + const sunindextype i = id[1]; + const sunindextype j = id[0]; + + if (i > 0 && i < nx - 1 && j > 0 && j < ny - 1) + { + auto row = i + j * nx; + auto col_s = row - nx; + auto col_w = row - 1; + auto col_c = row; + auto col_e = row + 1; + auto col_n = row + nx; + + // Number of non-zero entries from preceding rows + auto prior_nnz = (5 * (nx - 2) + 2) * (j - 1) + nx; + + // Starting index for this row + auto idx = prior_nnz + 1 + 5 * (i - 1); + + mat_data[idx] = cy; + mat_data[idx + 1] = cx; + mat_data[idx + 2] = cc; + mat_data[idx + 3] = cx; + mat_data[idx + 4] = cy; + + col_idxs[idx] = col_s; + col_idxs[idx + 1] = col_w; + col_idxs[idx + 2] = col_c; + col_idxs[idx + 3] = col_e; + col_idxs[idx + 4] = col_n; + + row_ptrs[row] = idx; + } + }); + }); + udata->exec->synchronize(); #else // Fill southern boundary entries (j = 0) - for (sunindextype i = 0; i < nx; i++) { + for (sunindextype i = 0; i < nx; i++) + { mat_data[i] = ZERO; col_idxs[i] = i; row_ptrs[i] = i; } // Fill western boundary entries (i = 0) - for (sunindextype j = 1; j < ny - 1; j++) { + for (sunindextype j = 1; j < ny - 1; j++) + { auto col = j * nx; auto idx = (5 * (nx - 2) + 2) * (j - 1) + nx; mat_data[idx] = ZERO; @@ -529,7 +730,8 @@ int J(sunrealtype t, N_Vector y, N_Vector fy, SUNMatrix J, void* user_data, N_Ve } // Fill eastern boundary entries (i = nx - 1) - for (sunindextype j = 1; j < ny - 1; j++) { + for (sunindextype j = 1; j < ny - 1; j++) + { auto col = (nx - 1) + j * nx; auto idx = (5 * (nx - 2) + 2) * (j - 1) + nx + 1 + 5 * (nx - 2); mat_data[idx] = ZERO; @@ -538,7 +740,8 @@ int J(sunrealtype t, N_Vector y, N_Vector fy, SUNMatrix J, void* user_data, N_Ve } // Fill northern boundary entries (j = ny - 1) - for (sunindextype i = 0; i < nx; i++) { + for (sunindextype i = 0; i < nx; i++) + { auto col = i + (ny - 1) * nx; auto idx = (5 * (nx - 2) + 2) * (ny - 2) + nx + i; mat_data[idx] = ZERO; @@ -548,8 +751,10 @@ int J(sunrealtype t, N_Vector y, N_Vector fy, SUNMatrix J, void* user_data, N_Ve row_ptrs[nx * ny] = (5 * (nx - 2) + 2) * (ny - 2) + 2 * nx; // Fill interior entries - for (sunindextype j = 1; j < ny - 1; j++) { - for (sunindextype i = 1; i < nx - 1; i++) { + for (sunindextype j = 1; j < ny - 1; j++) + { + for (sunindextype i = 1; i < nx - 1; i++) + { auto row = i + j * nx; auto col_s = row - nx; auto col_w = row - 1; diff --git a/examples/cvode/ginkgo/cv_heat2D_ginkgo.hpp b/examples/cvode/ginkgo/cv_heat2D_ginkgo.hpp index bc459c68d1..a363985c7b 100644 --- a/examples/cvode/ginkgo/cv_heat2D_ginkgo.hpp +++ b/examples/cvode/ginkgo/cv_heat2D_ginkgo.hpp @@ -15,24 +15,30 @@ * See cv_heat2D_ginkgo.cpp for more information. * ---------------------------------------------------------------------------*/ +#include #include -#include -#include #include +#include +#include #include -#include +#include #include // SUNDIALS types -#include #include +#include #if defined(USE_CUDA) #include #elif defined(USE_HIP) #include +#elif defined(USE_DPCPP) +#include #endif +// Ginkgo Type +#include + // Common utility functions #include @@ -72,19 +78,22 @@ struct UserData sunrealtype dy = yu / (ny - 1); // Integrator settings - sunrealtype rtol = SUN_RCONST(1.0e-4); // relative tolerance - sunrealtype atol = SUN_RCONST(1.0e-8); // absolute tolerance - int maxsteps = 0; // max number of steps between outputs + sunrealtype rtol = SUN_RCONST(1.0e-4); // relative tolerance + sunrealtype atol = SUN_RCONST(1.0e-8); // absolute tolerance + int maxsteps = 0; // max number of steps between outputs // Linear solver settings - int liniters = 20; // number of linear iterations - sunrealtype epslin = ZERO; // linear solver tolerance factor + int liniters = 20; // number of linear iterations + sunrealtype epslin = ZERO; // linear solver tolerance factor // Ouput variables - bool output = false; // write solution to disk - int nout = 20; // number of output times - std::ofstream uout; // output file stream - std::ofstream eout; // error file stream + bool output = false; // write solution to disk + int nout = 20; // number of output times + std::ofstream uout; // output file stream + std::ofstream eout; // error file stream + + // Ginkgo executor for synchronization on sycl + std::shared_ptr exec; }; // ----------------------------------------------------------------------------- @@ -93,11 +102,9 @@ struct UserData #if defined(USE_CUDA) || defined(USE_HIP) // GPU kernel to compute the ODE RHS function f(t,y). -__global__ -void solution_kernel(const sunindextype nx, const sunindextype ny, - const sunrealtype dx, const sunrealtype dy, - const sunrealtype cos_sqr_t, - sunrealtype* uarray) +__global__ void solution_kernel(const sunindextype nx, const sunindextype ny, + const sunrealtype dx, const sunrealtype dy, + const sunrealtype cos_sqr_t, sunrealtype* uarray) { const sunindextype i = blockIdx.x * blockDim.x + threadIdx.x; const sunindextype j = blockIdx.y * blockDim.y + threadIdx.y; @@ -110,14 +117,14 @@ void solution_kernel(const sunindextype nx, const sunindextype ny, auto sin_sqr_x = sin(PI * x) * sin(PI * x); auto sin_sqr_y = sin(PI * y) * sin(PI * y); - auto idx = i + j * nx; + auto idx = i + j * nx; uarray[idx] = sin_sqr_x * sin_sqr_y * cos_sqr_t + ONE; } } #endif // Compute the exact solution -int Solution(sunrealtype t, N_Vector u, UserData &udata) +int Solution(sunrealtype t, N_Vector u, UserData& udata) { // Access problem data and set shortcuts const auto nx = udata.nx; @@ -133,22 +140,49 @@ int Solution(sunrealtype t, N_Vector u, UserData &udata) #if defined(USE_CUDA) || defined(USE_HIP) - sunrealtype *uarray = N_VGetDeviceArrayPointer(u); + sunrealtype* uarray = N_VGetDeviceArrayPointer(u); if (check_ptr(uarray, "N_VGetDeviceArrayPointer")) return -1; dim3 threads_per_block{16, 16}; - const auto nbx{(static_cast(nx) + threads_per_block.x - 1) - / threads_per_block.x}; - const auto nby{(static_cast(ny) + threads_per_block.y - 1) - / threads_per_block.y}; + const auto nbx{(static_cast(nx) + threads_per_block.x - 1) / + threads_per_block.x}; + const auto nby{(static_cast(ny) + threads_per_block.y - 1) / + threads_per_block.y}; dim3 num_blocks{nbx, nby}; - solution_kernel<<>> - (nx, ny, dx, dy, cos_sqr_t, uarray); - + solution_kernel<<>>(nx, ny, dx, dy, cos_sqr_t, + uarray); +#elif defined(USE_DPCPP) + sunrealtype* uarray = N_VGetDeviceArrayPointer(u); + if (check_ptr(uarray, "N_VGetDeviceArrayPointer")) return -1; + std::dynamic_pointer_cast(udata.exec) + ->get_queue() + ->submit( + [&](sycl::handler& cgh) + { + cgh.parallel_for(sycl::range<2>(ny, nx), + [=](sycl::id<2> id) + { + const sunindextype i = id[1]; + const sunindextype j = id[0]; + + if (i > 0 && i < nx - 1 && j > 0 && j < ny - 1) + { + auto x = i * dx; + auto y = j * dy; + + auto sin_sqr_x = sin(PI * x) * sin(PI * x); + auto sin_sqr_y = sin(PI * y) * sin(PI * y); + + auto idx = i + j * nx; + uarray[idx] = sin_sqr_x * sin_sqr_y * cos_sqr_t + + ONE; + } + }); + }); #else - sunrealtype *uarray = N_VGetArrayPointer(u); + sunrealtype* uarray = N_VGetArrayPointer(u); if (check_ptr(uarray, "N_VGetArrayPointer")) return -1; for (sunindextype j = 1; j < ny - 1; j++) @@ -161,18 +195,18 @@ int Solution(sunrealtype t, N_Vector u, UserData &udata) auto sin_sqr_x = sin(PI * x) * sin(PI * x); auto sin_sqr_y = sin(PI * y) * sin(PI * y); - auto idx = i + j * nx; + auto idx = i + j * nx; uarray[idx] = sin_sqr_x * sin_sqr_y * cos_sqr_t + ONE; } } #endif - + udata.exec->synchronize(); return 0; } // Compute the solution error -int SolutionError(sunrealtype t, N_Vector u, N_Vector e, UserData &udata) +int SolutionError(sunrealtype t, N_Vector u, N_Vector e, UserData& udata) { // Compute true solution int flag = Solution(t, e, udata); @@ -188,29 +222,28 @@ int SolutionError(sunrealtype t, N_Vector u, N_Vector e, UserData &udata) // Print command line options void InputHelp() { - std::cout - << std::endl - << "Command line options:\n" - << " --nx : number of x mesh points\n" - << " --nx : number of y mesh points\n" - << " --xu : x upper bound\n" - << " --yu : y upper bound\n" - << " --kx : x diffusion coefficient\n" - << " --kx : y diffusion coefficient\n" - << " --tf