WIP: avoid copying from gpu to cpu (and vice-versa) when solving ever…

…ything on the GPU
rsachetto · Nov 24, 2023 · a20c4fd · a20c4fd
1 parent a95a918
commit a20c4fd
Show file tree

Hide file tree

Showing 9 changed files with 156 additions and 78 deletions.
diff --git a/src/alg/grid/grid.h b/src/alg/grid/grid.h
@@ -39,6 +39,9 @@ struct grid {
 
     struct point_3d start_discretization;
     struct point_3d max_discretization;
+
+    float *gpu_rhs;
+
 };
 
 struct grid* new_grid();

diff --git a/src/config/linear_system_solver_config.h b/src/config/linear_system_solver_config.h
@@ -8,10 +8,11 @@
 #include "../alg/grid/grid.h"
 #include "../monodomain/constants.h"
 #include "config_common.h"
+#include "../ode_solver/ode_solver.h"
 
-#define SOLVE_LINEAR_SYSTEM(name)                                                                                      \
-              void name(struct time_info *time_info, struct config *config, struct grid *the_grid,                     \
-                        uint32_t num_active_cells, struct cell_node **active_cells,                                    \
+#define SOLVE_LINEAR_SYSTEM(name)                                                                                        \
+              float *name(struct time_info *time_info, struct config *config, struct grid *the_grid,                     \
+                        uint32_t num_active_cells, struct cell_node **active_cells,                                      \
                         uint32_t *number_of_iterations, real_cpu *error)
 typedef SOLVE_LINEAR_SYSTEM(linear_system_solver_fn);
 

diff --git a/src/config/update_monodomain_config.h b/src/config/update_monodomain_config.h
@@ -16,7 +16,7 @@ struct monodomain_solver;
      void name(struct time_info *time_info, struct config *config, struct grid *the_grid,                              \
                struct monodomain_solver *the_solver,                                                                   \
                uint32_t num_active_cells, struct cell_node ** active_cells,                                            \
-               struct ode_solver *the_ode_solver, uint32_t initial_number_of_cells)
+               struct ode_solver *the_ode_solver, uint32_t initial_number_of_cells, bool edp_gpu)
 
 
 typedef UPDATE_MONODOMAIN(update_monodomain_fn);

diff --git a/src/gpu_utils/gpu_utils.cu b/src/gpu_utils/gpu_utils.cu
@@ -1,20 +1,32 @@
 #include "gpu_utils.h"
 
-__global__ void gpu_ecg_integral_kernel(const real *beta_im, const real* distances, const real *volumes, int n, real *result);
-__global__ void kernel_gpu_vec_div_vec(real *vec1, real *vec2, real *vec3, size_t n);
+__global__ void kernel_gpu_vec_div_vec(real *vec1, real *vec2, real *vec3, size_t n) {
+
+    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if(i < n) {
+        vec3[i] = vec1[i]/vec2[i];
+    }
+}
 
-    extern "C" void gpu_vec_div_vec(real *vec1, real *vec2, real *res, size_t n) {
+extern "C" void gpu_vec_div_vec(real *vec1, real *vec2, real *res, size_t n) {
     const int GRID  = (n + BLOCK_SIZE - 1)/BLOCK_SIZE;
     kernel_gpu_vec_div_vec<<<GRID, BLOCK_SIZE>>>(vec1, vec2, res, n);
     cudaDeviceSynchronize();
 }
 
-__global__ void kernel_gpu_vec_div_vec(real *vec1, real *vec2, real *vec3, size_t n) {
-
+
+__global__ void kernel_update_monodomain(real *vm, float *b, real alpha, size_t n) {
+
     unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
 
     if(i < n) {
-        vec3[i] = vec1[i]/vec2[i];
+        b[i] = vm[i] * alpha;
     }
 }
 
+extern "C" void gpu_update_monodomain(real *vm, float *b, real alpha, size_t n) {
+    const int GRID  = (n + BLOCK_SIZE - 1)/BLOCK_SIZE;
+    kernel_update_monodomain<<<GRID, BLOCK_SIZE>>>(vm, b, alpha, n);
+    cudaDeviceSynchronize();
+}
diff --git a/src/gpu_utils/gpu_utils.h b/src/gpu_utils/gpu_utils.h
@@ -35,6 +35,7 @@ extern "C" {
 #endif
 void gpu_vec_div_vec(real *vec1, real *vec2, real* res, size_t n);
 real gpu_ecg_integral(real *beta_im, real *distances, real *volumes, size_t vec_size);
+void gpu_update_monodomain(real *vm, float *b, real alpha, size_t n);
 void cuda_assert(int code, char const *const func, const char *const file, int const line, const char *api);
 #ifdef __cplusplus
 }

diff --git a/src/linear_system_solver_library/gpu_solvers_cublas_12.c b/src/linear_system_solver_library/gpu_solvers_cublas_12.c
@@ -28,15 +28,15 @@ struct gpu_persistent_data {
     float *d_valsILU0;
     float *d_zm1, *d_zm2, *d_rm2;
     float *d_y;
-    
+
     int                 bufferSizeLU;
     size_t              bufferSizeMV, bufferSizeL, bufferSizeU;
     void*               d_bufferLU, *d_bufferMV,  *d_bufferL, *d_bufferU;
     cusparseSpSVDescr_t spsvDescrL, spsvDescrU;
     cusparseMatDescr_t   matLU;
     csrilu02Info_t      infoILU;
 
-    cusparseSpMatDescr_t matM_lower, matM_upper;  
+    cusparseSpMatDescr_t matM_lower, matM_upper;
 
 };
 
@@ -82,7 +82,7 @@ INIT_LINEAR_SYSTEM(init_gpu_conjugate_gradient) {
     check_cuda_error(cudaMalloc((void **)&(persistent_data->d_row), (N + 1) * sizeof(int)));
     check_cuda_error(cudaMalloc((void **)&(persistent_data->d_val), nz * sizeof(float)));
     check_cuda_error(cudaMalloc((void **)&(persistent_data->d_x), N * sizeof(float)));
-    check_cuda_error(cudaMalloc((void **)&(persistent_data->d_r), N * sizeof(float)));
+    //check_cuda_error(cudaMalloc((void **)&(persistent_data->d_r), N * sizeof(float)));
     check_cuda_error(cudaMalloc((void **)&(persistent_data->d_p), N * sizeof(float)));
     check_cuda_error(cudaMalloc((void **)&(persistent_data->d_Ax), N * sizeof(float)));
 
@@ -97,6 +97,8 @@ INIT_LINEAR_SYSTEM(init_gpu_conjugate_gradient) {
     cudaMemcpy(persistent_data->d_val, val, nz * sizeof(float), cudaMemcpyHostToDevice);  // A
     float *rhs = (float *)malloc(sizeof(float) * num_active_cells);
 
+
+    //TODO:can we remove this?
     OMP(parallel for)
     for(uint32_t i = 0; i < num_active_cells; i++) {
         rhs[i] = active_cells[i]->b;
@@ -106,7 +108,7 @@ INIT_LINEAR_SYSTEM(init_gpu_conjugate_gradient) {
 
     float alpha = 1.0f;
     float beta = 0.0f;
-    
+
     check_cuda_error(cusparseSpMV_bufferSize(persistent_data->cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, persistent_data->matA, persistent_data->vecx, &beta, persistent_data->vecAx,
                      CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, &(persistent_data->bufferSizeMV)));
     check_cuda_error(cudaMalloc(&(persistent_data->d_bufferMV), persistent_data->bufferSizeMV));
@@ -136,7 +138,7 @@ END_LINEAR_SYSTEM(end_gpu_conjugate_gradient) {
     check_cuda_error(cudaFree(persistent_data->d_row));
     check_cuda_error(cudaFree(persistent_data->d_val));
     check_cuda_error(cudaFree(persistent_data->d_x));
-    check_cuda_error(cudaFree(persistent_data->d_r));
+    //check_cuda_error(cudaFree(persistent_data->d_r));
     check_cuda_error(cudaFree(persistent_data->d_p));
     check_cuda_error(cudaFree(persistent_data->d_Ax));
     check_cuda_error(cudaFree(persistent_data->d_y));
@@ -163,22 +165,16 @@ SOLVE_LINEAR_SYSTEM(gpu_conjugate_gradient) {
     int k;
     float alpha, alpham1, beta;
 
-    float *rhs; // Vector B
-    rhs = (float *)malloc(sizeof(float) * num_active_cells);
 
-    OMP(parallel for)
-    for(uint32_t i = 0; i < num_active_cells; i++) {
-        rhs[i] = active_cells[i]->b;
-    }
-
-    int N = persistent_data->N;    
+    int N = persistent_data->N;
 
-    cudaMemcpy(persistent_data->d_r, rhs, N * sizeof(float), cudaMemcpyHostToDevice); // B
 
     alpha = 1.0f;
     alpham1 = -1.0f;
     beta = 0.0f;
-    r0 = 0.0f;    
+    r0 = 0.0f;
+
+    persistent_data->d_r = the_grid->gpu_rhs;
 
     check_cuda_error(cusparseSpMV(persistent_data->cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, persistent_data->matA, persistent_data->vecx, &beta, persistent_data->vecAx, CUDA_R_32F,
                                   CUSPARSE_SPMV_ALG_DEFAULT, persistent_data->d_bufferMV));
@@ -194,14 +190,14 @@ SOLVE_LINEAR_SYSTEM(gpu_conjugate_gradient) {
             b = r1 / r0;
             cublasSscal(persistent_data->cublasHandle, N, &b, persistent_data->d_p, 1);
             cublasSaxpy(persistent_data->cublasHandle, N, &alpha, persistent_data->d_r, 1, persistent_data->d_p, 1);
- 
+
         } else {
             cublasScopy(persistent_data->cublasHandle, N, persistent_data->d_r, 1, persistent_data->d_p, 1);
         }
 
         check_cuda_error(cusparseSpMV(persistent_data->cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, persistent_data->matA, persistent_data->vecp, &beta, persistent_data->vecAx, CUDA_R_32F,
                                       CUSPARSE_SPMV_ALG_DEFAULT, persistent_data->d_bufferMV));
-        
+
         cublasSdot(persistent_data->cublasHandle, N, persistent_data->d_p, 1, persistent_data->d_Ax, 1, &dot);
 
         cublasSdot(persistent_data->cublasHandle, N, persistent_data->d_p, 1, persistent_data->d_Ax, 1, &dot);
@@ -219,17 +215,12 @@ SOLVE_LINEAR_SYSTEM(gpu_conjugate_gradient) {
         k++;
     }
 
-    cudaMemcpy(rhs, persistent_data->d_x, N * sizeof(float), cudaMemcpyDeviceToHost);
-
     *number_of_iterations = k - 1;
     *error = r1;
 
-    OMP(parallel for)
-    for(uint32_t i = 0; i < num_active_cells; i++) {
-        active_cells[i]->v = rhs[i];
-    }
 
-    free(rhs);
+    return persistent_data->d_x;
+
 }
 
 
@@ -291,7 +282,7 @@ INIT_LINEAR_SYSTEM(init_gpu_biconjugate_gradient) {
 
     check_cuda_error(cusparseCreateCsr(&(persistent_data->matA), N, N, nz, persistent_data->d_row, persistent_data->d_col, persistent_data->d_val,
                                        CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));
-    
+
     check_cuda_error(cusparseCreateDnVec(&(persistent_data->vecx), N, persistent_data->d_x, CUDA_R_32F));
     check_cuda_error(cusparseCreateDnVec(&(persistent_data->vecp), N, persistent_data->d_p, CUDA_R_32F));
     check_cuda_error(cusparseCreateDnVec(&(persistent_data->vecv), N, persistent_data->d_v, CUDA_R_32F));
@@ -302,7 +293,7 @@ INIT_LINEAR_SYSTEM(init_gpu_biconjugate_gradient) {
     check_cuda_error(cusparseSpMV_bufferSize(persistent_data->cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha,
                                              persistent_data->matA, persistent_data->vecx, &beta, persistent_data->vecAx,
                                              CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, &(persistent_data->bufferSizeMV)));
-    
+
     check_cuda_error(cudaMalloc(&(persistent_data->d_bufferMV), persistent_data->bufferSizeMV));
 
     free(rhs);
@@ -368,7 +359,7 @@ SOLVE_LINEAR_SYSTEM(gpu_biconjugate_gradient) {
         check_cublas_error(cusparseSpMV(persistent_data->cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one,
                                         persistent_data->matA, persistent_data->vecp, &zero, persistent_data->vecv,
                                         CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, persistent_data->d_bufferMV));
-        
+
         check_cublas_error(cublasSdot(persistent_data->cublasHandle, N, persistent_data->d_rw, 1, persistent_data->d_v, 1, &temp));
         alpha = rho / temp;
         negalpha = -(alpha);

diff --git a/src/monodomain/monodomain_solver.c b/src/monodomain/monodomain_solver.c
@@ -89,6 +89,9 @@ int solve_monodomain(struct monodomain_solver *the_monodomain_solver, struct ode
     real_cpu finalT = the_monodomain_solver->final_time;
     real_cpu dt_ode = the_ode_solver->min_dt;
 
+    float *linear_system_solver_result = NULL;
+
+
     size_t num_stims = shlen(stimuli_configs);
     if(num_stims) {
         // Init all stimuli
@@ -624,6 +627,18 @@ int solve_monodomain(struct monodomain_solver *the_monodomain_solver, struct ode
 
     log_info("Starting simulation\n");
 
+    bool edp_gpu = false;
+    GET_PARAMETER_BOOLEAN_VALUE_OR_USE_DEFAULT(edp_gpu, linear_system_solver_config, "use_gpu");
+
+    float *gpu_rhs = NULL;
+
+    if(edp_gpu) {
+#ifdef COMPILE_CUDA
+        cudaMalloc((void**)&the_grid->gpu_rhs, sizeof(float)*original_num_cells);
+        gpu_rhs = (float*)malloc(sizeof(float)*original_num_cells);
+#endif
+    }
+
     // Main simulation loop start
     while(cur_time-finalT <= dt_pde) {
 
@@ -659,6 +674,18 @@ int solve_monodomain(struct monodomain_solver *the_monodomain_solver, struct ode
 
         if(save_to_file && (count % print_rate == 0) && (cur_time >= start_saving_after_dt)) {
             start_stop_watch(&write_time);
+
+            if(edp_gpu) {
+
+#ifdef COMPILE_CUDA
+                cudaMemcpy(gpu_rhs, linear_system_solver_result, original_num_cells * sizeof(float), cudaMemcpyDeviceToHost);
+#endif
+                OMP(parallel for)
+                for(uint32_t i = 0; i < original_num_cells; i++) {
+                    the_grid->active_cells[i]->v = gpu_rhs[i];
+                }
+            }
+
             ((save_mesh_fn *)save_mesh_config->main_function)(&time_info, save_mesh_config, the_grid, the_ode_solver, the_purkinje_ode_solver);
             total_write_time += stop_stop_watch(&write_time);
         }
@@ -670,7 +697,9 @@ int solve_monodomain(struct monodomain_solver *the_monodomain_solver, struct ode
         }
 
         if(cur_time > 0.0) {
-            activity = update_ode_state_vector_and_check_for_activity(vm_threshold, the_ode_solver, the_purkinje_ode_solver, the_grid);
+            activity = update_ode_state_vector_and_check_for_activity(vm_threshold, the_ode_solver,
+                                                                      the_purkinje_ode_solver, the_grid,
+                                                                      linear_system_solver_result, edp_gpu);
 
             if(abort_on_no_activity && cur_time > last_stimulus_time && cur_time > only_abort_after_dt) {
                 if(!activity) {
@@ -693,7 +722,7 @@ int solve_monodomain(struct monodomain_solver *the_monodomain_solver, struct ode
             // UPDATE: Purkinje
             ((update_monodomain_fn *)update_monodomain_config->main_function)(&time_info, update_monodomain_config, the_grid, the_monodomain_solver,
                                                                               the_grid->purkinje->num_active_purkinje_cells, the_grid->purkinje->purkinje_cells,
-                                                                              the_purkinje_ode_solver, original_num_purkinje_cells);
+                                                                              the_purkinje_ode_solver, original_num_purkinje_cells, edp_gpu);
 
             purkinje_ode_total_time += stop_stop_watch(&purkinje_ode_time);
 
@@ -735,7 +764,7 @@ int solve_monodomain(struct monodomain_solver *the_monodomain_solver, struct ode
             solve_all_volumes_odes(the_ode_solver, cur_time, stimuli_configs, configs->ode_extra_config);
             ((update_monodomain_fn *)update_monodomain_config->main_function)(&time_info, update_monodomain_config, the_grid, the_monodomain_solver,
                                                                               the_grid->num_active_cells, the_grid->active_cells, the_ode_solver,
-                                                                              original_num_cells);
+                                                                              original_num_cells, edp_gpu);
 
             ode_total_time += stop_stop_watch(&ode_time);
 
@@ -753,8 +782,24 @@ int solve_monodomain(struct monodomain_solver *the_monodomain_solver, struct ode
             }
 
             // DIFUSION: Tissue
-            ((linear_system_solver_fn *)linear_system_solver_config->main_function)(
-                &time_info, linear_system_solver_config, the_grid, the_grid->num_active_cells, the_grid->active_cells, &solver_iterations, &solver_error);
+            linear_system_solver_result = ((linear_system_solver_fn *)linear_system_solver_config->main_function)(
+                &time_info, linear_system_solver_config, the_grid, the_grid->num_active_cells, the_grid->active_cells,
+                &solver_iterations, &solver_error);
+
+
+            if(edp_gpu && !the_ode_solver->gpu) {
+
+#ifdef COMPILE_CUDA
+                cudaMemcpy(gpu_rhs, linear_system_solver_result, original_num_cells * sizeof(float), cudaMemcpyDeviceToHost);
+#endif
+                OMP(parallel for)
+                for(uint32_t i = 0; i < original_num_cells; i++) {
+                    the_grid->active_cells[i]->v = gpu_rhs[i];
+                }
+
+            }
+
+
             if(isnan(solver_error)) {
                 log_error("\nSimulation stopped due to NaN on time %lf. This is probably a problem with the cellular model solver.\n.", cur_time);
 #ifdef COMPILE_GUI
@@ -1001,8 +1046,9 @@ void set_spatial_stim(struct time_info *time_info, struct string_voidp_hash_entr
     }
 }
 
-bool update_ode_state_vector_and_check_for_activity(real_cpu vm_threshold, struct ode_solver *the_ode_solver, struct ode_solver *the_purkinje_ode_solver,
-                                                    struct grid *the_grid) {
+bool update_ode_state_vector_and_check_for_activity(real_cpu vm_threshold, struct ode_solver *the_ode_solver,
+                                                    struct ode_solver *the_purkinje_ode_solver,
+                                                    struct grid *the_grid, float *linear_solver_result, bool edp_gpu) {
     bool act = false;
 
     // Tissue section
@@ -1017,25 +1063,31 @@ bool update_ode_state_vector_and_check_for_activity(real_cpu vm_threshold, struc
         if(the_ode_solver->gpu) {
 #ifdef COMPILE_CUDA
             uint32_t max_number_of_cells = the_ode_solver->original_num_cells;
-            real *vms;
             size_t mem_size = max_number_of_cells * sizeof(real);
+            if(!edp_gpu) {
+                real *vms;
 
-            vms = (real *)malloc(mem_size);
+                vms = (real *)malloc(mem_size);
 
-            if(the_grid->adaptive)
-                check_cuda_error(cudaMemcpy(vms, sv, mem_size, cudaMemcpyDeviceToHost));
+                if(the_grid->adaptive)
+                    check_cuda_error(cudaMemcpy(vms, sv, mem_size, cudaMemcpyDeviceToHost));
 
-            OMP(parallel for)
-            for(uint32_t i = 0; i < n_active; i++) {
-                vms[ac[i]->sv_position] = (real)ac[i]->v;
+                OMP(parallel for)
+                    for(uint32_t i = 0; i < n_active; i++) {
+                        vms[ac[i]->sv_position] = (real)ac[i]->v;
 
-                if(ac[i]->v > vm_threshold) {
-                    act = true;
-                }
-            }
+                        if(ac[i]->v > vm_threshold) {
+                            act = true;
+                        }
+                    }
 
-            check_cuda_error(cudaMemcpy(sv, vms, mem_size, cudaMemcpyHostToDevice));
-            free(vms);
+                check_cuda_error(cudaMemcpy(sv, vms, mem_size, cudaMemcpyHostToDevice));
+                free(vms);
+            } else {
+                check_cuda_error(cudaMemcpy(sv, linear_solver_result, mem_size, cudaMemcpyDeviceToDevice));
+                //TODO: make a kernel to check for activity
+                act = true;
+            }
 #endif
         } else {
             OMP(parallel for)
@@ -1596,7 +1648,7 @@ void write_pmj_delay (struct grid *the_grid, struct config *config, struct termi
                         min_tissue_lat = activation_times_array_tissue[cur_pulse];
                         min_tissue_lat_id = tissue_cells[j]->sv_position;
                     }
-                        
+
                 }
 
                 if (is_terminal_active) {