From a20c4fd0d26814941b8b9625f9460c1fad36da81 Mon Sep 17 00:00:00 2001
From: Rafael Sachetto
Date: Fri, 24 Nov 2023 13:40:35 -0300
Subject: [PATCH] WIP: avoid copying from gpu to cpu (and vice-versa) when
 solving everything on the GPU

---
 src/alg/grid/grid.h                      |  3 +
 src/config/linear_system_solver_config.h |  7 +-
 src/config/update_monodomain_config.h    |  2 +-
 src/gpu_utils/gpu_utils.cu               | 24 +++--
 src/gpu_utils/gpu_utils.h                |  1 +
 .../gpu_solvers_cublas_12.c              | 45 ++++-----
 src/monodomain/monodomain_solver.c       | 94 ++++++++++++++-----
 src/monodomain/monodomain_solver.h       |  2 +-
 .../update_monodomain.c                  | 56 +++++++----
 9 files changed, 156 insertions(+), 78 deletions(-)

diff --git a/src/alg/grid/grid.h b/src/alg/grid/grid.h
index 9e289200..62420361 100644
--- a/src/alg/grid/grid.h
+++ b/src/alg/grid/grid.h
@@ -39,6 +39,9 @@ struct grid {
 
     struct point_3d start_discretization;
     struct point_3d max_discretization;
+
+    float *gpu_rhs;
+
 };
 
 struct grid* new_grid();
diff --git a/src/config/linear_system_solver_config.h b/src/config/linear_system_solver_config.h
index 1562d5f3..01da778c 100644
--- a/src/config/linear_system_solver_config.h
+++ b/src/config/linear_system_solver_config.h
@@ -8,10 +8,11 @@
 #include "../alg/grid/grid.h"
 #include "../monodomain/constants.h"
 #include "config_common.h"
+#include "../ode_solver/ode_solver.h"
 
-#define SOLVE_LINEAR_SYSTEM(name)                                                          \
-    void name(struct time_info *time_info, struct config *config, struct grid *the_grid,  \
-              uint32_t num_active_cells, struct cell_node **active_cells,                 \
+#define SOLVE_LINEAR_SYSTEM(name)                                                          \
+    float *name(struct time_info *time_info, struct config *config, struct grid *the_grid,\
+                uint32_t num_active_cells, struct cell_node **active_cells,               \
               uint32_t *number_of_iterations, real_cpu *error)
 
 typedef SOLVE_LINEAR_SYSTEM(linear_system_solver_fn);
diff --git a/src/config/update_monodomain_config.h b/src/config/update_monodomain_config.h
index fcd9ee77..132f07a4 100644
--- a/src/config/update_monodomain_config.h
+++ b/src/config/update_monodomain_config.h
@@ -16,7 +16,7 @@ struct monodomain_solver;
     void name(struct time_info *time_info, struct config *config, struct grid *the_grid,  \
               struct monodomain_solver *the_solver,                                       \
               uint32_t num_active_cells, struct cell_node ** active_cells,                \
-              struct ode_solver *the_ode_solver, uint32_t initial_number_of_cells)
+              struct ode_solver *the_ode_solver, uint32_t initial_number_of_cells, bool edp_gpu)
 
 typedef UPDATE_MONODOMAIN(update_monodomain_fn);
diff --git a/src/gpu_utils/gpu_utils.cu b/src/gpu_utils/gpu_utils.cu
index d966f879..08ae31ca 100644
--- a/src/gpu_utils/gpu_utils.cu
+++ b/src/gpu_utils/gpu_utils.cu
@@ -1,20 +1,32 @@
 #include "gpu_utils.h"
 
-__global__ void gpu_ecg_integral_kernel(const real *beta_im, const real* distances, const real *volumes, int n, real *result);
-__global__ void kernel_gpu_vec_div_vec(real *vec1, real *vec2, real *vec3, size_t n);
+__global__ void kernel_gpu_vec_div_vec(real *vec1, real *vec2, real *vec3, size_t n) {
+
+    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if(i < n) {
+        vec3[i] = vec1[i]/vec2[i];
+    }
+}
 
- extern "C" void gpu_vec_div_vec(real *vec1, real *vec2, real *res, size_t n) {
+extern "C" void gpu_vec_div_vec(real *vec1, real *vec2, real *res, size_t n) {
     const int GRID = (n + BLOCK_SIZE - 1)/BLOCK_SIZE;
     kernel_gpu_vec_div_vec<<<GRID, BLOCK_SIZE>>>(vec1, vec2, res, n);
     cudaDeviceSynchronize();
 }
 
-__global__ void kernel_gpu_vec_div_vec(real *vec1, real *vec2, real *vec3, size_t n) {
-
+
+__global__ void kernel_update_monodomain(real *vm, float *b, real alpha, size_t n) {
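+    // builds the RHS of the monodomain linear system directly on the device:
+    // one thread per active cell writes b[i] = vm[i] * alpha (this assumes vm holds one transmembrane-voltage entry per cell)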
+    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
 
     if(i < n) {
-        vec3[i] = vec1[i]/vec2[i];
+        b[i] = vm[i] * alpha;
     }
 }
 
+extern "C" void gpu_update_monodomain(real *vm, float *b, real alpha, size_t n) {
+    const int GRID = (n + BLOCK_SIZE - 1)/BLOCK_SIZE;
+    kernel_update_monodomain<<<GRID, BLOCK_SIZE>>>(vm, b, alpha, n);
+    cudaDeviceSynchronize();
+}
diff --git a/src/gpu_utils/gpu_utils.h b/src/gpu_utils/gpu_utils.h
index 00b3ef59..60068c35 100644
--- a/src/gpu_utils/gpu_utils.h
+++ b/src/gpu_utils/gpu_utils.h
@@ -35,6 +35,7 @@ extern "C" {
 #endif
 void gpu_vec_div_vec(real *vec1, real *vec2, real* res, size_t n);
 real gpu_ecg_integral(real *beta_im, real *distances, real *volumes, size_t vec_size);
+void gpu_update_monodomain(real *vm, float *b, real alpha, size_t n);
 void cuda_assert(int code, char const *const func, const char *const file, int const line, const char *api);
 #ifdef __cplusplus
 }
diff --git a/src/linear_system_solver_library/gpu_solvers_cublas_12.c b/src/linear_system_solver_library/gpu_solvers_cublas_12.c
index 381c020a..a570e32c 100644
--- a/src/linear_system_solver_library/gpu_solvers_cublas_12.c
+++ b/src/linear_system_solver_library/gpu_solvers_cublas_12.c
@@ -28,7 +28,7 @@ struct gpu_persistent_data {
     float *d_valsILU0;
     float *d_zm1, *d_zm2, *d_rm2;
     float *d_y;
-
+
     int bufferSizeLU;
     size_t bufferSizeMV, bufferSizeL, bufferSizeU;
     void* d_bufferLU, *d_bufferMV, *d_bufferL, *d_bufferU;
@@ -36,7 +36,7 @@ struct gpu_persistent_data {
     cusparseMatDescr_t matLU;
     csrilu02Info_t infoILU;
 
-    cusparseSpMatDescr_t matM_lower, matM_upper;
+    cusparseSpMatDescr_t matM_lower, matM_upper;
 
 };
@@ -82,7 +82,7 @@ INIT_LINEAR_SYSTEM(init_gpu_conjugate_gradient) {
     check_cuda_error(cudaMalloc((void **)&(persistent_data->d_row), (N + 1) * sizeof(int)));
     check_cuda_error(cudaMalloc((void **)&(persistent_data->d_val), nz * sizeof(float)));
     check_cuda_error(cudaMalloc((void **)&(persistent_data->d_x), N * sizeof(float)));
-    check_cuda_error(cudaMalloc((void **)&(persistent_data->d_r), N * sizeof(float)));
+    //check_cuda_error(cudaMalloc((void **)&(persistent_data->d_r), N * sizeof(float)));
     check_cuda_error(cudaMalloc((void **)&(persistent_data->d_p), N * sizeof(float)));
     check_cuda_error(cudaMalloc((void **)&(persistent_data->d_Ax), N * sizeof(float)));
 
@@ -97,6 +97,8 @@ INIT_LINEAR_SYSTEM(init_gpu_conjugate_gradient) {
     cudaMemcpy(persistent_data->d_val, val, nz * sizeof(float), cudaMemcpyHostToDevice); // A
 
     float *rhs = (float *)malloc(sizeof(float) * num_active_cells);
+
+    //TODO:can we remove this?
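+    // (with the new flow the RHS lives in the_grid->gpu_rhs on the device, so this host-side staging buffer is presumably no longer needed)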
     OMP(parallel for)
     for(uint32_t i = 0; i < num_active_cells; i++) {
         rhs[i] = active_cells[i]->b;
     }
@@ -106,7 +108,7 @@ INIT_LINEAR_SYSTEM(init_gpu_conjugate_gradient) {
 
     float alpha = 1.0f;
     float beta = 0.0f;
-
+
     check_cuda_error(cusparseSpMV_bufferSize(persistent_data->cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, persistent_data->matA,
                                              persistent_data->vecx, &beta, persistent_data->vecAx, CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT,
                                              &(persistent_data->bufferSizeMV)));
     check_cuda_error(cudaMalloc(&(persistent_data->d_bufferMV), persistent_data->bufferSizeMV));
 
@@ -136,7 +138,7 @@ END_LINEAR_SYSTEM(end_gpu_conjugate_gradient) {
     check_cuda_error(cudaFree(persistent_data->d_row));
     check_cuda_error(cudaFree(persistent_data->d_val));
     check_cuda_error(cudaFree(persistent_data->d_x));
-    check_cuda_error(cudaFree(persistent_data->d_r));
+    //check_cuda_error(cudaFree(persistent_data->d_r));
     check_cuda_error(cudaFree(persistent_data->d_p));
     check_cuda_error(cudaFree(persistent_data->d_Ax));
     check_cuda_error(cudaFree(persistent_data->d_y));
@@ -163,22 +165,16 @@ SOLVE_LINEAR_SYSTEM(gpu_conjugate_gradient) {
     int k;
     float alpha, alpham1, beta;
 
-    float *rhs; // Vector B
-    rhs = (float *)malloc(sizeof(float) * num_active_cells);
-    OMP(parallel for)
-    for(uint32_t i = 0; i < num_active_cells; i++) {
-        rhs[i] = active_cells[i]->b;
-    }
-
-    int N = persistent_data->N;
+    int N = persistent_data->N;
 
-    cudaMemcpy(persistent_data->d_r, rhs, N * sizeof(float), cudaMemcpyHostToDevice); // B
     alpha = 1.0f;
     alpham1 = -1.0f;
     beta = 0.0f;
-    r0 = 0.0f;
+    r0 = 0.0f;
+
+    persistent_data->d_r = the_grid->gpu_rhs;
 
     check_cuda_error(cusparseSpMV(persistent_data->cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, persistent_data->matA, persistent_data->vecx,
                                   &beta, persistent_data->vecAx, CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, persistent_data->d_bufferMV));
@@ -194,14 +190,14 @@ SOLVE_LINEAR_SYSTEM(gpu_conjugate_gradient) {
             b = r1 / r0;
             cublasSscal(persistent_data->cublasHandle, N, &b, persistent_data->d_p, 1);
             cublasSaxpy(persistent_data->cublasHandle, N, &alpha, persistent_data->d_r, 1, persistent_data->d_p, 1);
-
+
         } else {
             cublasScopy(persistent_data->cublasHandle, N, persistent_data->d_r, 1, persistent_data->d_p, 1);
         }
 
         check_cuda_error(cusparseSpMV(persistent_data->cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, persistent_data->matA, persistent_data->vecp,
                                       &beta, persistent_data->vecAx, CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, persistent_data->d_bufferMV));
-
+
         cublasSdot(persistent_data->cublasHandle, N, persistent_data->d_p, 1, persistent_data->d_Ax, 1, &dot);
         cublasSdot(persistent_data->cublasHandle, N, persistent_data->d_p, 1, persistent_data->d_Ax, 1, &dot);
@@ -219,17 +215,12 @@ SOLVE_LINEAR_SYSTEM(gpu_conjugate_gradient) {
         k++;
     }
 
-    cudaMemcpy(rhs, persistent_data->d_x, N * sizeof(float), cudaMemcpyDeviceToHost);
-
     *number_of_iterations = k - 1;
     *error = r1;
 
-    OMP(parallel for)
-    for(uint32_t i = 0; i < num_active_cells; i++) {
-        active_cells[i]->v = rhs[i];
-    }
-    free(rhs);
+    return persistent_data->d_x;
+
 }
@@ -291,7 +282,7 @@ INIT_LINEAR_SYSTEM(init_gpu_biconjugate_gradient) {
     check_cuda_error(cusparseCreateCsr(&(persistent_data->matA), N, N, nz, persistent_data->d_row, persistent_data->d_col, persistent_data->d_val,
                                        CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));
-
+
     check_cuda_error(cusparseCreateDnVec(&(persistent_data->vecx), N, persistent_data->d_x, CUDA_R_32F));
     check_cuda_error(cusparseCreateDnVec(&(persistent_data->vecp), N, persistent_data->d_p, CUDA_R_32F));
     check_cuda_error(cusparseCreateDnVec(&(persistent_data->vecv), N, persistent_data->d_v, CUDA_R_32F));
@@ -302,7 +293,7 @@ INIT_LINEAR_SYSTEM(init_gpu_biconjugate_gradient) {
 
     check_cuda_error(cusparseSpMV_bufferSize(persistent_data->cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, persistent_data->matA,
                                              persistent_data->vecx, &beta, persistent_data->vecAx, CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT,
                                              &(persistent_data->bufferSizeMV)));
-
+
     check_cuda_error(cudaMalloc(&(persistent_data->d_bufferMV), persistent_data->bufferSizeMV));
 
     free(rhs);
@@ -368,7 +359,7 @@ SOLVE_LINEAR_SYSTEM(gpu_biconjugate_gradient) {
 
         check_cublas_error(cusparseSpMV(persistent_data->cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, persistent_data->matA, persistent_data->vecp,
                                         &zero, persistent_data->vecv, CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, persistent_data->d_bufferMV));
-
+
         check_cublas_error(cublasSdot(persistent_data->cublasHandle, N, persistent_data->d_rw, 1, persistent_data->d_v, 1, &temp));
         alpha = rho / temp;
         negalpha = -(alpha);
diff --git a/src/monodomain/monodomain_solver.c b/src/monodomain/monodomain_solver.c
index 282286fe..1db06551 100644
--- a/src/monodomain/monodomain_solver.c
+++ b/src/monodomain/monodomain_solver.c
@@ -89,6 +89,9 @@ int solve_monodomain(struct monodomain_solver *the_monodomain_solver, struct ode
     real_cpu finalT = the_monodomain_solver->final_time;
     real_cpu dt_ode = the_ode_solver->min_dt;
 
+    float *linear_system_solver_result = NULL;
+
+
     size_t num_stims = shlen(stimuli_configs);
     if(num_stims) {
         // Init all stimuli
@@ -624,6 +627,18 @@ int solve_monodomain(struct monodomain_solver *the_monodomain_solver, struct ode
 
     log_info("Starting simulation\n");
 
+    bool edp_gpu = false;
+    GET_PARAMETER_BOOLEAN_VALUE_OR_USE_DEFAULT(edp_gpu, linear_system_solver_config, "use_gpu");
+
+    float *gpu_rhs = NULL;
+
+    if(edp_gpu) {
+#ifdef COMPILE_CUDA
+        cudaMalloc((void**)&the_grid->gpu_rhs, sizeof(float)*original_num_cells);
+        gpu_rhs = (float*)malloc(sizeof(float)*original_num_cells);
+#endif
+    }
+
     // Main simulation loop start
     while(cur_time-finalT <= dt_pde) {
@@ -659,6 +674,18 @@ int solve_monodomain(struct monodomain_solver *the_monodomain_solver, struct ode
         if(save_to_file && (count % print_rate == 0) && (cur_time >= start_saving_after_dt)) {
 
             start_stop_watch(&write_time);
+
+            if(edp_gpu) {
+
+#ifdef COMPILE_CUDA
+                cudaMemcpy(gpu_rhs, linear_system_solver_result, original_num_cells * sizeof(float), cudaMemcpyDeviceToHost);
+#endif
+                OMP(parallel for)
+                for(uint32_t i = 0; i < original_num_cells; i++) {
+                    the_grid->active_cells[i]->v = gpu_rhs[i];
+                }
+            }
+
             ((save_mesh_fn *)save_mesh_config->main_function)(&time_info, save_mesh_config, the_grid, the_ode_solver, the_purkinje_ode_solver);
 
             total_write_time += stop_stop_watch(&write_time);
         }
@@ -670,7 +697,9 @@ int solve_monodomain(struct monodomain_solver *the_monodomain_solver, struct ode
         }
 
         if(cur_time > 0.0) {
-            activity = update_ode_state_vector_and_check_for_activity(vm_threshold, the_ode_solver, the_purkinje_ode_solver, the_grid);
+            activity = update_ode_state_vector_and_check_for_activity(vm_threshold, the_ode_solver,
+                                                                      the_purkinje_ode_solver, the_grid,
+                                                                      linear_system_solver_result, edp_gpu);
 
             if(abort_on_no_activity && cur_time > last_stimulus_time && cur_time > only_abort_after_dt) {
                 if(!activity) {
@@ -693,7 +722,7 @@ int solve_monodomain(struct monodomain_solver *the_monodomain_solver, struct ode
             // UPDATE: Purkinje
             ((update_monodomain_fn *)update_monodomain_config->main_function)(&time_info, update_monodomain_config, the_grid, the_monodomain_solver,
                                                                               the_grid->purkinje->num_active_purkinje_cells, the_grid->purkinje->purkinje_cells,
-                                                                              the_purkinje_ode_solver, original_num_purkinje_cells);
+                                                                              the_purkinje_ode_solver, original_num_purkinje_cells, edp_gpu);
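+            // when edp_gpu is set, update_monodomain assembles the RHS directly into the_grid->gpu_rhs on the device instead of filling each cell's b field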
 
             purkinje_ode_total_time += stop_stop_watch(&purkinje_ode_time);
@@ -735,7 +764,7 @@ int solve_monodomain(struct monodomain_solver *the_monodomain_solver, struct ode
         solve_all_volumes_odes(the_ode_solver, cur_time, stimuli_configs, configs->ode_extra_config);
 
         ((update_monodomain_fn *)update_monodomain_config->main_function)(&time_info, update_monodomain_config, the_grid, the_monodomain_solver,
                                                                           the_grid->num_active_cells, the_grid->active_cells, the_ode_solver,
-                                                                          original_num_cells);
+                                                                          original_num_cells, edp_gpu);
 
         ode_total_time += stop_stop_watch(&ode_time);
 
@@ -753,8 +782,24 @@ int solve_monodomain(struct monodomain_solver *the_monodomain_solver, struct ode
         }
 
         // DIFUSION: Tissue
-        ((linear_system_solver_fn *)linear_system_solver_config->main_function)(
-            &time_info, linear_system_solver_config, the_grid, the_grid->num_active_cells, the_grid->active_cells, &solver_iterations, &solver_error);
+        linear_system_solver_result = ((linear_system_solver_fn *)linear_system_solver_config->main_function)(
+            &time_info, linear_system_solver_config, the_grid, the_grid->num_active_cells, the_grid->active_cells,
+            &solver_iterations, &solver_error);
+
+
+        if(edp_gpu && !the_ode_solver->gpu) {
+
+#ifdef COMPILE_CUDA
+            cudaMemcpy(gpu_rhs, linear_system_solver_result, original_num_cells * sizeof(float), cudaMemcpyDeviceToHost);
+#endif
+            OMP(parallel for)
+            for(uint32_t i = 0; i < original_num_cells; i++) {
+                the_grid->active_cells[i]->v = gpu_rhs[i];
+            }
+
+        }
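+        // this copy back to the CPU is only needed when the ODEs are solved on the host;
+        // with the GPU ODE solver the result is consumed device-to-device in update_ode_state_vector_and_check_for_activity()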
+
+
         if(isnan(solver_error)) {
             log_error("\nSimulation stopped due to NaN on time %lf. This is probably a problem with the cellular model solver.\n.", cur_time);
 #ifdef COMPILE_GUI
@@ -1001,8 +1046,9 @@ void set_spatial_stim(struct time_info *time_info, struct string_voidp_hash_entr
         }
     }
 }
 
-bool update_ode_state_vector_and_check_for_activity(real_cpu vm_threshold, struct ode_solver *the_ode_solver, struct ode_solver *the_purkinje_ode_solver,
-                                                    struct grid *the_grid) {
+bool update_ode_state_vector_and_check_for_activity(real_cpu vm_threshold, struct ode_solver *the_ode_solver,
+                                                    struct ode_solver *the_purkinje_ode_solver,
+                                                    struct grid *the_grid, float *linear_solver_result, bool edp_gpu) {
 
     bool act = false;
 
     // Tissue section
@@ -1017,25 +1063,31 @@ bool update_ode_state_vector_and_check_for_activity(real_cpu vm_threshold, struc
     if(the_ode_solver->gpu) {
 #ifdef COMPILE_CUDA
         uint32_t max_number_of_cells = the_ode_solver->original_num_cells;
-        real *vms;
         size_t mem_size = max_number_of_cells * sizeof(real);
 
+        if(!edp_gpu) {
+            real *vms;
 
-        vms = (real *)malloc(mem_size);
+            vms = (real *)malloc(mem_size);
 
-        if(the_grid->adaptive)
-            check_cuda_error(cudaMemcpy(vms, sv, mem_size, cudaMemcpyDeviceToHost));
+            if(the_grid->adaptive)
+                check_cuda_error(cudaMemcpy(vms, sv, mem_size, cudaMemcpyDeviceToHost));
 
-        OMP(parallel for)
-        for(uint32_t i = 0; i < n_active; i++) {
-            vms[ac[i]->sv_position] = (real)ac[i]->v;
+            OMP(parallel for)
+            for(uint32_t i = 0; i < n_active; i++) {
+                vms[ac[i]->sv_position] = (real)ac[i]->v;
 
-            if(ac[i]->v > vm_threshold) {
-                act = true;
-            }
-        }
+                if(ac[i]->v > vm_threshold) {
+                    act = true;
+                }
+            }
 
-        check_cuda_error(cudaMemcpy(sv, vms, mem_size, cudaMemcpyHostToDevice));
-        free(vms);
+            check_cuda_error(cudaMemcpy(sv, vms, mem_size, cudaMemcpyHostToDevice));
+            free(vms);
+        } else {
+            check_cuda_error(cudaMemcpy(sv, linear_solver_result, mem_size, cudaMemcpyDeviceToDevice));
+            //TODO: make a kernel to check for activity
+            act = true;
+        }
 #endif
     } else {
         OMP(parallel for)
@@ -1596,7 +1648,7 @@ void write_pmj_delay (struct grid *the_grid, struct config *config, struct termi
                     min_tissue_lat = activation_times_array_tissue[cur_pulse];
                     min_tissue_lat_id = tissue_cells[j]->sv_position;
                 }
-
+
             }
 
             if (is_terminal_active) {
diff --git a/src/monodomain/monodomain_solver.h b/src/monodomain/monodomain_solver.h
index 1331fe3a..75d77dd1 100644
--- a/src/monodomain/monodomain_solver.h
+++ b/src/monodomain/monodomain_solver.h
@@ -51,7 +51,7 @@ void set_spatial_stim(struct time_info *time_info, struct string_voidp_hash_entr
 
 void configure_monodomain_solver_from_options(struct monodomain_solver *the_monodomain_solver, struct user_options *options);
 
-bool update_ode_state_vector_and_check_for_activity(real_cpu vm_threshold, struct ode_solver *the_ode_solver, struct ode_solver *the_purkinje_ode_solver, struct grid *the_grid);
+bool update_ode_state_vector_and_check_for_activity(real_cpu vm_threshold, struct ode_solver *the_ode_solver, struct ode_solver *the_purkinje_ode_solver, struct grid *the_grid, float *linear_system_result, bool do_gpu_copies);
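+// linear_system_result is expected to be the device pointer returned by the linear-system solver when it runs on the GPU;
+// do_gpu_copies selects the GPU-only path (the ODE state vector is updated device-to-device, with no host round trip)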
 
 void compute_pmj_current_purkinje_to_tissue (struct ode_solver *the_ode_solver, struct grid *the_grid, struct terminal *the_terminals);
 void compute_pmj_current_tissue_to_purkinje (struct ode_solver *the_purkinje_ode_solver, struct grid *the_grid, struct terminal *the_terminals);
diff --git a/src/update_monodomain_library/update_monodomain.c b/src/update_monodomain_library/update_monodomain.c
index 0cd73c75..c8a3814c 100644
--- a/src/update_monodomain_library/update_monodomain.c
+++ b/src/update_monodomain_library/update_monodomain.c
@@ -15,7 +15,6 @@
 #include "../gpu_utils/gpu_utils.h"
 #endif
 
-
 UPDATE_MONODOMAIN(update_monodomain_default) {
 
     real_cpu alpha;
@@ -28,31 +27,50 @@ UPDATE_MONODOMAIN(update_monodomain_default) {
     int n_equations_cell_model = the_ode_solver->model_data.number_of_ode_equations;
     real *sv = the_ode_solver->sv;
 
-    #ifdef COMPILE_CUDA
-    real *vms = NULL;
-    size_t mem_size = initial_number_of_cells * sizeof(real);
 
     if(use_gpu) {
-        vms = MALLOC_BYTES(real, mem_size);
-        check_cuda_error(cudaMemcpy(vms, sv, mem_size, cudaMemcpyDeviceToHost));
-    }
-    #endif
-
-    OMP(parallel for private(alpha))
-    for(uint32_t i = 0; i < num_active_cells; i++) {
-        alpha = ALPHA(beta, cm, dt_pde, active_cells[i]->discretization.x, active_cells[i]->discretization.y, active_cells[i]->discretization.z);
+    #ifdef COMPILE_CUDA
+        if(!edp_gpu) {
+            real *vms = NULL;
+            size_t mem_size = initial_number_of_cells * sizeof(real);
+
+            vms = MALLOC_BYTES(real, mem_size);
+            check_cuda_error(cudaMemcpy(vms, sv, mem_size, cudaMemcpyDeviceToHost));
+
+            OMP(parallel for private(alpha))
+            for(uint32_t i = 0; i < num_active_cells; i++) {
+                alpha = ALPHA(beta, cm, dt_pde, active_cells[i]->discretization.x, active_cells[i]->discretization.y, active_cells[i]->discretization.z);
+                active_cells[i]->b = vms[active_cells[i]->sv_position] * alpha;
+            }
 
-        if(use_gpu) {
+            free(vms);
+        } else {
+            alpha = ALPHA(beta, cm, dt_pde, active_cells[0]->discretization.x, active_cells[0]->discretization.y, active_cells[0]->discretization.z);
+            gpu_update_monodomain(sv, the_grid->gpu_rhs, alpha, num_active_cells);
+        }
+    #endif
+    } else {
+        if(!edp_gpu) {
+            OMP(parallel for private(alpha))
+            for(uint32_t i = 0; i < num_active_cells; i++) {
+                alpha = ALPHA(beta, cm, dt_pde, active_cells[i]->discretization.x, active_cells[i]->discretization.y, active_cells[i]->discretization.z);
+                active_cells[i]->b = sv[active_cells[i]->sv_position * n_equations_cell_model] * alpha;
+            }
+        } else {
 #ifdef COMPILE_CUDA
-            active_cells[i]->b = vms[active_cells[i]->sv_position] * alpha;
+            float *b = malloc(sizeof(float)*num_active_cells);
+
+            OMP(parallel for private(alpha))
+            for(uint32_t i = 0; i < num_active_cells; i++) {
+                alpha = ALPHA(beta, cm, dt_pde, active_cells[i]->discretization.x, active_cells[i]->discretization.y, active_cells[i]->discretization.z);
+                b[i] = sv[active_cells[i]->sv_position * n_equations_cell_model] * alpha;
+            }
+
+            check_cuda_error(cudaMemcpy(the_grid->gpu_rhs, b, sizeof(float)*num_active_cells, cudaMemcpyHostToDevice));
+            free(b);
 #endif
-        } else {
-            active_cells[i]->b = sv[active_cells[i]->sv_position * n_equations_cell_model] * alpha;
         }
     }
-    #ifdef COMPILE_CUDA
-    free(vms);
-    #endif
 }
 
 #ifdef ENABLE_DDM