Skip to content

Commit

Permalink
WIP: avoid copying from gpu to cpu (and vice-versa) when solving ever…
Browse files Browse the repository at this point in the history
…ything on the GPU
  • Loading branch information
rsachetto committed Nov 24, 2023
1 parent a95a918 commit a20c4fd
Show file tree
Hide file tree
Showing 9 changed files with 156 additions and 78 deletions.
3 changes: 3 additions & 0 deletions src/alg/grid/grid.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ struct grid {

struct point_3d start_discretization;
struct point_3d max_discretization;

float *gpu_rhs;

};

struct grid* new_grid();
Expand Down
7 changes: 4 additions & 3 deletions src/config/linear_system_solver_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@
#include "../alg/grid/grid.h"
#include "../monodomain/constants.h"
#include "config_common.h"
#include "../ode_solver/ode_solver.h"

#define SOLVE_LINEAR_SYSTEM(name) \
void name(struct time_info *time_info, struct config *config, struct grid *the_grid, \
uint32_t num_active_cells, struct cell_node **active_cells, \
#define SOLVE_LINEAR_SYSTEM(name) \
float *name(struct time_info *time_info, struct config *config, struct grid *the_grid, \
uint32_t num_active_cells, struct cell_node **active_cells, \
uint32_t *number_of_iterations, real_cpu *error)
typedef SOLVE_LINEAR_SYSTEM(linear_system_solver_fn);

Expand Down
2 changes: 1 addition & 1 deletion src/config/update_monodomain_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ struct monodomain_solver;
void name(struct time_info *time_info, struct config *config, struct grid *the_grid, \
struct monodomain_solver *the_solver, \
uint32_t num_active_cells, struct cell_node ** active_cells, \
struct ode_solver *the_ode_solver, uint32_t initial_number_of_cells)
struct ode_solver *the_ode_solver, uint32_t initial_number_of_cells, bool edp_gpu)


typedef UPDATE_MONODOMAIN(update_monodomain_fn);
Expand Down
24 changes: 18 additions & 6 deletions src/gpu_utils/gpu_utils.cu
Original file line number Diff line number Diff line change
@@ -1,20 +1,32 @@
#include "gpu_utils.h"

__global__ void gpu_ecg_integral_kernel(const real *beta_im, const real* distances, const real *volumes, int n, real *result);
__global__ void kernel_gpu_vec_div_vec(real *vec1, real *vec2, real *vec3, size_t n);
__global__ void kernel_gpu_vec_div_vec(real *vec1, real *vec2, real *vec3, size_t n) {

unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;

if(i < n) {
vec3[i] = vec1[i]/vec2[i];
}
}

extern "C" void gpu_vec_div_vec(real *vec1, real *vec2, real *res, size_t n) {
extern "C" void gpu_vec_div_vec(real *vec1, real *vec2, real *res, size_t n) {
const int GRID = (n + BLOCK_SIZE - 1)/BLOCK_SIZE;
kernel_gpu_vec_div_vec<<<GRID, BLOCK_SIZE>>>(vec1, vec2, res, n);
cudaDeviceSynchronize();
}

__global__ void kernel_gpu_vec_div_vec(real *vec1, real *vec2, real *vec3, size_t n) {


__global__ void kernel_update_monodomain(real *vm, float *b, real alpha, size_t n) {

unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;

if(i < n) {
vec3[i] = vec1[i]/vec2[i];
b[i] = vm[i] * alpha;
}
}

extern "C" void gpu_update_monodomain(real *vm, float *b, real alpha, size_t n) {
const int GRID = (n + BLOCK_SIZE - 1)/BLOCK_SIZE;
kernel_update_monodomain<<<GRID, BLOCK_SIZE>>>(vm, b, alpha, n);
cudaDeviceSynchronize();
}
1 change: 1 addition & 0 deletions src/gpu_utils/gpu_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ extern "C" {
#endif
void gpu_vec_div_vec(real *vec1, real *vec2, real* res, size_t n);
real gpu_ecg_integral(real *beta_im, real *distances, real *volumes, size_t vec_size);
void gpu_update_monodomain(real *vm, float *b, real alpha, size_t n);
void cuda_assert(int code, char const *const func, const char *const file, int const line, const char *api);
#ifdef __cplusplus
}
Expand Down
45 changes: 18 additions & 27 deletions src/linear_system_solver_library/gpu_solvers_cublas_12.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,15 @@ struct gpu_persistent_data {
float *d_valsILU0;
float *d_zm1, *d_zm2, *d_rm2;
float *d_y;

int bufferSizeLU;
size_t bufferSizeMV, bufferSizeL, bufferSizeU;
void* d_bufferLU, *d_bufferMV, *d_bufferL, *d_bufferU;
cusparseSpSVDescr_t spsvDescrL, spsvDescrU;
cusparseMatDescr_t matLU;
csrilu02Info_t infoILU;

cusparseSpMatDescr_t matM_lower, matM_upper;
cusparseSpMatDescr_t matM_lower, matM_upper;

};

Expand Down Expand Up @@ -82,7 +82,7 @@ INIT_LINEAR_SYSTEM(init_gpu_conjugate_gradient) {
check_cuda_error(cudaMalloc((void **)&(persistent_data->d_row), (N + 1) * sizeof(int)));
check_cuda_error(cudaMalloc((void **)&(persistent_data->d_val), nz * sizeof(float)));
check_cuda_error(cudaMalloc((void **)&(persistent_data->d_x), N * sizeof(float)));
check_cuda_error(cudaMalloc((void **)&(persistent_data->d_r), N * sizeof(float)));
//check_cuda_error(cudaMalloc((void **)&(persistent_data->d_r), N * sizeof(float)));
check_cuda_error(cudaMalloc((void **)&(persistent_data->d_p), N * sizeof(float)));
check_cuda_error(cudaMalloc((void **)&(persistent_data->d_Ax), N * sizeof(float)));

Expand All @@ -97,6 +97,8 @@ INIT_LINEAR_SYSTEM(init_gpu_conjugate_gradient) {
cudaMemcpy(persistent_data->d_val, val, nz * sizeof(float), cudaMemcpyHostToDevice); // A
float *rhs = (float *)malloc(sizeof(float) * num_active_cells);


//TODO:can we remove this?
OMP(parallel for)
for(uint32_t i = 0; i < num_active_cells; i++) {
rhs[i] = active_cells[i]->b;
Expand All @@ -106,7 +108,7 @@ INIT_LINEAR_SYSTEM(init_gpu_conjugate_gradient) {

float alpha = 1.0f;
float beta = 0.0f;

check_cuda_error(cusparseSpMV_bufferSize(persistent_data->cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, persistent_data->matA, persistent_data->vecx, &beta, persistent_data->vecAx,
CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, &(persistent_data->bufferSizeMV)));
check_cuda_error(cudaMalloc(&(persistent_data->d_bufferMV), persistent_data->bufferSizeMV));
Expand Down Expand Up @@ -136,7 +138,7 @@ END_LINEAR_SYSTEM(end_gpu_conjugate_gradient) {
check_cuda_error(cudaFree(persistent_data->d_row));
check_cuda_error(cudaFree(persistent_data->d_val));
check_cuda_error(cudaFree(persistent_data->d_x));
check_cuda_error(cudaFree(persistent_data->d_r));
//check_cuda_error(cudaFree(persistent_data->d_r));
check_cuda_error(cudaFree(persistent_data->d_p));
check_cuda_error(cudaFree(persistent_data->d_Ax));
check_cuda_error(cudaFree(persistent_data->d_y));
Expand All @@ -163,22 +165,16 @@ SOLVE_LINEAR_SYSTEM(gpu_conjugate_gradient) {
int k;
float alpha, alpham1, beta;

float *rhs; // Vector B
rhs = (float *)malloc(sizeof(float) * num_active_cells);

OMP(parallel for)
for(uint32_t i = 0; i < num_active_cells; i++) {
rhs[i] = active_cells[i]->b;
}

int N = persistent_data->N;
int N = persistent_data->N;

cudaMemcpy(persistent_data->d_r, rhs, N * sizeof(float), cudaMemcpyHostToDevice); // B

alpha = 1.0f;
alpham1 = -1.0f;
beta = 0.0f;
r0 = 0.0f;
r0 = 0.0f;

persistent_data->d_r = the_grid->gpu_rhs;

check_cuda_error(cusparseSpMV(persistent_data->cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, persistent_data->matA, persistent_data->vecx, &beta, persistent_data->vecAx, CUDA_R_32F,
CUSPARSE_SPMV_ALG_DEFAULT, persistent_data->d_bufferMV));
Expand All @@ -194,14 +190,14 @@ SOLVE_LINEAR_SYSTEM(gpu_conjugate_gradient) {
b = r1 / r0;
cublasSscal(persistent_data->cublasHandle, N, &b, persistent_data->d_p, 1);
cublasSaxpy(persistent_data->cublasHandle, N, &alpha, persistent_data->d_r, 1, persistent_data->d_p, 1);

} else {
cublasScopy(persistent_data->cublasHandle, N, persistent_data->d_r, 1, persistent_data->d_p, 1);
}

check_cuda_error(cusparseSpMV(persistent_data->cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, persistent_data->matA, persistent_data->vecp, &beta, persistent_data->vecAx, CUDA_R_32F,
CUSPARSE_SPMV_ALG_DEFAULT, persistent_data->d_bufferMV));

cublasSdot(persistent_data->cublasHandle, N, persistent_data->d_p, 1, persistent_data->d_Ax, 1, &dot);

cublasSdot(persistent_data->cublasHandle, N, persistent_data->d_p, 1, persistent_data->d_Ax, 1, &dot);
Expand All @@ -219,17 +215,12 @@ SOLVE_LINEAR_SYSTEM(gpu_conjugate_gradient) {
k++;
}

cudaMemcpy(rhs, persistent_data->d_x, N * sizeof(float), cudaMemcpyDeviceToHost);

*number_of_iterations = k - 1;
*error = r1;

OMP(parallel for)
for(uint32_t i = 0; i < num_active_cells; i++) {
active_cells[i]->v = rhs[i];
}

free(rhs);
return persistent_data->d_x;

}


Expand Down Expand Up @@ -291,7 +282,7 @@ INIT_LINEAR_SYSTEM(init_gpu_biconjugate_gradient) {

check_cuda_error(cusparseCreateCsr(&(persistent_data->matA), N, N, nz, persistent_data->d_row, persistent_data->d_col, persistent_data->d_val,
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));

check_cuda_error(cusparseCreateDnVec(&(persistent_data->vecx), N, persistent_data->d_x, CUDA_R_32F));
check_cuda_error(cusparseCreateDnVec(&(persistent_data->vecp), N, persistent_data->d_p, CUDA_R_32F));
check_cuda_error(cusparseCreateDnVec(&(persistent_data->vecv), N, persistent_data->d_v, CUDA_R_32F));
Expand All @@ -302,7 +293,7 @@ INIT_LINEAR_SYSTEM(init_gpu_biconjugate_gradient) {
check_cuda_error(cusparseSpMV_bufferSize(persistent_data->cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha,
persistent_data->matA, persistent_data->vecx, &beta, persistent_data->vecAx,
CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, &(persistent_data->bufferSizeMV)));

check_cuda_error(cudaMalloc(&(persistent_data->d_bufferMV), persistent_data->bufferSizeMV));

free(rhs);
Expand Down Expand Up @@ -368,7 +359,7 @@ SOLVE_LINEAR_SYSTEM(gpu_biconjugate_gradient) {
check_cublas_error(cusparseSpMV(persistent_data->cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one,
persistent_data->matA, persistent_data->vecp, &zero, persistent_data->vecv,
CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, persistent_data->d_bufferMV));

check_cublas_error(cublasSdot(persistent_data->cublasHandle, N, persistent_data->d_rw, 1, persistent_data->d_v, 1, &temp));
alpha = rho / temp;
negalpha = -(alpha);
Expand Down
94 changes: 73 additions & 21 deletions src/monodomain/monodomain_solver.c
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,9 @@ int solve_monodomain(struct monodomain_solver *the_monodomain_solver, struct ode
real_cpu finalT = the_monodomain_solver->final_time;
real_cpu dt_ode = the_ode_solver->min_dt;

float *linear_system_solver_result = NULL;


size_t num_stims = shlen(stimuli_configs);
if(num_stims) {
// Init all stimuli
Expand Down Expand Up @@ -624,6 +627,18 @@ int solve_monodomain(struct monodomain_solver *the_monodomain_solver, struct ode

log_info("Starting simulation\n");

bool edp_gpu = false;
GET_PARAMETER_BOOLEAN_VALUE_OR_USE_DEFAULT(edp_gpu, linear_system_solver_config, "use_gpu");

float *gpu_rhs = NULL;

if(edp_gpu) {
#ifdef COMPILE_CUDA
cudaMalloc((void**)&the_grid->gpu_rhs, sizeof(float)*original_num_cells);
gpu_rhs = (float*)malloc(sizeof(float)*original_num_cells);
#endif
}

// Main simulation loop start
while(cur_time-finalT <= dt_pde) {

Expand Down Expand Up @@ -659,6 +674,18 @@ int solve_monodomain(struct monodomain_solver *the_monodomain_solver, struct ode

if(save_to_file && (count % print_rate == 0) && (cur_time >= start_saving_after_dt)) {
start_stop_watch(&write_time);

if(edp_gpu) {

#ifdef COMPILE_CUDA
cudaMemcpy(gpu_rhs, linear_system_solver_result, original_num_cells * sizeof(float), cudaMemcpyDeviceToHost);
#endif
OMP(parallel for)
for(uint32_t i = 0; i < original_num_cells; i++) {
the_grid->active_cells[i]->v = gpu_rhs[i];
}
}

((save_mesh_fn *)save_mesh_config->main_function)(&time_info, save_mesh_config, the_grid, the_ode_solver, the_purkinje_ode_solver);
total_write_time += stop_stop_watch(&write_time);
}
Expand All @@ -670,7 +697,9 @@ int solve_monodomain(struct monodomain_solver *the_monodomain_solver, struct ode
}

if(cur_time > 0.0) {
activity = update_ode_state_vector_and_check_for_activity(vm_threshold, the_ode_solver, the_purkinje_ode_solver, the_grid);
activity = update_ode_state_vector_and_check_for_activity(vm_threshold, the_ode_solver,
the_purkinje_ode_solver, the_grid,
linear_system_solver_result, edp_gpu);

if(abort_on_no_activity && cur_time > last_stimulus_time && cur_time > only_abort_after_dt) {
if(!activity) {
Expand All @@ -693,7 +722,7 @@ int solve_monodomain(struct monodomain_solver *the_monodomain_solver, struct ode
// UPDATE: Purkinje
((update_monodomain_fn *)update_monodomain_config->main_function)(&time_info, update_monodomain_config, the_grid, the_monodomain_solver,
the_grid->purkinje->num_active_purkinje_cells, the_grid->purkinje->purkinje_cells,
the_purkinje_ode_solver, original_num_purkinje_cells);
the_purkinje_ode_solver, original_num_purkinje_cells, edp_gpu);

purkinje_ode_total_time += stop_stop_watch(&purkinje_ode_time);

Expand Down Expand Up @@ -735,7 +764,7 @@ int solve_monodomain(struct monodomain_solver *the_monodomain_solver, struct ode
solve_all_volumes_odes(the_ode_solver, cur_time, stimuli_configs, configs->ode_extra_config);
((update_monodomain_fn *)update_monodomain_config->main_function)(&time_info, update_monodomain_config, the_grid, the_monodomain_solver,
the_grid->num_active_cells, the_grid->active_cells, the_ode_solver,
original_num_cells);
original_num_cells, edp_gpu);

ode_total_time += stop_stop_watch(&ode_time);

Expand All @@ -753,8 +782,24 @@ int solve_monodomain(struct monodomain_solver *the_monodomain_solver, struct ode
}

// DIFUSION: Tissue
((linear_system_solver_fn *)linear_system_solver_config->main_function)(
&time_info, linear_system_solver_config, the_grid, the_grid->num_active_cells, the_grid->active_cells, &solver_iterations, &solver_error);
linear_system_solver_result = ((linear_system_solver_fn *)linear_system_solver_config->main_function)(
&time_info, linear_system_solver_config, the_grid, the_grid->num_active_cells, the_grid->active_cells,
&solver_iterations, &solver_error);


if(edp_gpu && !the_ode_solver->gpu) {

#ifdef COMPILE_CUDA
cudaMemcpy(gpu_rhs, linear_system_solver_result, original_num_cells * sizeof(float), cudaMemcpyDeviceToHost);
#endif
OMP(parallel for)
for(uint32_t i = 0; i < original_num_cells; i++) {
the_grid->active_cells[i]->v = gpu_rhs[i];
}

}


if(isnan(solver_error)) {
log_error("\nSimulation stopped due to NaN on time %lf. This is probably a problem with the cellular model solver.\n.", cur_time);
#ifdef COMPILE_GUI
Expand Down Expand Up @@ -1001,8 +1046,9 @@ void set_spatial_stim(struct time_info *time_info, struct string_voidp_hash_entr
}
}

bool update_ode_state_vector_and_check_for_activity(real_cpu vm_threshold, struct ode_solver *the_ode_solver, struct ode_solver *the_purkinje_ode_solver,
struct grid *the_grid) {
bool update_ode_state_vector_and_check_for_activity(real_cpu vm_threshold, struct ode_solver *the_ode_solver,
struct ode_solver *the_purkinje_ode_solver,
struct grid *the_grid, float *linear_solver_result, bool edp_gpu) {
bool act = false;

// Tissue section
Expand All @@ -1017,25 +1063,31 @@ bool update_ode_state_vector_and_check_for_activity(real_cpu vm_threshold, struc
if(the_ode_solver->gpu) {
#ifdef COMPILE_CUDA
uint32_t max_number_of_cells = the_ode_solver->original_num_cells;
real *vms;
size_t mem_size = max_number_of_cells * sizeof(real);
if(!edp_gpu) {
real *vms;

vms = (real *)malloc(mem_size);
vms = (real *)malloc(mem_size);

if(the_grid->adaptive)
check_cuda_error(cudaMemcpy(vms, sv, mem_size, cudaMemcpyDeviceToHost));
if(the_grid->adaptive)
check_cuda_error(cudaMemcpy(vms, sv, mem_size, cudaMemcpyDeviceToHost));

OMP(parallel for)
for(uint32_t i = 0; i < n_active; i++) {
vms[ac[i]->sv_position] = (real)ac[i]->v;
OMP(parallel for)
for(uint32_t i = 0; i < n_active; i++) {
vms[ac[i]->sv_position] = (real)ac[i]->v;

if(ac[i]->v > vm_threshold) {
act = true;
}
}
if(ac[i]->v > vm_threshold) {
act = true;
}
}

check_cuda_error(cudaMemcpy(sv, vms, mem_size, cudaMemcpyHostToDevice));
free(vms);
check_cuda_error(cudaMemcpy(sv, vms, mem_size, cudaMemcpyHostToDevice));
free(vms);
} else {
check_cuda_error(cudaMemcpy(sv, linear_solver_result, mem_size, cudaMemcpyDeviceToDevice));
//TODO: make a kernel to check for activity
act = true;
}
#endif
} else {
OMP(parallel for)
Expand Down Expand Up @@ -1596,7 +1648,7 @@ void write_pmj_delay (struct grid *the_grid, struct config *config, struct termi
min_tissue_lat = activation_times_array_tissue[cur_pulse];
min_tissue_lat_id = tissue_cells[j]->sv_position;
}

}

if (is_terminal_active) {
Expand Down
Loading

0 comments on commit a20c4fd

Please sign in to comment.